aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_super.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_super.c')
-rw-r--r--fs/xfs/xfs_super.c335
1 files changed, 206 insertions, 129 deletions
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0d9de41a715..ab8839b2627 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
#include "xfs_extfree_item.h"
#include "xfs_mru_cache.h"
#include "xfs_inode_item.h"
-#include "xfs_sync.h"
+#include "xfs_icache.h"
#include "xfs_trace.h"
#include <linux/namei.h>
@@ -88,6 +88,8 @@ mempool_t *xfs_ioend_pool;
* unwritten extent conversion */
#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
+#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to
+ * XFS_MAXINUMBER_32 */
#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */
@@ -120,12 +122,18 @@ mempool_t *xfs_ioend_pool;
* in the future, too.
*/
enum {
- Opt_barrier, Opt_nobarrier, Opt_err
+ Opt_barrier,
+ Opt_nobarrier,
+ Opt_inode64,
+ Opt_inode32,
+ Opt_err
};
static const match_table_t tokens = {
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
+ {Opt_inode64, "inode64"},
+ {Opt_inode32, "inode32"},
{Opt_err, NULL}
};
@@ -197,7 +205,9 @@ xfs_parseargs(
*/
mp->m_flags |= XFS_MOUNT_BARRIER;
mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+#if !XFS_BIG_INUMS
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+#endif
/*
* These can be overridden by the mount option parsing.
@@ -294,6 +304,8 @@ xfs_parseargs(
return EINVAL;
}
dswidth = simple_strtoul(value, &eov, 10);
+ } else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
#if !XFS_BIG_INUMS
@@ -492,6 +504,7 @@ xfs_showargs(
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
+ { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@@ -591,6 +604,80 @@ xfs_max_file_offset(
return (((__uint64_t)pagefactor) << bitshift) - 1;
}
+xfs_agnumber_t
+xfs_set_inode32(struct xfs_mount *mp)
+{
+ xfs_agnumber_t index = 0;
+ xfs_agnumber_t maxagi = 0;
+ xfs_sb_t *sbp = &mp->m_sb;
+ xfs_agnumber_t max_metadata;
+ xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0);
+ xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino);
+ xfs_perag_t *pag;
+
+ /* Calculate how much should be reserved for inodes to meet
+ * the max inode percentage.
+ */
+ if (mp->m_maxicount) {
+ __uint64_t icount;
+
+ icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+ do_div(icount, 100);
+ icount += sbp->sb_agblocks - 1;
+ do_div(icount, sbp->sb_agblocks);
+ max_metadata = icount;
+ } else {
+ max_metadata = sbp->sb_agcount;
+ }
+
+ for (index = 0; index < sbp->sb_agcount; index++) {
+ ino = XFS_AGINO_TO_INO(mp, index, agino);
+
+ if (ino > XFS_MAXINUMBER_32) {
+ pag = xfs_perag_get(mp, index);
+ pag->pagi_inodeok = 0;
+ pag->pagf_metadata = 0;
+ xfs_perag_put(pag);
+ continue;
+ }
+
+ pag = xfs_perag_get(mp, index);
+ pag->pagi_inodeok = 1;
+ maxagi++;
+ if (index < max_metadata)
+ pag->pagf_metadata = 1;
+ xfs_perag_put(pag);
+ }
+ mp->m_flags |= (XFS_MOUNT_32BITINODES |
+ XFS_MOUNT_SMALL_INUMS);
+
+ return maxagi;
+}
+
+xfs_agnumber_t
+xfs_set_inode64(struct xfs_mount *mp)
+{
+ xfs_agnumber_t index = 0;
+
+ for (index = 0; index < mp->m_sb.sb_agcount; index++) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, index);
+ pag->pagi_inodeok = 1;
+ pag->pagf_metadata = 0;
+ xfs_perag_put(pag);
+ }
+
+ /* There is no need for lock protection on m_flags,
+ * the rw_semaphore of the VFS superblock is locked
+ * during mount/umount/remount operations, so this is
+ * enough to avoid concurency on the m_flags field
+ */
+ mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
+ XFS_MOUNT_SMALL_INUMS);
+ return index;
+}
+
STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -776,8 +863,30 @@ xfs_init_mount_workqueues(
WQ_MEM_RECLAIM, 0, mp->m_fsname);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
+
+ mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_reclaim_workqueue)
+ goto out_destroy_cil;
+
+ mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_log_workqueue)
+ goto out_destroy_reclaim;
+
+ mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_eofblocks_workqueue)
+ goto out_destroy_log;
+
return 0;
+out_destroy_log:
+ destroy_workqueue(mp->m_log_workqueue);
+out_destroy_reclaim:
+ destroy_workqueue(mp->m_reclaim_workqueue);
+out_destroy_cil:
+ destroy_workqueue(mp->m_cil_workqueue);
out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
out_destroy_data_iodone_queue:
@@ -790,11 +899,32 @@ STATIC void
xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
+ destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_log_workqueue);
+ destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_data_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
}
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ */
+void
+xfs_flush_inodes(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ sync_inodes_sb(sb);
+ up_read(&sb->s_umount);
+ }
+}
+
/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
@@ -868,67 +998,14 @@ xfs_fs_inode_init_once(
"xfsino", ip->i_ino);
}
-/*
- * This is called by the VFS when dirtying inode metadata. This can happen
- * for a few reasons, but we only care about timestamp updates, given that
- * we handled the rest ourselves. In theory no other calls should happen,
- * but for example generic_write_end() keeps dirtying the inode after
- * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC,
- * and skip this call otherwise.
- *
- * We'll hopefull get a different method just for updating timestamps soon,
- * at which point this hack can go away, and maybe we'll also get real
- * error handling here.
- */
-STATIC void
-xfs_fs_dirty_inode(
- struct inode *inode,
- int flags)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- if (flags != I_DIRTY_SYNC)
- return;
-
- trace_xfs_dirty_inode(ip);
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto trouble;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- /*
- * Grab all the latest timestamps from the Linux inode.
- */
- ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
- ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
- ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
-
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
- error = xfs_trans_commit(tp, 0);
- if (error)
- goto trouble;
- return;
-
-trouble:
- xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino);
-}
-
STATIC void
xfs_fs_evict_inode(
struct inode *inode)
{
xfs_inode_t *ip = XFS_I(inode);
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+
trace_xfs_evict_inode(ip);
truncate_inode_pages(&inode->i_data, 0);
@@ -937,22 +1014,6 @@ xfs_fs_evict_inode(
XFS_STATS_INC(vn_remove);
XFS_STATS_DEC(vn_active);
- /*
- * The iolock is used by the file system to coordinate reads,
- * writes, and block truncates. Up to this point the lock
- * protected concurrent accesses by users of the inode. But
- * from here forward we're doing some final processing of the
- * inode because we're done with it, and although we reuse the
- * iolock for protection it is really a distinct lock class
- * (in the lockdep sense) from before. To keep lockdep happy
- * (and basically indicate what we are doing), we explicitly
- * re-init the iolock here.
- */
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
- lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
- &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
-
xfs_inactive(ip);
}
@@ -989,7 +1050,7 @@ xfs_fs_put_super(
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
- xfs_syncd_stop(mp);
+
xfs_freesb(mp);
xfs_icsb_destroy_counters(mp);
xfs_destroy_mount_workqueues(mp);
@@ -1004,7 +1065,6 @@ xfs_fs_sync_fs(
int wait)
{
struct xfs_mount *mp = XFS_M(sb);
- int error;
/*
* Doing anything during the async pass would be counterproductive.
@@ -1012,17 +1072,14 @@ xfs_fs_sync_fs(
if (!wait)
return 0;
- error = xfs_quiesce_data(mp);
- if (error)
- return -error;
-
+ xfs_log_force(mp, XFS_LOG_SYNC);
if (laptop_mode) {
/*
* The disk must be active because we're syncing.
- * We schedule xfssyncd now (now that the disk is
+ * We schedule log work now (now that the disk is
* active) instead of later (when it might not be).
*/
- flush_delayed_work_sync(&mp->m_sync_work);
+ flush_delayed_work(&mp->m_log->l_work);
}
return 0;
@@ -1099,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
xfs_reserve_blocks(mp, &resblks, NULL);
}
+/*
+ * Trigger writeback of all the dirty metadata in the file system.
+ *
+ * This ensures that the metadata is written to their location on disk rather
+ * than just existing in transactions in the log. This means after a quiesce
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
+ *
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
+ */
+void
+xfs_quiesce_attr(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /* wait for all modifications to complete */
+ while (atomic_read(&mp->m_active_trans) > 0)
+ delay(100);
+
+ /* force the log to unpin objects from the now complete transactions */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /* reclaim inodes to do any IO before the freeze completes */
+ xfs_reclaim_inodes(mp, 0);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+ /* Push the superblock and write an unmount record */
+ error = xfs_log_sbcount(mp);
+ if (error)
+ xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+ "Frozen image may not be consistent.");
+ /*
+ * Just warn here till VFS can correctly support
+ * read-only remount without racing.
+ */
+ WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+ xfs_log_quiesce(mp);
+}
+
STATIC int
xfs_fs_remount(
struct super_block *sb,
@@ -1124,6 +1223,12 @@ xfs_fs_remount(
case Opt_nobarrier:
mp->m_flags &= ~XFS_MOUNT_BARRIER;
break;
+ case Opt_inode64:
+ mp->m_maxagi = xfs_set_inode64(mp);
+ break;
+ case Opt_inode32:
+ mp->m_maxagi = xfs_set_inode32(mp);
+ break;
default:
/*
* Logically we would return an error here to prevent
@@ -1173,20 +1278,18 @@ xfs_fs_remount(
* value if it is non-zero, otherwise go with the default.
*/
xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
}
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
/*
- * After we have synced the data but before we sync the
- * metadata, we need to free up the reserve block pool so that
- * the used block count in the superblock on disk is correct at
- * the end of the remount. Stash the current reserve pool size
- * so that if we get remounted rw, we can return it to the same
- * size.
+ * Before we sync the metadata, we need to free up the reserve
+ * block pool so that the used block count in the superblock on
+ * disk is correct at the end of the remount. Stash the current
+ * reserve pool size so that if we get remounted rw, we can
+ * return it to the same size.
*/
-
- xfs_quiesce_data(mp);
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1218,6 +1321,7 @@ xfs_fs_unfreeze(
struct xfs_mount *mp = XFS_M(sb);
xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
return 0;
}
@@ -1296,6 +1400,8 @@ xfs_fs_fill_super(
spin_lock_init(&mp->m_sb_lock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
+ INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+ INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
mp->m_super = sb;
sb->s_fs_info = mp;
@@ -1346,10 +1452,6 @@ xfs_fs_fill_super(
/*
* we must configure the block size in the superblock before we run the
* full mount process as the mount process can lookup and cache inodes.
- * For the same reason we must also initialise the syncd and register
- * the inode cache shrinker so that inodes can be reclaimed during
- * operations like a quotacheck that iterate all inodes in the
- * filesystem.
*/
sb->s_magic = XFS_SB_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1359,13 +1461,9 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
- error = xfs_syncd_init(mp);
- if (error)
- goto out_filestream_unmount;
-
error = xfs_mountfs(mp);
if (error)
- goto out_syncd_stop;
+ goto out_filestream_unmount;
root = igrab(VFS_I(mp->m_rootip));
if (!root) {
@@ -1383,8 +1481,7 @@ xfs_fs_fill_super(
}
return 0;
- out_syncd_stop:
- xfs_syncd_stop(mp);
+
out_filestream_unmount:
xfs_filestream_unmount(mp);
out_free_sb:
@@ -1404,7 +1501,6 @@ out_destroy_workqueues:
out_unmount:
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
- xfs_syncd_stop(mp);
goto out_free_sb;
}
@@ -1436,7 +1532,6 @@ xfs_fs_free_cached_objects(
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
- .dirty_inode = xfs_fs_dirty_inode,
.evict_inode = xfs_fs_evict_inode,
.drop_inode = xfs_fs_drop_inode,
.put_super = xfs_fs_put_super,
@@ -1491,13 +1586,9 @@ xfs_init_zones(void)
if (!xfs_da_state_zone)
goto out_destroy_btree_cur_zone;
- xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
- if (!xfs_dabuf_zone)
- goto out_destroy_da_state_zone;
-
xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
if (!xfs_ifork_zone)
- goto out_destroy_dabuf_zone;
+ goto out_destroy_da_state_zone;
xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
if (!xfs_trans_zone)
@@ -1514,9 +1605,8 @@ xfs_init_zones(void)
* size possible under XFS. This wastes a little bit of memory,
* but it is much faster.
*/
- xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
- (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
- NBWORD) * sizeof(int))), "xfs_buf_item");
+ xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item),
+ "xfs_buf_item");
if (!xfs_buf_item_zone)
goto out_destroy_log_item_desc_zone;
@@ -1561,8 +1651,6 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_trans_zone);
out_destroy_ifork_zone:
kmem_zone_destroy(xfs_ifork_zone);
- out_destroy_dabuf_zone:
- kmem_zone_destroy(xfs_dabuf_zone);
out_destroy_da_state_zone:
kmem_zone_destroy(xfs_da_state_zone);
out_destroy_btree_cur_zone:
@@ -1582,6 +1670,11 @@ xfs_init_zones(void)
STATIC void
xfs_destroy_zones(void)
{
+ /*
+ * Make sure all delayed rcu free are flushed before we
+ * destroy caches.
+ */
+ rcu_barrier();
kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_inode_zone);
kmem_zone_destroy(xfs_efi_zone);
@@ -1590,7 +1683,6 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_log_item_desc_zone);
kmem_zone_destroy(xfs_trans_zone);
kmem_zone_destroy(xfs_ifork_zone);
- kmem_zone_destroy(xfs_dabuf_zone);
kmem_zone_destroy(xfs_da_state_zone);
kmem_zone_destroy(xfs_btree_cur_zone);
kmem_zone_destroy(xfs_bmap_free_item_zone);
@@ -1604,16 +1696,6 @@ STATIC int __init
xfs_init_workqueues(void)
{
/*
- * We never want to the same work item to run twice, reclaiming inodes
- * or idling the log is not going to get any faster by multiple CPUs
- * competing for ressources. Use the default large max_active value
- * so that even lots of filesystems can perform these task in parallel.
- */
- xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
- if (!xfs_syncd_wq)
- return -ENOMEM;
-
- /*
* The allocation workqueue can be used in memory reclaim situations
* (writepage path), and parallelism is only limited by the number of
* AGs in all the filesystems mounted. Hence use the default large
@@ -1621,20 +1703,15 @@ xfs_init_workqueues(void)
*/
xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
if (!xfs_alloc_wq)
- goto out_destroy_syncd;
+ return -ENOMEM;
return 0;
-
-out_destroy_syncd:
- destroy_workqueue(xfs_syncd_wq);
- return -ENOMEM;
}
STATIC void
xfs_destroy_workqueues(void)
{
destroy_workqueue(xfs_alloc_wq);
- destroy_workqueue(xfs_syncd_wq);
}
STATIC int __init