aboutsummaryrefslogtreecommitdiffstats
path: root/fs/buffer.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/buffer.c')
-rw-r--r--fs/buffer.c255
1 files changed, 160 insertions, 95 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index c7062c896d7..7a75c3e0fd5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
-inline void
-init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
bh->b_end_io = handler;
bh->b_private = private;
@@ -555,7 +554,7 @@ void emergency_thaw_all(void)
*/
int sync_mapping_buffers(struct address_space *mapping)
{
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
if (buffer_mapping == NULL || list_empty(&mapping->private_list))
return 0;
@@ -588,10 +587,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
struct address_space *buffer_mapping = bh->b_page->mapping;
mark_buffer_dirty(bh);
- if (!mapping->assoc_mapping) {
- mapping->assoc_mapping = buffer_mapping;
+ if (!mapping->private_data) {
+ mapping->private_data = buffer_mapping;
} else {
- BUG_ON(mapping->assoc_mapping != buffer_mapping);
+ BUG_ON(mapping->private_data != buffer_mapping);
}
if (!bh->b_assoc_map) {
spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +787,7 @@ void invalidate_inode_buffers(struct inode *inode)
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
spin_lock(&buffer_mapping->private_lock);
while (!list_empty(list))
@@ -811,7 +810,7 @@ int remove_inode_buffers(struct inode *inode)
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
spin_lock(&buffer_mapping->private_lock);
while (!list_empty(list)) {
@@ -850,13 +849,10 @@ try_again:
if (!bh)
goto no_grow;
- bh->b_bdev = NULL;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
- bh->b_state = 0;
- atomic_set(&bh->b_count, 0);
bh->b_size = size;
/* Link the buffer to its page */
@@ -911,17 +907,29 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
attach_page_buffers(page, head);
}
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+ sector_t retval = ~((sector_t)0);
+ loff_t sz = i_size_read(bdev->bd_inode);
+
+ if (sz) {
+ unsigned int sizebits = blksize_bits(size);
+ retval = (sz >> sizebits);
+ }
+ return retval;
+}
+
/*
* Initialise the state of a blockdev page's buffers.
*/
-static void
+static sector_t
init_page_buffers(struct page *page, struct block_device *bdev,
sector_t block, int size)
{
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
int uptodate = PageUptodate(page);
- sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode));
+ sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
do {
if (!buffer_mapped(bh)) {
@@ -936,33 +944,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
block++;
bh = bh->b_this_page;
} while (bh != head);
+
+ /*
+ * Caller needs to validate requested block against end of device.
+ */
+ return end_block;
}
/*
* Create the page-cache page that contains the requested block.
*
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
*/
-static struct page *
+static int
grow_dev_page(struct block_device *bdev, sector_t block,
- pgoff_t index, int size)
+ pgoff_t index, int size, int sizebits)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
struct buffer_head *bh;
+ sector_t end_block;
+ int ret = 0; /* Will call free_more_memory() */
page = find_or_create_page(inode->i_mapping, index,
(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
if (!page)
- return NULL;
+ return ret;
BUG_ON(!PageLocked(page));
if (page_has_buffers(page)) {
bh = page_buffers(page);
if (bh->b_size == size) {
- init_page_buffers(page, bdev, block, size);
- return page;
+ end_block = init_page_buffers(page, bdev,
+ index << sizebits, size);
+ goto done;
}
if (!try_to_free_buffers(page))
goto failed;
@@ -982,14 +998,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
*/
spin_lock(&inode->i_mapping->private_lock);
link_dev_buffers(page, bh);
- init_page_buffers(page, bdev, block, size);
+ end_block = init_page_buffers(page, bdev, index << sizebits, size);
spin_unlock(&inode->i_mapping->private_lock);
- return page;
-
+done:
+ ret = (block < end_block) ? 1 : -ENXIO;
failed:
unlock_page(page);
page_cache_release(page);
- return NULL;
+ return ret;
}
/*
@@ -999,7 +1015,6 @@ failed:
static int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
- struct page *page;
pgoff_t index;
int sizebits;
@@ -1023,22 +1038,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
bdevname(bdev, b));
return -EIO;
}
- block = index << sizebits;
+
/* Create a page with the proper size buffers.. */
- page = grow_dev_page(bdev, block, index, size);
- if (!page)
- return 0;
- unlock_page(page);
- page_cache_release(page);
- return 1;
+ return grow_dev_page(bdev, block, index, size, sizebits);
}
static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
- int ret;
- struct buffer_head *bh;
-
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
(size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1058,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
return NULL;
}
-retry:
- bh = __find_get_block(bdev, block, size);
- if (bh)
- return bh;
+ for (;;) {
+ struct buffer_head *bh;
+ int ret;
- ret = grow_buffers(bdev, block, size);
- if (ret == 0) {
- free_more_memory();
- goto retry;
- } else if (ret > 0) {
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
+
+ ret = grow_buffers(bdev, block, size);
+ if (ret < 0)
+ return NULL;
+ if (ret == 0)
+ free_more_memory();
}
- return NULL;
}
/*
@@ -1321,10 +1327,6 @@ EXPORT_SYMBOL(__find_get_block);
* which corresponds to the passed block_device, block and size. The
* returned buffer has its reference count incremented.
*
- * __getblk() cannot fail - it just keeps trying. If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block. Very weird.
- *
* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
* attempt is failing. FIXME, perhaps?
*/
@@ -1559,6 +1561,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
EXPORT_SYMBOL(unmap_underlying_metadata);
/*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+ return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+ BUG_ON(!PageLocked(page));
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+ return page_buffers(page);
+}
+
+/*
* NOTE! All mapped/uptodate combinations are valid:
*
* Mapped Uptodate Meaning
@@ -1595,19 +1619,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
sector_t block;
sector_t last_block;
struct buffer_head *bh, *head;
- const unsigned blocksize = 1 << inode->i_blkbits;
+ unsigned int blocksize, bbits;
int nr_underway = 0;
int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
WRITE_SYNC : WRITE);
- BUG_ON(!PageLocked(page));
-
- last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize,
+ head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
- }
/*
* Be very careful. We have no exclusion from __set_page_dirty_buffers
@@ -1619,9 +1637,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
* handle that here by just cleaning them.
*/
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- head = page_buffers(page);
bh = head;
+ blocksize = bh->b_size;
+ bbits = block_size_bits(blocksize);
+
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ last_block = (i_size_read(inode) - 1) >> bbits;
/*
* Get all the dirty buffers mapped to disk addresses and
@@ -1812,12 +1833,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
BUG_ON(to > PAGE_CACHE_SIZE);
BUG_ON(from > to);
- blocksize = 1 << inode->i_blkbits;
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- head = page_buffers(page);
+ head = create_page_buffers(page, inode, 0);
+ blocksize = head->b_size;
+ bbits = block_size_bits(blocksize);
- bbits = inode->i_blkbits;
block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1887,11 +1906,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
unsigned blocksize;
struct buffer_head *bh, *head;
- blocksize = 1 << inode->i_blkbits;
+ bh = head = page_buffers(page);
+ blocksize = bh->b_size;
- for(bh = head = page_buffers(page), block_start = 0;
- bh != head || !block_start;
- block_start=block_end, bh = bh->b_this_page) {
+ block_start = 0;
+ do {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
@@ -1901,7 +1920,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
mark_buffer_dirty(bh);
}
clear_buffer_new(bh);
- }
+
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
/*
* If this is a partial write which happened to make all buffers
@@ -2026,7 +2048,6 @@ EXPORT_SYMBOL(generic_write_end);
int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
unsigned long from)
{
- struct inode *inode = page->mapping->host;
unsigned block_start, block_end, blocksize;
unsigned to;
struct buffer_head *bh, *head;
@@ -2035,13 +2056,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
if (!page_has_buffers(page))
return 0;
- blocksize = 1 << inode->i_blkbits;
+ head = page_buffers(page);
+ blocksize = head->b_size;
to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
to = from + to;
if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
return 0;
- head = page_buffers(page);
bh = head;
block_start = 0;
do {
@@ -2074,18 +2095,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
- unsigned int blocksize;
+ unsigned int blocksize, bbits;
int nr, i;
int fully_mapped = 1;
- BUG_ON(!PageLocked(page));
- blocksize = 1 << inode->i_blkbits;
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- head = page_buffers(page);
+ head = create_page_buffers(page, inode, 0);
+ blocksize = head->b_size;
+ bbits = block_size_bits(blocksize);
- iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+ iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ lblock = (i_size_read(inode)+blocksize-1) >> bbits;
bh = head;
nr = 0;
i = 0;
@@ -2306,8 +2325,8 @@ EXPORT_SYMBOL(block_commit_write);
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
*
- * Direct callers of this function should call vfs_check_frozen() so that page
- * fault does not busyloop until the fs is thawed.
+ * Direct callers of this function should protect against filesystem freezing
+ * using sb_start_write() - sb_end_write() functions.
*/
int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
@@ -2339,18 +2358,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (unlikely(ret < 0))
goto out_unlock;
- /*
- * Freezing in progress? We check after the page is marked dirty and
- * with page lock held so if the test here fails, we are sure freezing
- * code will wait during syncing until the page fault is done - at that
- * point page will be dirty and unlocked so freezing code will write it
- * and writeprotect it again.
- */
set_page_dirty(page);
- if (inode->i_sb->s_frozen != SB_UNFROZEN) {
- ret = -EAGAIN;
- goto out_unlock;
- }
wait_on_page_writeback(page);
return 0;
out_unlock:
@@ -2365,12 +2373,16 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
int ret;
struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+ sb_start_pagefault(sb);
+
/*
- * This check is racy but catches the common case. The check in
- * __block_page_mkwrite() is reliable.
+ * Update file times before taking page lock. We may end up failing the
+ * fault so this update may be superfluous but who really cares...
*/
- vfs_check_frozen(sb, SB_FREEZE_WRITE);
+ file_update_time(vma->vm_file);
+
ret = __block_page_mkwrite(vma, vmf, get_block);
+ sb_end_pagefault(sb);
return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL(block_page_mkwrite);
@@ -2877,6 +2889,56 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
bio_put(bio);
}
+/*
+ * This allows us to do IO even on the odd last sectors
+ * of a device, even if the bh block size is some multiple
+ * of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device,
+ * and clear the end of the buffer head manually.
+ *
+ * Truly out-of-range accesses will turn into actual IO
+ * errors, this only handles the "we need to be able to
+ * do IO at the final sector" case.
+ */
+static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+{
+ sector_t maxsector;
+ unsigned bytes;
+
+ maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+ if (!maxsector)
+ return;
+
+ /*
+ * If the *whole* IO is past the end of the device,
+ * let it through, and the IO layer will turn it into
+ * an EIO.
+ */
+ if (unlikely(bio->bi_sector >= maxsector))
+ return;
+
+ maxsector -= bio->bi_sector;
+ bytes = bio->bi_size;
+ if (likely((bytes >> 9) <= maxsector))
+ return;
+
+ /* Uhhuh. We've got a bh that straddles the device size! */
+ bytes = maxsector << 9;
+
+ /* Truncate the bio.. */
+ bio->bi_size = bytes;
+ bio->bi_io_vec[0].bv_len = bytes;
+
+ /* ..and clear the end of the buffer for reads */
+ if ((rw & RW_MASK) == READ) {
+ void *kaddr = kmap_atomic(bh->b_page);
+ memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(bh->b_page);
+ }
+}
+
int submit_bh(int rw, struct buffer_head * bh)
{
struct bio *bio;
@@ -2913,6 +2975,9 @@ int submit_bh(int rw, struct buffer_head * bh)
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
+ /* Take care of bh's that straddle the end of the device */
+ guard_bh_eod(rw, bio, bh);
+
bio_get(bio);
submit_bio(rw, bio);