From 6b520e0565422966cdf1c3759bd73df77b0f248c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 12 Dec 2011 15:51:45 -0500 Subject: vfs: fix the stupidity with i_dentry in inode destructors Seeing that just about every destructor got that INIT_LIST_HEAD() copied into it, there is no point whatsoever keeping this INIT_LIST_HEAD in inode_init_once(); the cost of taking it into inode_init_always() will be negligible for pipes and sockets and negative for everything else. Not to mention the removal of boilerplate code from ->destroy_inode() instances... Signed-off-by: Al Viro --- fs/exofs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index e6085ec192d..8addfe314dc 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -166,7 +166,6 @@ static struct inode *exofs_alloc_inode(struct super_block *sb) static void exofs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); } -- cgit v1.2.3 From 18bb1db3e7607e4a997d50991a6f9fa5b0f8722c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:41:39 -0400 Subject: switch vfs_mkdir() and ->mkdir() to umode_t vfs_mkdir() gets int, but immediately drops everything that might not fit into umode_t and that's the only caller of ->mkdir()... Signed-off-by: Al Viro --- fs/exofs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index b54c43775f1..ff1c8286cd6 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, return exofs_add_nondir(dentry, inode); } -static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int err = -EMLINK; -- cgit v1.2.3 From 4acdaf27ebe2034c342f3be57ef49aed1ad885ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:42:34 -0400 Subject: switch ->create() to umode_t vfs_create() ignores everything outside of 16bit subset of its mode argument; switching it to umode_t is obviously equivalent and it's the only caller of the method Signed-off-by: Al Viro --- fs/exofs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index ff1c8286cd6..58644544849 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -59,7 +59,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, +static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode = exofs_new_inode(dir, mode); -- cgit v1.2.3 From 1a67aafb5f72a436ca044293309fa7e6351d6a35 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:52:52 -0400 Subject: switch ->mknod() to umode_t Signed-off-by: Al Viro --- fs/exofs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 58644544849..9dbf0c30103 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -74,7 +74,7 @@ static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; -- cgit v1.2.3 From bef41c267e2b903f411c0f56236027f68553b721 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 03:07:49 -0400 Subject: exofs: propagate umode_t Signed-off-by: Al Viro --- fs/exofs/dir.c | 2 +- fs/exofs/exofs.h | 2 +- fs/exofs/inode.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index d0941c6a1f7..80405836ba6 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -234,7 +234,7 @@ static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { static inline void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 51f4b4c40f0..ca9d49665ef 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -154,7 +154,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); extern struct inode *exofs_iget(struct super_block *, unsigned long); -struct inode *exofs_new_inode(struct inode *, int); +struct inode *exofs_new_inode(struct inode *, umode_t); extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); extern void exofs_evict_inode(struct inode *); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f6dbf7768ce..ea5e1f97806 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1276,7 +1276,7 @@ static void create_done(struct ore_io_state *ios, void *p) /* * Set up a new inode and create an object for it on the OSD */ -struct inode *exofs_new_inode(struct inode *dir, int mode) +struct inode *exofs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; -- cgit v1.2.3 From 831c2dc5f47c1dc79c32229d75065ada1dcc66e1 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 29 Nov 2011 15:35:53 -0800 Subject: ore: FIX breakage when MISC_FILESYSTEMS is not set As Reported by Randy Dunlap When MISC_FILESYSTEMS is not enabled and NFS4.1 is: fs/built-in.o: In function `objio_alloc_io_state': objio_osd.c:(.text+0xcb525): undefined reference to `ore_get_rw_state' fs/built-in.o: In function `_write_done': objio_osd.c:(.text+0xcb58d): undefined reference to `ore_check_io' fs/built-in.o: In function `_read_done': ... When MISC_FILESYSTEMS, which is more of a GUI thing then anything else, is not selected. exofs/Kconfig is never examined during Kconfig, and it can not do it's magic stuff to automatically select everything needed. We must split exofs/Kconfig in two. The ore one is always included. And the exofs one is left in it's old place in the menu. [Needed for the 3.2.0 Kernel] CC: Stable Tree Reported-by: Randy Dunlap Signed-off-by: Boaz Harrosh --- fs/exofs/Kconfig | 11 ----------- fs/exofs/Kconfig.ore | 12 ++++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) create mode 100644 fs/exofs/Kconfig.ore (limited to 'fs/exofs') diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index da42f32c49b..86194b2f799 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,14 +1,3 @@ -# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects -# for every ORE user we do it like this. Any user should add itself here -# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are -# selected here, and we default to "ON". So in effect it is like been -# selected by any of the users. -config ORE - tristate - depends on EXOFS_FS || PNFS_OBJLAYOUT - select ASYNC_XOR - default SCSI_OSD_ULD - config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore new file mode 100644 index 00000000000..1ca7fb7b6ba --- /dev/null +++ b/fs/exofs/Kconfig.ore @@ -0,0 +1,12 @@ +# ORE - Objects Raid Engine (libore.ko) +# +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. +config ORE + tristate + depends on EXOFS_FS || PNFS_OBJLAYOUT + select ASYNC_XOR + default SCSI_OSD_ULD -- cgit v1.2.3 From ffefb8eaa367e8a5c14f779233d9da1fbc23d164 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 27 Dec 2011 19:23:36 +0200 Subject: ore: Fix crash in case of an IO error. The users of ore_check_io() expect the reported device (In case of error) to be indexed relative to the passed-in ore_components table, and not the logical dev index. This causes a crash inside objlayoutdriver in case of an IO error. [Bug in 3.2.0 Kernel] CC: Stable Tree Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d271ad83720..894f3e192e6 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -445,10 +445,10 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) u64 residual = ios->reading ? or->in.residual : or->out.residual; u64 offset = (ios->offset + ios->length) - residual; - struct ore_dev *od = ios->oc->ods[ - per_dev->dev - ios->oc->first_dev]; + unsigned dev = per_dev->dev - ios->oc->first_dev; + struct ore_dev *od = ios->oc->ods[dev]; - on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + on_dev_error(ios, od, dev, osi.osd_err_pri, offset, residual); } if (osi.osd_err_pri >= acumulated_osd_err) { -- cgit v1.2.3 From 361aba569f55dd159b850489a3538253afbb3973 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Dec 2011 19:14:23 +0200 Subject: ore: fix BUG_ON, too few sgs when reading When reading RAID5 files, in rare cases, we calculated too few sg segments. There should be two extra for the beginning and end partial units. Also "too few sg segments" should not be a BUG_ON there is all the mechanics in place to handle it, as a short read. So just return -ENOMEM and the rest of the code will gracefully split the IO. [Bug in 3.2.0 Kernel] CC: Stable Tree Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 2 +- fs/exofs/ore_raid.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 894f3e192e6..49cf230554a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -266,7 +266,7 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, /* first/last seg is split */ num_raid_units += layout->group_width; - sgs_per_dev = div_u64(num_raid_units, data_devs); + sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; } else { /* For Writes add parity pages array. */ max_par_pages = num_raid_units * pages_in_unit * diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 29c47e5c4a8..414a2dfd950 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -551,7 +551,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, unsigned cur_len) { if (ios->reading) { - BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + if (per_dev->cur_sg >= ios->sgs_per_dev) { + ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , + per_dev->cur_sg, ios->sgs_per_dev); + return -ENOMEM; + } _ore_add_sg_seg(per_dev, cur_len, true); } else { struct __stripe_pages_2d *sp2d = ios->sp2d; -- cgit v1.2.3 From 724577ca355795b0a25c93ccbeee927871ca1a77 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Dec 2011 19:21:45 +0200 Subject: ore: Must support none-PAGE-aligned IO NFS might send us offsets that are not PAGE aligned. So we must read in the reminder of the first/last pages, in cases we need it for Parity calculations. We only add an sg segments to read the partial page. But we don't mark it as read=true because it is a lock-for-write page. TODO: In some cases (IO spans a single unit) we can just adjust the raid_unit offset/length, but this is left for later Kernels. [Bug in 3.2.0 Kernel] CC: Stable Tree Signed-off-by: Boaz Harrosh --- fs/exofs/ore_raid.c | 72 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 12 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 414a2dfd950..d222c77cfa1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -328,8 +328,8 @@ static int _alloc_read_4_write(struct ore_io_state *ios) /* @si contains info of the to-be-inserted page. Update of @si should be * maintained by caller. Specificaly si->dev, si->obj_offset, ... */ -static int _add_to_read_4_write(struct ore_io_state *ios, - struct ore_striping_info *si, struct page *page) +static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, + struct page *page, unsigned pg_len) { struct request_queue *q; struct ore_per_dev_state *per_dev; @@ -366,17 +366,60 @@ static int _add_to_read_4_write(struct ore_io_state *ios, _ore_add_sg_seg(per_dev, gap, true); } q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); - added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); - if (unlikely(added_len != PAGE_SIZE)) { + added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, + si->obj_offset % PAGE_SIZE); + if (unlikely(added_len != pg_len)) { ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", per_dev->bio->bi_vcnt); return -ENOMEM; } - per_dev->length += PAGE_SIZE; + per_dev->length += pg_len; return 0; } +/* read the beginning of an unaligned first page */ +static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) +{ + struct ore_striping_info si; + unsigned pg_len; + + ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); + + pg_len = si.obj_offset % PAGE_SIZE; + si.obj_offset -= pg_len; + + ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", + _LLU(si.obj_offset), pg_len, page->index, si.dev); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +/* read the end of an incomplete last page */ +static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) +{ + struct ore_striping_info si; + struct page *page; + unsigned pg_len, p, c; + + ore_calc_stripe_info(ios->layout, *offset, 0, &si); + + p = si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, si.par_dev, si.dev); + page = ios->sp2d->_1p_stripes[p].pages[c]; + + pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); + *offset += pg_len; + + ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", + p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); + + BUG_ON(!page); + + return _add_to_r4w(ios, &si, page, pg_len); +} + static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) { struct bio_vec *bv; @@ -444,9 +487,13 @@ static int _read_4_write(struct ore_io_state *ios) struct page **pp = &_1ps->pages[c]; bool uptodate; - if (*pp) + if (*pp) { + if (ios->offset % PAGE_SIZE) + /* Read the remainder of the page */ + _add_to_r4w_first_page(ios, *pp); /* to-be-written pages start here */ goto read_last_stripe; + } *pp = ios->r4w->get_page(ios->private, offset, &uptodate); @@ -454,7 +501,7 @@ static int _read_4_write(struct ore_io_state *ios) return -ENOMEM; if (!uptodate) - _add_to_read_4_write(ios, &read_si, *pp); + _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); /* Mark read-pages to be cache_released */ _1ps->page_is_read[c] = true; @@ -465,8 +512,11 @@ static int _read_4_write(struct ore_io_state *ios) } read_last_stripe: - offset = ios->offset + (ios->length + PAGE_SIZE - 1) / - PAGE_SIZE * PAGE_SIZE; + offset = ios->offset + ios->length; + if (offset % PAGE_SIZE) + _add_to_r4w_last_page(ios, &offset); + /* offset will be aligned to next page */ + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) * bytes_in_stripe; if (offset == last_stripe_end) /* Optimize for the aligned case */ @@ -503,7 +553,7 @@ read_last_stripe: /* Mark read-pages to be cache_released */ _1ps->page_is_read[c] = true; if (!uptodate) - _add_to_read_4_write(ios, &read_si, page); + _add_to_r4w(ios, &read_si, page, PAGE_SIZE); } offset += PAGE_SIZE; @@ -616,8 +666,6 @@ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) return -ENOMEM; } - BUG_ON(ios->offset % PAGE_SIZE); - /* Round io down to last full strip */ first_stripe = div_u64(ios->offset, stripe_size); last_stripe = div_u64(ios->offset + ios->length, stripe_size); -- cgit v1.2.3 From da01636a6511c3bd0c1cf546c47b8e92a837a613 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 19:45:28 -0500 Subject: exofs: oops after late failure in mount We have already set ->s_root, so ->put_super() is going to be called. Freeing ->s_fs_info is a bloody bad idea when it's going to be dereferenced very shortly... Signed-off-by: Al Viro --- fs/exofs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/exofs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 8addfe314dc..d22cd168c6e 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -838,6 +838,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); if (ret) { EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); + dput(sb->s_root); + sb->s_root = NULL; goto free_sbi; } -- cgit v1.2.3 From b6d1f2dd61de7e696c75fbf39fb99cf41a189084 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 21 Nov 2011 17:43:11 +0300 Subject: exofs: fix endian conversion in exofs_sync_fs() fscb->s_numfiles is an __le64 field so we need to use cpu_to_le64() to get a little endian 64 bit on big endian systems. Signed-off-by: Dan Carpenter Signed-off-by: Boaz Harrosh --- fs/exofs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index d22cd168c6e..4710b5f686d 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -389,7 +389,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait) ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); memset(fscb, 0, ios->length); fscb->s_nextid = cpu_to_le64(sbi->s_nextid); - fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); + fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles); fscb->s_magic = cpu_to_le16(sb->s_magic); fscb->s_newfs = 0; fscb->s_version = EXOFS_FSCB_VER; -- cgit v1.2.3 From 3e57638bb1469ba2705456e9fd4063d1890341e1 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 1 Feb 2012 23:53:46 +0900 Subject: exofs: (trivial) Fix typo in super.c Correct spelling "faild" to "failed" in fs/exofs/super.c Signed-off-by: Masanari Iida Signed-off-by: Boaz Harrosh --- fs/exofs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 4710b5f686d..d9619a57780 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -565,7 +565,7 @@ int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); if (unlikely(!aoded)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", numdevs); return -ENOMEM; } -- cgit v1.2.3 From 72749a270b6d254b4a018e290b853c27edb2fa62 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 30 Jan 2012 10:59:49 +0300 Subject: exofs: Cap on the memcpy() size This data comes from the device, so probably it's fairly trustworthy but it makes the static checkers happy if we check it. [Boaz] the system_id_len is zero, if not present, or always OSD_SYSTEMID_LEN. So always copy OSD_SYSTEMID_LEN bytes. Signed-off-by: Dan Carpenter Signed-off-by: Boaz Harrosh --- fs/exofs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index d9619a57780..6c5397ad7a2 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -529,7 +529,8 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, struct osd_dev_info *odi) { odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); - memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); + if (likely(odi->systemid_len)) + memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN); odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); odi->osdname = dt_dev->osdname; -- cgit v1.2.3 From bf7014b67fd931003e5f12e6742b1fc5f6c0a045 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:29 +0800 Subject: exofs: remove the second argument of k[un]map_atomic() Ack-by: Boaz Harrosh Signed-off-by: Cong Wang --- fs/exofs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 80405836ba6..c61e62ac231 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -597,7 +597,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) goto fail; } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); de = (struct exofs_dir_entry *)kaddr; de->name_len = 1; de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); @@ -611,7 +611,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) de->inode_no = cpu_to_le64(parent->i_ino); memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); exofs_set_de_type(de, inode); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); err = exofs_commit_chunk(page, 0, chunk_size); fail: page_cache_release(page); -- cgit v1.2.3 From 8de52778798fe39660a8d6b26f290e0c93202761 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 6 Feb 2012 12:45:27 -0500 Subject: vfs: check i_nlink limits in vfs_{mkdir,rename_dir,link} New field of struct super_block - ->s_max_links. Maximal allowed value of ->i_nlink or 0; in the latter case all checks still need to be done in ->link/->mkdir/->rename instances. Note that this limit applies both to directoris and to non-directories. Signed-off-by: Al Viro --- fs/exofs/namei.c | 13 +------------ fs/exofs/super.c | 1 + 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 9dbf0c30103..fc7161d6bf6 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -143,9 +143,6 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, { struct inode *inode = old_dentry->d_inode; - if (inode->i_nlink >= EXOFS_LINK_MAX) - return -EMLINK; - inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); ihold(inode); @@ -156,10 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - int err = -EMLINK; - - if (dir->i_nlink >= EXOFS_LINK_MAX) - goto out; + int err; inode_inc_link_count(dir); @@ -275,11 +269,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out_dir; } else { - if (dir_de) { - err = -EMLINK; - if (new_dir->i_nlink >= EXOFS_LINK_MAX) - goto out_dir; - } err = exofs_add_link(new_dentry, old_inode); if (err) goto out_dir; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index d22cd168c6e..6cafcadfc3c 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -754,6 +754,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize = EXOFS_BLKSIZE; sb->s_blocksize_bits = EXOFS_BLKSHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_max_links = EXOFS_LINK_MAX; atomic_set(&sbi->s_curr_pending, 0); sb->s_bdev = NULL; sb->s_dev = 0; -- cgit v1.2.3 From 48fde701aff662559b38d9a609574068f22d00fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 22:15:13 -0500 Subject: switch open-coded instances of d_make_root() to new helper Signed-off-by: Al Viro --- fs/exofs/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 6cafcadfc3c..7f2b590a36b 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -819,9 +819,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto free_sbi; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); EXOFS_ERR("ERROR: get root inode failed\n"); ret = -ENOMEM; goto free_sbi; -- cgit v1.2.3 From dbd5768f87ff6fb0a4fe09c4d7b6c4a24de99430 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 May 2012 14:48:02 +0200 Subject: vfs: Rename end_writeback() to clear_inode() After we moved inode_sync_wait() from end_writeback() it doesn't make sense to call the function end_writeback() anymore. Rename it to clear_inode() which well says what the function really does - set I_CLEAR flag. Signed-off-by: Jan Kara Signed-off-by: Fengguang Wu --- fs/exofs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index ea5e1f97806..5badb0c039d 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1473,7 +1473,7 @@ void exofs_evict_inode(struct inode *inode) goto no_delete; inode->i_size = 0; - end_writeback(inode); + clear_inode(inode); /* if we are deleting an obj that hasn't been created yet, wait. * This also makes sure that create_done cannot be called with an @@ -1503,5 +1503,5 @@ void exofs_evict_inode(struct inode *inode) return; no_delete: - end_writeback(inode); + clear_inode(inode); } -- cgit v1.2.3 From 6abe4a87f7bc7978705c386dbba0ca0c7790b3ec Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 16 May 2012 14:22:21 +0300 Subject: exofs: Fix CRASH on very early IO errors. If at exofs_fill_super() we had an early termination do to any error, like an IO error while reading the super-block. We would crash inside exofs_free_sbi(). This is because sbi->oc.numdevs was set to 1, before we actually have a device table at all. Fix it by moving the sbi->oc.numdevs = 1 to after the allocation of the device table. Reported-by: Johannes Schild Stable: This is a bug since v3.2.0 CC: Stable Tree Signed-off-by: Boaz Harrosh --- fs/exofs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 735ca06430a..59e0849772b 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -745,7 +745,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->oc.numdevs = 1; sbi->oc.single_comp = EC_SINGLE_COMP; sbi->oc.comps = &sbi->one_comp; @@ -804,6 +803,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; ore_comp_set_dev(&sbi->oc, 0, od); + sbi->oc.numdevs = 1; } __sbi_read_stats(sbi); -- cgit v1.2.3 From 8b56a30caaf9bc1850784f196636c5f550cc7577 Mon Sep 17 00:00:00 2001 From: Sachin Bhamare Date: Wed, 14 Mar 2012 18:01:45 -0700 Subject: exofs: Add SYSFS info for autologin/pNFS export Introduce sysfs infrastructure for exofs cluster filesystem. Each OSD target shows up as below in the sysfs hierarchy: /sys/fs/exofs/_/devX Where _ is the unique identification of a Superblock. Where devX: 0 <= X < device_table_size. They are ordered in device-table order as specified to the mkfs.exofs command Each OSD device devX has following attributes : osdname - ReadOnly systemid - ReadOnly uri - Read/Write It is up to user-mode to update devX/uri for support of autologin. These sysfs information are used both for autologin as well as support for exporting exofs via a pNFSD server in user-mode. (.eg NFS-Ganesha) Signed-off-by: Sachin Bhamare Signed-off-by: Boaz Harrosh --- fs/exofs/Kbuild | 2 +- fs/exofs/exofs.h | 14 ++++ fs/exofs/super.c | 14 ++++ fs/exofs/sys.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 fs/exofs/sys.c (limited to 'fs/exofs') diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 352ba149d23..389ba8312d5 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -16,5 +16,5 @@ libore-y := ore.o ore_raid.o obj-$(CONFIG_ORE) += libore.o -exofs-y := inode.o file.o symlink.o namei.o dir.o super.o +exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index ca9d49665ef..fffe86fd7a4 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -56,6 +56,9 @@ struct exofs_dev { struct ore_dev ored; unsigned did; + unsigned urilen; + uint8_t *uri; + struct kobject ed_kobj; }; /* * our extension to the in-memory superblock @@ -73,6 +76,7 @@ struct exofs_sb_info { struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ + struct kobject s_kobj; /* holds per-sbi kobject */ }; /* @@ -176,6 +180,16 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj); int exofs_sbi_write_stats(struct exofs_sb_info *sbi); +/* sys.c */ +int exofs_sysfs_init(void); +void exofs_sysfs_uninit(void); +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev); +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi); +int exofs_sysfs_odev_add(struct exofs_dev *edev, + struct exofs_sb_info *sbi); +void exofs_sysfs_dbg_print(void); + /********************* * operation vectors * *********************/ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 59e0849772b..433783624d1 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -472,6 +472,7 @@ static void exofs_put_super(struct super_block *sb) _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); + exofs_sysfs_sb_del(sbi); bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; @@ -632,6 +633,12 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], (numdevs - 1) * sizeof(sbi->oc.ods[0])); + /* create sysfs subdir under which we put the device table + * And cluster layout. A Superblock is identified by the string: + * "dev[0].osdname"_"pid" + */ + exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]); + for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; struct osd_dev_info odi; @@ -657,6 +664,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, eds[i].ored.od = fscb_od; ++sbi->oc.numdevs; fscb_od = NULL; + exofs_sysfs_odev_add(&eds[i], sbi); continue; } @@ -682,6 +690,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, odi.osdname); goto out; } + exofs_sysfs_odev_add(&eds[i], sbi); /* TODO: verify other information is correct and FS-uuid * matches. Benny what did you say about device table @@ -844,6 +853,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } + exofs_sysfs_dbg_print(); _exofs_print_device("Mounting", opts->dev_name, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); @@ -1023,6 +1033,9 @@ static int __init init_exofs(void) if (err) goto out_d; + /* We don't fail if sysfs creation failed */ + exofs_sysfs_init(); + return 0; out_d: destroy_inodecache(); @@ -1032,6 +1045,7 @@ out: static void __exit exit_exofs(void) { + exofs_sysfs_uninit(); unregister_filesystem(&exofs_type); destroy_inodecache(); } diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c new file mode 100644 index 00000000000..e32bc919e4e --- /dev/null +++ b/fs/exofs/sys.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2012 + * Sachin Bhamare + * Boaz Harrosh + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License 2 as published by + * the Free Software Foundation. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the: + * Free Software Foundation + */ + +#include +#include + +#include "exofs.h" + +struct odev_attr { + struct attribute attr; + ssize_t (*show)(struct exofs_dev *, char *); + ssize_t (*store)(struct exofs_dev *, const char *, size_t); +}; + +static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->show ? a->show(edp, buf) : 0; +} + +static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->store ? a->store(edp, buf, len) : len; +} + +static const struct sysfs_ops odev_attr_ops = { + .show = odev_attr_show, + .store = odev_attr_store, +}; + + +static struct kset *exofs_kset; + +static ssize_t osdname_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname); +} + +static ssize_t systemid_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + memcpy(buf, odi->systemid, odi->systemid_len); + return odi->systemid_len; +} + +static ssize_t uri_show(struct exofs_dev *edp, char *buf) +{ + return snprintf(buf, edp->urilen, "%s", edp->uri); +} + +static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) +{ + edp->urilen = strlen(buf) + 1; + edp->uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); + strncpy(edp->uri, buf, edp->urilen); + return edp->urilen; +} + +#define OSD_ATTR(name, mode, show, store) \ + static struct odev_attr odev_attr_##name = \ + __ATTR(name, mode, show, store) + +OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL); +OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL); +OSD_ATTR(uri, S_IRWXU, uri_show, uri_store); + +static struct attribute *odev_attrs[] = { + &odev_attr_osdname.attr, + &odev_attr_systemid.attr, + &odev_attr_uri.attr, + NULL, +}; + +static struct kobj_type odev_ktype = { + .default_attrs = odev_attrs, + .sysfs_ops = &odev_attr_ops, +}; + +static struct kobj_type uuid_ktype = { +}; + +void exofs_sysfs_dbg_print() +{ +#ifdef CONFIG_EXOFS_DEBUG + struct kobject *k_name, *k_tmp; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + printk(KERN_INFO "%s: name %s ref %d\n", + __func__, kobject_name(k_name), + (int)atomic_read(&k_name->kref.refcount)); + } +#endif +} +/* + * This function removes all kobjects under exofs_kset + * At the end of it, exofs_kset kobject will have a refcount + * of 1 which gets decremented only on exofs module unload + */ +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi) +{ + struct kobject *k_name, *k_tmp; + struct kobject *s_kobj = &sbi->s_kobj; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + /* Remove all that are children of this SBI */ + if (k_name->parent == s_kobj) + kobject_put(k_name); + } + kobject_put(s_kobj); +} + +/* + * This function creates sysfs entries to hold the current exofs cluster + * instance (uniquely identified by osdname,pid tuple). + * This function gets called once per exofs mount instance. + */ +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev) +{ + struct kobject *s_kobj; + int retval = 0; + uint64_t pid = sbi->one_comp.obj.partition; + + /* allocate new uuid dirent */ + s_kobj = &sbi->s_kobj; + s_kobj->kset = exofs_kset; + retval = kobject_init_and_add(s_kobj, &uuid_ktype, + &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval); + return -ENOMEM; + } + return 0; +} + +int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi) +{ + struct kobject *d_kobj; + int retval = 0; + + /* create osd device group which contains following attributes + * osdname, systemid & uri + */ + d_kobj = &edev->ed_kobj; + d_kobj->kset = exofs_kset; + retval = kobject_init_and_add(d_kobj, &odev_ktype, + &sbi->s_kobj, "dev%u", edev->did); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "device dev%u\n", edev->did); + return retval; + } + return 0; +} + +int exofs_sysfs_init(void) +{ + exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj); + if (!exofs_kset) { + EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n"); + return -ENOMEM; + } + return 0; +} + +void exofs_sysfs_uninit(void) +{ + kset_unregister(exofs_kset); +} -- cgit v1.2.3 From b84297197ce60bb5da14cbd5516cf5130b0cd380 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Jun 2012 16:46:55 -0700 Subject: exofs: fix sparse non-ANSI function warning Fix sparse non-ANSI function warning: fs/exofs/sys.c:112:28: warning: non-ANSI function declaration of function 'exofs_sysfs_dbg_print' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/exofs/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/exofs') diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index e32bc919e4e..5a7b691e748 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -109,7 +109,7 @@ static struct kobj_type odev_ktype = { static struct kobj_type uuid_ktype = { }; -void exofs_sysfs_dbg_print() +void exofs_sysfs_dbg_print(void) { #ifdef CONFIG_EXOFS_DEBUG struct kobject *k_name, *k_tmp; -- cgit v1.2.3 From 9ff19309a9623f2963ac5a136782ea4d8b5d67fb Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 8 Jun 2012 01:19:07 +0300 Subject: ore: Fix NFS crash by supporting any unaligned RAID IO In RAID_5/6 We used to not permit an IO that it's end byte is not stripe_size aligned and spans more than one stripe. .i.e the caller must check if after submission the actual transferred bytes is shorter, and would need to resubmit a new IO with the remainder. Exofs supports this, and NFS was supposed to support this as well with it's short write mechanism. But late testing has exposed a CRASH when this is used with none-RPC layout-drivers. The change at NFS is deep and risky, in it's place the fix at ORE to lift the limitation is actually clean and simple. So here it is below. The principal here is that in the case of unaligned IO on both ends, beginning and end, we will send two read requests one like old code, before the calculation of the first stripe, and also a new site, before the calculation of the last stripe. If any "boundary" is aligned or the complete IO is within a single stripe. we do a single read like before. The code is clean and simple by splitting the old _read_4_write into 3 even parts: 1._read_4_write_first_stripe 2. _read_4_write_last_stripe 3. _read_4_write_execute And calling 1+3 at the same place as before. 2+3 before last stripe, and in the case of all in a single stripe then 1+2+3 is preformed additively. Why did I not think of it before. Well I had a strike of genius because I have stared at this code for 2 years, and did not find this simple solution, til today. Not that I did not try. This solution is much better for NFS than the previous supposedly solution because the short write was dealt with out-of-band after IO_done, which would cause for a seeky IO pattern where as in here we execute in order. At both solutions we do 2 separate reads, only here we do it within a single IO request. (And actually combine two writes into a single submission) NFS/exofs code need not change since the ORE API communicates the new shorter length on return, what will happen is that this case would not occur anymore. hurray!! [Stable this is an NFS bug since 3.2 Kernel should apply cleanly] CC: Stable Tree Signed-off-by: Boaz Harrosh --- fs/exofs/ore_raid.c | 67 ++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 31 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index d222c77cfa1..fff2070c675 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -461,16 +461,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) * ios->sp2d[p][*], xor is calculated the same way. These pages are * allocated/freed and don't go through cache */ -static int _read_4_write(struct ore_io_state *ios) +static int _read_4_write_first_stripe(struct ore_io_state *ios) { - struct ore_io_state *ios_read; struct ore_striping_info read_si; struct __stripe_pages_2d *sp2d = ios->sp2d; u64 offset = ios->si.first_stripe_start; - u64 last_stripe_end; - unsigned bytes_in_stripe = ios->si.bytes_in_stripe; - unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; - int ret; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; if (offset == ios->offset) /* Go to start collect $200 */ goto read_last_stripe; @@ -478,6 +474,9 @@ static int _read_4_write(struct ore_io_state *ios) min_p = _sp2d_min_pg(sp2d); max_p = _sp2d_max_pg(sp2d); + ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", + offset, ios->offset, min_p, max_p); + for (c = 0; ; c++) { ore_calc_stripe_info(ios->layout, offset, 0, &read_si); read_si.obj_offset += min_p * PAGE_SIZE; @@ -512,6 +511,18 @@ static int _read_4_write(struct ore_io_state *ios) } read_last_stripe: + return 0; +} + +static int _read_4_write_last_stripe(struct ore_io_state *ios) +{ + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; + offset = ios->offset + ios->length; if (offset % PAGE_SIZE) _add_to_r4w_last_page(ios, &offset); @@ -527,15 +538,15 @@ read_last_stripe: c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); - BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); - /* unaligned IO must be within a single stripe */ - if (min_p == sp2d->pages_in_unit) { /* Didn't do it yet */ min_p = _sp2d_min_pg(sp2d); max_p = _sp2d_max_pg(sp2d); } + ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", + offset, last_stripe_end, min_p, max_p); + while (offset < last_stripe_end) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; @@ -568,6 +579,15 @@ read_last_stripe: } read_it: + return 0; +} + +static int _read_4_write_execute(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + unsigned i; + int ret; + ios_read = ios->ios_read_4_write; if (!ios_read) return 0; @@ -591,6 +611,8 @@ read_it: } _mark_read4write_pages_uptodate(ios_read, ret); + ore_put_io_state(ios_read); + ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ return 0; } @@ -626,8 +648,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, /* If first stripe, Read in all read4write pages * (if needed) before we calculate the first parity. */ - _read_4_write(ios); + _read_4_write_first_stripe(ios); } + if (!cur_len) /* If last stripe r4w pages of last stripe */ + _read_4_write_last_stripe(ios); + _read_4_write_execute(ios); for (i = 0; i < num_pages; i++) { pages[i] = _raid_page_alloc(); @@ -654,34 +679,14 @@ int _ore_add_parity_unit(struct ore_io_state *ios, int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) { - struct ore_layout *layout = ios->layout; - if (ios->parity_pages) { + struct ore_layout *layout = ios->layout; unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; - unsigned stripe_size = ios->si.bytes_in_stripe; - u64 last_stripe, first_stripe; if (_sp2d_alloc(pages_in_unit, layout->group_width, layout->parity, &ios->sp2d)) { return -ENOMEM; } - - /* Round io down to last full strip */ - first_stripe = div_u64(ios->offset, stripe_size); - last_stripe = div_u64(ios->offset + ios->length, stripe_size); - - /* If an IO spans more then a single stripe it must end at - * a stripe boundary. The reminder at the end is pushed into the - * next IO. - */ - if (last_stripe != first_stripe) { - ios->length = last_stripe * stripe_size - ios->offset; - - BUG_ON(!ios->length); - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / - PAGE_SIZE; - ios->si.length = ios->length; /*make it consistent */ - } } return 0; } -- cgit v1.2.3 From 62b62ad873f2accad9222a4d7ffbe1e93f6714c1 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 8 Jun 2012 04:30:40 +0300 Subject: ore: Remove support of partial IO request (NFS crash) Do to OOM situations the ore might fail to allocate all resources needed for IO of the full request. If some progress was possible it would proceed with a partial/short request, for the sake of forward progress. Since this crashes NFS-core and exofs is just fine without it just remove this contraption, and fail. TODO: Support real forward progress with some reserved allocations of resources, such as mem pools and/or bio_sets [Bug since 3.2 Kernel] CC: Stable Tree CC: Benny Halevy Signed-off-by: Boaz Harrosh --- fs/exofs/ore.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 49cf230554a..24a49d47e93 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -735,13 +735,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) out: ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; - if (unlikely(ret)) { - if (length == ios->length) - return ret; - else - ios->length -= length; - } - return 0; + return ret; } int ore_create(struct ore_io_state *ios) -- cgit v1.2.3 From 537632e0a54a5355cdd0330911d18c3b773f9cf7 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 11 Jul 2012 15:27:13 +0300 Subject: ore: Unlock r4w pages in exact reverse order of locking The read-4-write pages are locked in address ascending order. But where unlocked in a way easiest for coding. Fix that, locks should be released in opposite order of locking, .i.e descending address order. I have not hit this dead-lock. It was found by inspecting the dbug print-outs. I suspect there is an higher lock at caller that protects us, but fix it regardless. Signed-off-by: Boaz Harrosh --- fs/exofs/ore_raid.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/exofs') diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index fff2070c675..5f376d14fdc 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -144,26 +144,26 @@ static void _sp2d_reset(struct __stripe_pages_2d *sp2d, { unsigned data_devs = sp2d->data_devs; unsigned group_width = data_devs + sp2d->parity; - unsigned p; + int p, c; if (!sp2d->needed) return; - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->write_count < group_width) { - unsigned c; + for (c = data_devs - 1; c >= 0; --c) + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - for (c = 0; c < data_devs; c++) - if (_1ps->page_is_read[c]) { - struct page *page = _1ps->pages[c]; + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; - r4w->put_page(priv, page); - _1ps->page_is_read[c] = false; - } + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } } + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); _1ps->write_count = 0; _1ps->tx = NULL; -- cgit v1.2.3