aboutsummaryrefslogtreecommitdiffstats
path: root/fs/reiserfs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/reiserfs
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'fs/reiserfs')
-rw-r--r--fs/reiserfs/Makefile36
-rw-r--r--fs/reiserfs/README161
-rw-r--r--fs/reiserfs/bitmap.c1169
-rw-r--r--fs/reiserfs/dir.c275
-rw-r--r--fs/reiserfs/do_balan.c1597
-rw-r--r--fs/reiserfs/file.c1408
-rw-r--r--fs/reiserfs/fix_node.c2518
-rw-r--r--fs/reiserfs/hashes.c209
-rw-r--r--fs/reiserfs/ibalance.c1058
-rw-r--r--fs/reiserfs/inode.c2846
-rw-r--r--fs/reiserfs/ioctl.c151
-rw-r--r--fs/reiserfs/item_ops.c788
-rw-r--r--fs/reiserfs/journal.c3876
-rw-r--r--fs/reiserfs/lbalance.c1222
-rw-r--r--fs/reiserfs/namei.c1491
-rw-r--r--fs/reiserfs/objectid.c206
-rw-r--r--fs/reiserfs/prints.c727
-rw-r--r--fs/reiserfs/procfs.c664
-rw-r--r--fs/reiserfs/resize.c182
-rw-r--r--fs/reiserfs/stree.c2073
-rw-r--r--fs/reiserfs/super.c2148
-rw-r--r--fs/reiserfs/tail_conversion.c276
-rw-r--r--fs/reiserfs/xattr.c1450
-rw-r--r--fs/reiserfs/xattr_acl.c571
-rw-r--r--fs/reiserfs/xattr_security.c69
-rw-r--r--fs/reiserfs/xattr_trusted.c81
-rw-r--r--fs/reiserfs/xattr_user.c99
27 files changed, 27351 insertions, 0 deletions
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
new file mode 100644
index 00000000000..3a59309f3ca
--- /dev/null
+++ b/fs/reiserfs/Makefile
@@ -0,0 +1,36 @@
+#
+# Makefile for the linux reiser-filesystem routines.
+#
+
+obj-$(CONFIG_REISERFS_FS) += reiserfs.o
+
+reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
+ super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
+ hashes.o tail_conversion.o journal.o resize.o \
+ item_ops.o ioctl.o procfs.o
+
+ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
+reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o
+endif
+
+ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
+reiserfs-objs += xattr_security.o
+endif
+
+ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
+reiserfs-objs += xattr_acl.o
+endif
+
+# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline
+# functions are used. This causes the compiler to advance the stack
+# pointer out of the available stack space, corrupting kernel space,
+# and causing a panic. Since this behavior only affects ppc32, this ifeq
+# will work around it. If any other architecture displays this behavior,
+# add it here.
+ifeq ($(CONFIG_PPC32),y)
+EXTRA_CFLAGS := -O1
+endif
+
+TAGS:
+ etags *.c
+
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
new file mode 100644
index 00000000000..90e1670e4e6
--- /dev/null
+++ b/fs/reiserfs/README
@@ -0,0 +1,161 @@
+[LICENSING]
+
+ReiserFS is hereby licensed under the GNU General
+Public License version 2.
+
+Source code files that contain the phrase "licensing governed by
+reiserfs/README" are "governed files" throughout this file. Governed
+files are licensed under the GPL. The portions of them owned by Hans
+Reiser, or authorized to be licensed by him, have been in the past,
+and likely will be in the future, licensed to other parties under
+other licenses. If you add your code to governed files, and don't
+want it to be owned by Hans Reiser, put your copyright label on that
+code so the poor blight and his customers can keep things straight.
+All portions of governed files not labeled otherwise are owned by Hans
+Reiser, and by adding your code to it, widely distributing it to
+others or sending us a patch, and leaving the sentence in stating that
+licensing is governed by the statement in this file, you accept this.
+It will be a kindness if you identify whether Hans Reiser is allowed
+to license code labeled as owned by you on your behalf other than
+under the GPL, because he wants to know if it is okay to do so and put
+a check in the mail to you (for non-trivial improvements) when he
+makes his next sale. He makes no guarantees as to the amount if any,
+though he feels motivated to motivate contributors, and you can surely
+discuss this with him before or after contributing. You have the
+right to decline to allow him to license your code contribution other
+than under the GPL.
+
+Further licensing options are available for commercial and/or other
+interests directly from Hans Reiser: hans@reiser.to. If you interpret
+the GPL as not allowing those additional licensing options, you read
+it wrongly, and Richard Stallman agrees with me, when carefully read
+you can see that those restrictions on additional terms do not apply
+to the owner of the copyright, and my interpretation of this shall
+govern for this license.
+
+Finally, nothing in this license shall be interpreted to allow you to
+fail to fairly credit me, or to remove my credits, without my
+permission, unless you are an end user not redistributing to others.
+If you have doubts about how to properly do that, or about what is
+fair, ask. (Last I spoke with him Richard was contemplating how best
+to address the fair crediting issue in the next GPL version.)
+
+[END LICENSING]
+
+Reiserfs is a file system based on balanced tree algorithms, which is
+described at http://devlinux.com/namesys.
+
+Stop reading here. Go there, then return.
+
+Send bug reports to yura@namesys.botik.ru.
+
+mkreiserfs and other utilities are in reiserfs/utils, or wherever your
+Linux provider put them. There is some disagreement about how useful
+it is for users to get their fsck and mkreiserfs out of sync with the
+version of reiserfs that is in their kernel, with many important
+distributors wanting them out of sync.:-) Please try to remember to
+recompile and reinstall fsck and mkreiserfs with every update of
+reiserfs, this is a common source of confusion. Note that some of the
+utilities cannot be compiled without accessing the balancing code
+which is in the kernel code, and relocating the utilities may require
+you to specify where that code can be found.
+
+Yes, if you update your reiserfs kernel module you do have to
+recompile your kernel, most of the time. The errors you get will be
+quite cryptic if you forget to do so.
+
+Real users, as opposed to folks who want to hack and then understand
+what went wrong, will want REISERFS_CHECK off.
+
+Hideous Commercial Pitch: Spread your development costs across other OS
+vendors. Select from the best in the world, not the best in your
+building, by buying from third party OS component suppliers. Leverage
+the software component development power of the internet. Be the most
+aggressive in taking advantage of the commercial possibilities of
+decentralized internet development, and add value through your branded
+integration that you sell as an operating system. Let your competitors
+be the ones to compete against the entire internet by themselves. Be
+hip, get with the new economic trend, before your competitors do. Send
+email to hans@reiser.to.
+
+To understand the code, after reading the website, start reading the
+code by reading reiserfs_fs.h first.
+
+Hans Reiser was the project initiator, primary architect, source of all
+funding for the first 5.5 years, and one of the programmers. He owns
+the copyright.
+
+Vladimir Saveljev was one of the programmers, and he worked long hours
+writing the cleanest code. He always made the effort to be the best he
+could be, and to make his code the best that it could be. What resulted
+was quite remarkable. I don't think that money can ever motivate someone
+to work the way he did, he is one of the most selfless men I know.
+
+Yura helps with benchmarking, coding hashes, and block pre-allocation
+code.
+
+Anatoly Pinchuk is a former member of our team who worked closely with
+Vladimir throughout the project's development. He wrote a quite
+substantial portion of the total code. He realized that there was a
+space problem with packing tails of files for files larger than a node
+that start on a node aligned boundary (there are reasons to want to node
+align files), and he invented and implemented indirect items and
+unformatted nodes as the solution.
+
+Konstantin Shvachko, with the help of the Russian version of a VC,
+tried to put me in a position where I was forced into giving control
+of the project to him. (Fortunately, as the person paying the money
+for all salaries from my dayjob I owned all copyrights, and you can't
+really force takeovers of sole proprietorships.) This was something
+curious, because he never really understood the value of our project,
+why we should do what we do, or why innovation was possible in
+general, but he was sure that he ought to be controlling it. Every
+innovation had to be forced past him while he was with us. He added
+two years to the time required to complete reiserfs, and was a net
+loss for me. Mikhail Gilula was a brilliant innovator who also left
+in a destructive way that erased the value of his contributions, and
+that he was shown much generosity just makes it more painful.
+
+Grigory Zaigralin was an extremely effective system administrator for
+our group.
+
+Igor Krasheninnikov was wonderful at hardware procurement, repair, and
+network installation.
+
+Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a
+textbook he got the algorithm from in the code. Note that his analysis
+of how we could use the hashing code in making 32 bit NFS cookies work
+was probably more important than the actual algorithm. Colin Plumb also
+contributed to it.
+
+Chris Mason dived right into our code, and in just a few months produced
+the journaling code that dramatically increased the value of ReiserFS.
+He is just an amazing programmer.
+
+Igor Zagorovsky is writing much of the new item handler and extent code
+for our next major release.
+
+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
+resizer, and is hard at work on implementing allocate on flush. SGI
+implemented allocate on flush before us for XFS, and generously took
+the time to convince me we should do it also. They are great people,
+and a great company.
+
+Yuri Shevchuk and Nikita Danilov are doing squid cache optimization.
+
+Vitaly Fertman is doing fsck.
+
+Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably
+the endian safe patches which allow ReiserFS to run on any platform
+supported by the Linux kernel.
+
+SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the
+Alpha PC Company made it possible for me to not have a day job
+anymore, and to dramatically increase our staffing. Ecila funded
+hypertext feature development, MP3.com funded journaling, SuSE funded
+core development, IntegratedLinux.com funded squid web cache
+appliances, bigstorage.com funded HSM, and the alpha PC company funded
+the alpha port. Many of these tasks were helped by sponsors other
+than the ones just named. SuSE has helped in much more than just
+funding....
+
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
new file mode 100644
index 00000000000..a4e2ed544bb
--- /dev/null
+++ b/fs/reiserfs/bitmap.c
@@ -0,0 +1,1169 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+/* Reiserfs block (de)allocator, bitmap-based. */
+
+#include <linux/config.h>
+#include <linux/time.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/errno.h>
+#include <linux/buffer_head.h>
+#include <linux/kernel.h>
+#include <linux/pagemap.h>
+#include <linux/reiserfs_fs_sb.h>
+#include <linux/reiserfs_fs_i.h>
+#include <linux/quotaops.h>
+
+#define PREALLOCATION_SIZE 9
+
+/* Different reiserfs block allocator options.
+ *
+ * SB_ALLOC_OPTS(s) is the option bit field (s_alloc_options.bits) kept in
+ * the reiserfs-private part of super block s.  Every _ALLOC_<name> constant
+ * below is the bit number of one allocator option inside that field.
+ *
+ * concentrating_formatted_nodes(s), displacing_large_files(s) and
+ * displacing_new_packing_localities(s) each test one fixed option bit.
+ *
+ * SET_OPTION(optname) reports the option name through reiserfs_warning()
+ * and then sets bit _ALLOC_<optname>.  It takes no super block argument:
+ * its expansion refers to a variable named `s', so it can only be used
+ * where such a variable is in scope.
+ *
+ * TEST_OPTION(optname, s) tests bit _ALLOC_<optname> for super block s.
+ */
+
+#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)
+
+#define _ALLOC_concentrating_formatted_nodes 0
+#define _ALLOC_displacing_large_files 1
+#define _ALLOC_displacing_new_packing_localities 2
+#define _ALLOC_old_hashed_relocation 3
+#define _ALLOC_new_hashed_relocation 4
+#define _ALLOC_skip_busy 5
+#define _ALLOC_displace_based_on_dirid 6
+#define _ALLOC_hashed_formatted_nodes 7
+#define _ALLOC_old_way 8
+#define _ALLOC_hundredth_slices 9
+#define _ALLOC_dirid_groups 10
+#define _ALLOC_oid_groups 11
+#define _ALLOC_packing_groups 12
+
+#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
+#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
+#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))
+
+#define SET_OPTION(optname) \
+ do { \
+ reiserfs_warning(s, "reiserfs: option \"%s\" is set", #optname); \
+ set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
+ } while(0)
+#define TEST_OPTION(optname, s) \
+ test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
+
+/*
+ * Translate a block number into its position in the bitmaps.
+ *
+ * One bitmap block holds (s->s_blocksize << 3) bits, one per block.
+ * On return *bmap_nr is the index of the bitmap block describing `block'
+ * and *offset is the bit number of `block' inside that bitmap block.
+ */
+static inline void get_bit_address (struct super_block * s,
+				    b_blocknr_t block, int * bmap_nr, int * offset)
+{
+	const unsigned long bits_per_bmap = s->s_blocksize << 3;
+
+	/* bit position inside the bitmap block (bits_per_bmap is used as a mask) */
+	*offset = block & (bits_per_bmap - 1);
+	/* which bitmap block the bit lives in */
+	*bmap_nr = block / bits_per_bmap;
+}
+
+#ifdef CONFIG_REISERFS_CHECK
+/*
+ * Debugging check (only built with CONFIG_REISERFS_CHECK): may `block' be
+ * freed or reused?
+ *
+ * The block must lie inside the filesystem, must not be one of the bitmap
+ * blocks, and its bitmap bit must currently equal `bit_value' (checked only
+ * for bit_value 0 or 1).  With bit_value == 0 the block must additionally
+ * not be the root block.
+ *
+ * Returns 1 when every check passes; otherwise prints a warning and
+ * returns 0.
+ */
+int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value)
+{
+	int bmap_nr, bit, n;
+
+	if (block == 0 || block >= SB_BLOCK_COUNT (s)) {
+		reiserfs_warning (s, "vs-4010: is_reusable: block number is out of range %lu (%u)",
+				  block, SB_BLOCK_COUNT (s));
+		return 0;
+	}
+
+	/* it can't be one of the bitmap blocks */
+	for (n = 0; n < SB_BMAP_NR (s); n ++) {
+		if (block != SB_AP_BITMAP (s)[n].bh->b_blocknr)
+			continue;
+		reiserfs_warning (s, "vs: 4020: is_reusable: "
+				  "bitmap block %lu(%u) can't be freed or reused",
+				  block, SB_BMAP_NR (s));
+		return 0;
+	}
+
+	get_bit_address (s, block, &bmap_nr, &bit);
+
+	if (bmap_nr >= SB_BMAP_NR (s)) {
+		reiserfs_warning (s, "vs-4030: is_reusable: there is no so many bitmap blocks: "
+				  "block=%lu, bitmap_nr=%d", block, bmap_nr);
+		return 0;
+	}
+
+	/* the bitmap bit has to agree with what the caller expects */
+	if (bit_value == 0 || bit_value == 1) {
+		int bit_is_set = reiserfs_test_le_bit (bit, SB_AP_BITMAP (s)[bmap_nr].bh->b_data) != 0;
+
+		if (bit_is_set != bit_value) {
+			reiserfs_warning (s, "vs-4040: is_reusable: corresponding bit of block %lu does not "
+					  "match required value (i==%d, j==%d) test_bit==%d",
+					  block, bmap_nr, bit,
+					  reiserfs_test_le_bit (bit, SB_AP_BITMAP (s)[bmap_nr].bh->b_data));
+			return 0;
+		}
+	}
+
+	if (bit_value == 0 && block == SB_ROOT_BLOCK (s)) {
+		reiserfs_warning (s, "vs-4050: is_reusable: this is root block (%u), "
+				  "it must be busy", SB_ROOT_BLOCK (s));
+		return 0;
+	}
+
+	return 1;
+}
+#endif /* CONFIG_REISERFS_CHECK */
+
+/*
+ * Ask the journal whether the block at bit `off' of bitmap block `bmap' is
+ * still held by it.
+ *
+ * Returns 0 if the journal does not know the block; *next is left alone.
+ * Returns 1 if it does; *next is then set to the next candidate offset to
+ * try: the hint supplied by reiserfs_in_journal() when it is non-zero,
+ * otherwise off + 1 so that the caller cannot loop on the same bit.
+ */
+static inline int is_block_in_journal (struct super_block * s, int bmap,
+				       int off, int *next)
+{
+	b_blocknr_t hint;
+
+	if (!reiserfs_in_journal (s, bmap, off, 1, &hint))
+		return 0;
+
+	if (hint) {
+		/* hint supplied */
+		*next = hint;
+		PROC_INFO_INC( s, scan_bitmap.in_journal_hint );
+	} else {
+		/* no hint: step one bit forward to avoid looping */
+		*next = off + 1;
+		PROC_INFO_INC( s, scan_bitmap.in_journal_nohint );
+	}
+	PROC_INFO_INC( s, scan_bitmap.retry );
+	return 1;
+}
+
+/* It searches for a window of zero (free) bits with given minimum and maximum
+ * lengths in one bitmap block and, when one is found, marks it allocated.
+ *
+ * th       - transaction handle; th->t_trans_id must be non-zero
+ * bmap_n   - index of the bitmap block to scan
+ * beg      - in: bit offset where the search starts;
+ *            out: when the return value is non-zero, first bit of the window
+ * boundary - search limit; a window never extends to bit `boundary' or beyond
+ * min, max - smallest and largest acceptable window length
+ * unfm     - when non-zero every candidate bit is also checked against the
+ *            journal via is_block_in_journal()
+ *
+ * Returns the length of the allocated window (at least `min', at most `max')
+ * or 0 when no suitable window exists.  On success the bits are set in the
+ * bitmap, the free counters of the bitmap and of the super block are
+ * decreased, and both buffers are marked dirty in the journal. */
+static int scan_bitmap_block (struct reiserfs_transaction_handle *th,
+ int bmap_n, int *beg, int boundary, int min, int max, int unfm)
+{
+ struct super_block *s = th->t_super;
+ struct reiserfs_bitmap_info *bi=&SB_AP_BITMAP(s)[bmap_n];
+ int end, next;
+ int org = *beg; /* original start offset, restored before a full retry */
+
+ BUG_ON (!th->t_trans_id);
+
+ RFALSE(bmap_n >= SB_BMAP_NR (s), "Bitmap %d is out of range (0..%d)",bmap_n, SB_BMAP_NR (s) - 1);
+ PROC_INFO_INC( s, scan_bitmap.bmap );
+/* this is unclear and lacks comments, explain how journal bitmaps
+ work here for the reader. Convey a sense of the design here. What
+ is a window? */
+/* - I mean `a window of zero bits' as in description of this function - Zam. */
+
+ if ( !bi ) { /* NOTE(review): bi is the address of an array element, so this
+ * test presumably never fires - confirm */
+ reiserfs_warning (s, "NULL bitmap info pointer for bitmap %d", bmap_n);
+ return 0;
+ }
+ if (buffer_locked (bi->bh)) {
+ PROC_INFO_INC( s, scan_bitmap.wait );
+ __wait_on_buffer (bi->bh);
+ }
+
+ while (1) {
+ cont:
+ if (bi->free_count < min)
+ return 0; // Fewer than `min' free blocks in this bitmap
+
+ /* search for a first zero bit -- beginning of a window */
+ *beg = reiserfs_find_next_zero_le_bit
+ ((unsigned long*)(bi->bh->b_data), boundary, *beg);
+
+ if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block
+ * cannot contain a zero window of minimum size */
+ return 0;
+ }
+
+ if (unfm && is_block_in_journal(s,bmap_n, *beg, beg))
+ continue; /* *beg was advanced by is_block_in_journal(); search again from there */
+ /* first zero bit found; we check next bits */
+ for (end = *beg + 1;; end ++) {
+ if (end >= *beg + max || end >= boundary || reiserfs_test_le_bit (end, bi->bh->b_data)) {
+ next = end;
+ break;
+ }
+ /* finding the other end of zero bit window requires looking into journal structures (in
+ * case of searching for free blocks for unformatted nodes) */
+ if (unfm && is_block_in_journal(s, bmap_n, end, &next))
+ break;
+ }
+
+ /* now (*beg) points to beginning of zero bits window,
+ * (end) points to one bit after the window end,
+ * (next) is where the search resumes if the window is too small */
+ if (end - *beg >= min) { /* it seems we have found window of proper size */
+ int i;
+ reiserfs_prepare_for_journal (s, bi->bh, 1);
+ /* try to set all blocks used checking are they still free */
+ for (i = *beg; i < end; i++) {
+ /* It seems that we should not check in journal again. */
+ if (reiserfs_test_and_set_le_bit (i, bi->bh->b_data)) {
+ /* bit was set by another process
+ * while we slept in prepare_for_journal() */
+ PROC_INFO_INC( s, scan_bitmap.stolen );
+ if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks,
+ * if length of this set is more or equal to `min' */
+ end = i;
+ break;
+ }
+ /* otherwise we clear all bits that were set ... */
+ while (--i >= *beg)
+ reiserfs_test_and_clear_le_bit (i, bi->bh->b_data);
+ reiserfs_restore_prepared_buffer (s, bi->bh);
+ *beg = org;
+ /* ... and search again in current block from beginning */
+ goto cont;
+ }
+ }
+ bi->free_count -= (end - *beg);
+ journal_mark_dirty (th, s, bi->bh);
+
+ /* free block count calculation */
+ reiserfs_prepare_for_journal (s, SB_BUFFER_WITH_SB(s), 1);
+ PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
+ journal_mark_dirty (th, s, SB_BUFFER_WITH_SB(s));
+
+ return end - (*beg);
+ } else {
+ *beg = next;
+ }
+ }
+}
+
+/*
+ * Map an id onto the index of a bitmap block.
+ *
+ * Ids 0..2 always map to bitmap 1.  Any other id is hashed with
+ * keyed_hash() and reduced modulo the number of bitmaps, with a result of
+ * 0 replaced by 1.  If the chosen index does not exist (which can only
+ * happen when there is a single bitmap) index 0 is returned instead.
+ */
+static int bmap_hash_id(struct super_block *s, u32 id) {
+	unsigned bm = 1;
+
+	if (id > 2) {
+		unsigned long hash = keyed_hash((char *)(&id), 4);
+
+		bm = hash % SB_BMAP_NR(s);
+		if (!bm)
+			bm = 1;
+	}
+
+	/* this can only be true when SB_BMAP_NR = 1 */
+	if (bm >= SB_BMAP_NR(s))
+		bm = 0;
+
+	return bm;
+}
+
+/*
+ * Hashes the id to a bitmap block (block group) and reports whether that
+ * group counts as used: returns 0 while more than 60% of the bits of the
+ * bitmap block are free, 1 otherwise.
+ */
+static inline int block_group_used(struct super_block *s, u32 id) {
+	int group = bmap_hash_id(s, id);
+
+	return SB_AP_BITMAP(s)[group].free_count <= ((s->s_blocksize << 3) * 60 / 100);
+}
+
+/*
+ * Choose the packing locality for a new object created in directory `dir'.
+ *
+ * Without the packing_groups option this is always dir's own object id.
+ * With it, dir's k_dir_id is used instead, unless that id is 1 or the
+ * block group it hashes to is already used (see block_group_used()).
+ *
+ * The packing is returned in disk byte order.
+ */
+u32 reiserfs_choose_packing(struct inode *dir) {
+	u32 parent_dir;
+
+	if (!TEST_OPTION(packing_groups, dir->i_sb))
+		return INODE_PKEY(dir)->k_objectid;
+
+	parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
+
+	/*
+	 * some versions of reiserfsck expect packing locality 1 to be
+	 * special
+	 */
+	if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
+		return INODE_PKEY(dir)->k_objectid;
+
+	return INODE_PKEY(dir)->k_dir_id;
+}
+
+/* Tries to find contiguous zero bit window (given size) in given region of
+ * bitmap and place new blocks there. Returns number of allocated blocks.
+ *
+ * th         - transaction handle; th->t_trans_id must be non-zero
+ * start      - in: block number where the search begins; out: rewritten on
+ *              every path that reaches the `ret' label, holding the first
+ *              allocated block when the return value is non-zero
+ * finish     - last block number of the region to search
+ * min, max   - smallest and largest acceptable window length
+ * unfm       - passed on to scan_bitmap_block(); also used by the skip_busy
+ *              policy below
+ * file_block - only consulted by the skip_busy policy below */
+static int scan_bitmap (struct reiserfs_transaction_handle *th,
+ b_blocknr_t *start, b_blocknr_t finish,
+ int min, int max, int unfm, unsigned long file_block)
+{
+ int nr_allocated=0;
+ struct super_block * s = th->t_super;
+ /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr
+ * - Hans, it is not a block number - Zam. */
+
+ int bm, off;
+ int end_bm, end_off;
+ int off_max = s->s_blocksize << 3; /* number of bits in one bitmap block */
+
+ BUG_ON (!th->t_trans_id);
+
+ PROC_INFO_INC( s, scan_bitmap.call );
+ if ( SB_FREE_BLOCKS(s) <= 0)
+ return 0; // No point in looking for more free blocks
+
+ get_bit_address (s, *start, &bm, &off);
+ get_bit_address (s, finish, &end_bm, &end_off);
+ if (bm > SB_BMAP_NR(s))
+ return 0;
+ if (end_bm > SB_BMAP_NR(s))
+ end_bm = SB_BMAP_NR(s);
+
+ /* First pass (skip_busy policy): a bitmap is scanned only when more
+ * than 10% of its bits are free, or when the search starts at a
+ * non-zero offset inside it and the request is either not for an
+ * unformatted node or has a non-zero file_block. This pass is made
+ * only while more than 1/20 of the filesystem's blocks are free; if
+ * it is not made, or finds nothing, the second loop below scans
+ * every bitmap in the range without that restriction.
+ *
+ * We do this so that files that grow later still have space close to
+ * their original allocation. This improves locality, and presumably
+ * performance as a result.
+ *
+ * This is only an allocation policy and does not make up for getting a
+ * bad hint. Decent hinting must be implemented for this to work well.
+ */
+ if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
+ for (;bm < end_bm; bm++, off = 0) {
+ if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
+ nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
+ if (nr_allocated)
+ goto ret;
+ }
+ /* we know from above that start is a reasonable number */
+ get_bit_address (s, *start, &bm, &off);
+ }
+
+ for (;bm < end_bm; bm++, off = 0) {
+ nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
+ if (nr_allocated)
+ goto ret;
+ }
+
+ nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); /* last bitmap: stop after bit end_off.
+ * NOTE(review): the checks above use `>' rather than `>=', so bm can
+ * equal SB_BMAP_NR(s) here, a value scan_bitmap_block() asserts
+ * against - confirm callers never pass such a range */
+
+ ret:
+ *start = bm * off_max + off;
+ return nr_allocated;
+
+}
+
+/*
+ * Mark `block' free: clear its bit in the bitmap, bump the free counters of
+ * the bitmap block and of the super block, and journal both buffers.
+ *
+ * A block number beyond the last bitmap only produces a warning.  If the
+ * bit was already clear a warning is printed, but the counters are still
+ * incremented.  When for_unformatted is non-zero one block is also given
+ * back to the quota of `inode'.
+ */
+static void _reiserfs_free_block (struct reiserfs_transaction_handle *th,
+				  struct inode *inode, b_blocknr_t block,
+				  int for_unformatted)
+{
+	struct super_block * s = th->t_super;
+	struct reiserfs_super_block * disk_sb;
+	struct buffer_head * sb_bh;
+	struct reiserfs_bitmap_info * bi;
+	int bmap_nr, bit;
+
+	BUG_ON (!th->t_trans_id);
+
+	PROC_INFO_INC( s, free_block );
+
+	disk_sb = SB_DISK_SUPER_BLOCK (s);
+	sb_bh = SB_BUFFER_WITH_SB (s);
+
+	get_bit_address (s, block, &bmap_nr, &bit);
+
+	if (bmap_nr >= sb_bmap_nr (disk_sb)) {
+		reiserfs_warning (s, "vs-4075: reiserfs_free_block: "
+				  "block %lu is out of range on %s",
+				  block, reiserfs_bdevname (s));
+		return;
+	}
+
+	bi = &SB_AP_BITMAP(s)[bmap_nr];
+
+	/* bitmap block: clear the bit and account for the freed block */
+	reiserfs_prepare_for_journal(s, bi->bh, 1);
+	if (!reiserfs_test_and_clear_le_bit (bit, bi->bh->b_data)) {
+		reiserfs_warning (s, "vs-4080: reiserfs_free_block: "
+				  "free_block (%s:%lu)[dev:blocknr]: bit already cleared",
+				  reiserfs_bdevname (s), block);
+	}
+	bi->free_count ++;
+	journal_mark_dirty (th, s, bi->bh);
+
+	/* super block: one more free block */
+	reiserfs_prepare_for_journal(s, sb_bh, 1);
+	set_sb_free_blocks( disk_sb, sb_free_blocks(disk_sb) + 1 );
+	journal_mark_dirty (th, s, sb_bh);
+
+	if (for_unformatted)
+		DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
+}
+
+/*
+ * Free `block' within transaction `th': the block is first reported to the
+ * journal with journal_mark_freed() and then released in the bitmap by
+ * _reiserfs_free_block().
+ */
+void reiserfs_free_block (struct reiserfs_transaction_handle *th,
+			  struct inode *inode, b_blocknr_t block,
+			  int for_unformatted)
+{
+	BUG_ON (!th->t_trans_id);
+
+	RFALSE(!th->t_super, "vs-4061: trying to free block on nonexistent device");
+	RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4071: can not free such block");
+
+	/* mark it before we clear it, just in case */
+	journal_mark_freed(th, th->t_super, block);
+	_reiserfs_free_block(th, inode, block, for_unformatted);
+}
+
+/*
+ * Release one preallocated block of `inode'.  Preallocated blocks don't
+ * need to be run through journal_mark_freed, so the block is handed straight
+ * to _reiserfs_free_block() (as an unformatted block).
+ */
+static void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th,
+					  struct inode *inode, b_blocknr_t block)
+{
+	struct super_block *s = th->t_super;
+
+	RFALSE(!s, "vs-4060: trying to free block on nonexistent device");
+	RFALSE(is_reusable (s, block, 1) == 0, "vs-4070: can not free such block");
+	BUG_ON (!th->t_trans_id);
+
+	_reiserfs_free_block(th, inode, block, 1);
+}
+
+/*
+ * Give back every block still preallocated for the inode behind `ei'.
+ *
+ * The blocks i_prealloc_block .. i_prealloc_block + i_prealloc_count - 1 are
+ * freed one by one; if at least one was freed the stat data is updated.
+ * Afterwards i_prealloc_block holds its original value again,
+ * i_prealloc_count is no longer positive and the inode is unlinked from
+ * the prealloc list.
+ */
+static void __discard_prealloc (struct reiserfs_transaction_handle * th,
+				struct reiserfs_inode_info *ei)
+{
+	struct inode *inode = &ei->vfs_inode;
+	unsigned long first_block = ei->i_prealloc_block;
+	int freed_any = 0;
+
+	BUG_ON (!th->t_trans_id);
+#ifdef CONFIG_REISERFS_CHECK
+	if (ei->i_prealloc_count < 0)
+		reiserfs_warning (th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.", __FUNCTION__ );
+#endif
+	for (; ei->i_prealloc_count > 0; ei->i_prealloc_count --) {
+		reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block);
+		ei->i_prealloc_block++;
+		freed_any = 1;
+	}
+
+	if (freed_any)
+		reiserfs_update_sd(th, inode);
+
+	ei->i_prealloc_block = first_block;
+	list_del_init(&(ei->i_prealloc_list));
+}
+
+/*
+ * Discard the preallocated blocks of `inode', if it has any.
+ *
+ * FIXME: It should be inline function
+ */
+void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th,
+				struct inode *inode)
+{
+	struct reiserfs_inode_info *info = REISERFS_I(inode);
+
+	BUG_ON (!th->t_trans_id);
+
+	if (!info->i_prealloc_count)
+		return;
+
+	__discard_prealloc(th, info);
+}
+
+/*
+ * Discard the preallocated blocks of every inode on the journal's prealloc
+ * list.  __discard_prealloc() unlinks each inode it handles, so the loop
+ * keeps taking the first list entry until the list is empty.
+ */
+void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th)
+{
+	struct list_head * head = &SB_JOURNAL(th->t_super)->j_prealloc_list;
+	struct reiserfs_inode_info * first;
+
+	BUG_ON (!th->t_trans_id);
+
+	for (; !list_empty(head); ) {
+		first = list_entry(head->next, struct reiserfs_inode_info, i_prealloc_list);
+#ifdef CONFIG_REISERFS_CHECK
+		if (!first->i_prealloc_count) {
+			reiserfs_warning (th->t_super, "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", __FUNCTION__);
+		}
+#endif
+		__discard_prealloc(th, first);
+	}
+}
+
+/*
+ * Switch on the default allocator options for super block `s':
+ * skip_busy, dirid_groups and packing_groups.
+ */
+void reiserfs_init_alloc_options (struct super_block *s)
+{
+	static const int default_bits[] = {
+		_ALLOC_skip_busy,
+		_ALLOC_dirid_groups,
+		_ALLOC_packing_groups,
+	};
+	unsigned int n;
+
+	for (n = 0; n < sizeof(default_bits) / sizeof(default_bits[0]); n++)
+		set_bit (default_bits[n], &SB_ALLOC_OPTS(s));
+}
+
+/*
+ * Block allocator related options are parsed here.
+ *
+ * `options' is a ':'-separated list of "name" or "name=value" items; the
+ * string is modified while it is parsed.  All option bits are cleared
+ * first, so the defaults set by reiserfs_init_alloc_options() are dropped.
+ *
+ * Options taking a value (the default applies when the value is missing or
+ * empty):
+ *   concentrating_formatted_nodes  percent, default 10; border becomes
+ *                                  100 / percent, or 10 if percent is not
+ *                                  in 1..100
+ *   displacing_large_files         large_file_size, default 16
+ *   preallocmin                    default 4
+ *   preallocsize                   default PREALLOCATION_SIZE
+ * Every other known option only sets its flag bit.
+ *
+ * Returns 0 on success and 1 as soon as an unknown option is met (options
+ * parsed before it stay in effect).
+ */
+int reiserfs_parse_alloc_options(struct super_block * s, char * options)
+{
+	/* options that merely set a flag bit; the list ends with a NULL name */
+	static const struct {
+		const char *name;
+		int bit;
+	} flag_options[] = {
+		{ "displacing_new_packing_localities", _ALLOC_displacing_new_packing_localities },
+		{ "old_hashed_relocation", _ALLOC_old_hashed_relocation },
+		{ "new_hashed_relocation", _ALLOC_new_hashed_relocation },
+		{ "dirid_groups", _ALLOC_dirid_groups },
+		{ "oid_groups", _ALLOC_oid_groups },
+		{ "packing_groups", _ALLOC_packing_groups },
+		{ "hashed_formatted_nodes", _ALLOC_hashed_formatted_nodes },
+		{ "skip_busy", _ALLOC_skip_busy },
+		{ "hundredth_slices", _ALLOC_hundredth_slices },
+		{ "old_way", _ALLOC_old_way },
+		{ "displace_based_on_dirid", _ALLOC_displace_based_on_dirid },
+		{ NULL, 0 }
+	};
+	char * this_char, * value;
+	int n;
+
+	REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */
+
+	while ( (this_char = strsep (&options, ":")) != NULL ) {
+		/* split "name=value" in place */
+		if ((value = strchr (this_char, '=')) != NULL)
+			*value++ = 0;
+
+		if (!strcmp(this_char, "concentrating_formatted_nodes")) {
+			int temp;
+			SET_OPTION(concentrating_formatted_nodes);
+			temp = (value && *value) ? simple_strtoul (value, &value, 0) : 10;
+			if (temp <= 0 || temp > 100)
+				REISERFS_SB(s)->s_alloc_options.border = 10;
+			else
+				REISERFS_SB(s)->s_alloc_options.border = 100 / temp;
+			continue;
+		}
+
+		if (!strcmp(this_char, "displacing_large_files")) {
+			SET_OPTION(displacing_large_files);
+			REISERFS_SB(s)->s_alloc_options.large_file_size =
+				(value && *value) ? simple_strtoul (value, &value, 0) : 16;
+			continue;
+		}
+
+		if (!strcmp(this_char, "preallocmin")) {
+			REISERFS_SB(s)->s_alloc_options.preallocmin =
+				(value && *value) ? simple_strtoul (value, &value, 0) : 4;
+			continue;
+		}
+
+		if (!strcmp(this_char, "preallocsize")) {
+			REISERFS_SB(s)->s_alloc_options.preallocsize =
+				(value && *value) ? simple_strtoul (value, &value, 0) : PREALLOCATION_SIZE;
+			continue;
+		}
+
+		/* plain flags */
+		for (n = 0; flag_options[n].name; n++)
+			if (!strcmp(this_char, flag_options[n].name))
+				break;
+
+		if (flag_options[n].name) {
+			reiserfs_warning(s, "reiserfs: option \"%s\" is set", flag_options[n].name);
+			set_bit(flag_options[n].bit, &SB_ALLOC_OPTS(s));
+			continue;
+		}
+
+		reiserfs_warning (s, "zam-4001: %s : unknown option - %s",
+				  __FUNCTION__ , this_char);
+		return 1;
+	}
+
+	reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
+	return 0;
+}
+
+/*
+ * Set hint->search_start to a hashed position inside [hint->beg, hint->end).
+ *
+ * The 4 bytes hashed are hint->key.k_dir_id for formatted nodes and for
+ * hints without an inode; otherwise the inode's k_dir_id when the
+ * displace_based_on_dirid option is set, else the inode's k_objectid.
+ */
+static inline void new_hashed_relocation (reiserfs_blocknr_hint_t * hint)
+{
+	char * hash_in;
+
+	if (hint->formatted_node || !hint->inode)
+		hash_in = (char*)&hint->key.k_dir_id;
+	else if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
+	else
+		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
+
+	hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
+}
+
+/*
+ * Relocation based on dirid: the directory id is hashed to a bitmap block
+ * (block group) and the search starts at the first block of that group.
+ *
+ * The dirid comes from the inode when one is supplied, else from hint->key
+ * for formatted nodes.  A dirid of 0 leaves the hint untouched.  When an
+ * inode is supplied the start is moved s_blocksize/2 blocks into the
+ * group, which gives a portion of the block group to metadata.
+ */
+static void
+dirid_groups (reiserfs_blocknr_hint_t *hint)
+{
+	struct super_block *sb = hint->th->t_super;
+	unsigned long group_start;
+	__u32 dirid = 0;
+
+	if (hint->inode)
+		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+	else if (hint->formatted_node)
+		dirid = hint->key.k_dir_id;
+
+	if (!dirid)
+		return;
+
+	group_start = bmap_hash_id(sb, dirid) * (sb->s_blocksize << 3);
+
+	/* give a portion of the block group to metadata */
+	if (hint->inode)
+		group_start += sb->s_blocksize/2;
+
+	hint->search_start = group_start;
+}
+
+/*
+ * Relocation based on oid: the object id of the inode is hashed to a bitmap
+ * block (block group) and the search starts at the first block of that
+ * group.  Hints without an inode are left untouched.
+ */
+static void
+oid_groups (reiserfs_blocknr_hint_t *hint)
+{
+	struct super_block *sb;
+	unsigned long group_start;
+	__u32 dirid;
+
+	if (!hint->inode)
+		return;
+
+	sb = hint->inode->i_sb;
+	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+
+	if (dirid <= 2) {
+		/* keep the root dir and its first set of subdirs close to
+		 * the start of the disk
+		 */
+		group_start = (sb->s_blocksize << 3);
+	} else {
+		__u32 oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
+
+		group_start = bmap_hash_id(sb, oid) * (sb->s_blocksize << 3);
+	}
+
+	hint->search_start = group_start;
+}
+
+/*
+ * Derive hint->search_start from the tree position stored in hint->path.
+ *
+ * Without a path nothing is changed and 0 is returned (the caller then
+ * relies on the search_start it supplied).  Otherwise search_start is first
+ * set to the block number of the last buffer of the path.  If the hint is
+ * not for a formatted node and the item is an indirect item, the item is
+ * walked to the left from the current position; the first non-zero block
+ * pointer found becomes search_start.
+ *
+ * Returns 1 if it finds an indirect item and gets valid hint info
+ * from it, otherwise 0.
+ */
+static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
+{
+	struct path * path = hint->path;
+	struct buffer_head * bh;
+	struct item_head * ih;
+	__u32 * item;
+	int pos;
+
+	if (!path)
+		return 0;
+
+	bh = get_last_bh(path);
+	RFALSE( !bh, "green-4002: Illegal path specified to get_left_neighbor");
+	ih = get_ih(path);
+	pos = path->pos_in_item;
+	item = get_item (path);
+
+	hint->search_start = bh->b_blocknr;
+
+	if (hint->formatted_node || !is_indirect_le_ih (ih))
+		return 0;
+
+	/* for indirect item: go to left and look for the first non-hole entry
+	   in the indirect item */
+	if (pos == I_UNFM_NUM (ih))
+		pos--;
+
+	for (; pos >= 0; pos--) {
+		int blocknr = get_block_num(item, pos);
+
+		if (blocknr) {
+			hint->search_start = blocknr;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Split the device at SB_BLOCK_COUNT / s_alloc_options.border: formatted
+ * nodes are confined to the part before the border (hint->end), everything
+ * else starts at the border (hint->beg).  This is not to say it is good code
+ * to do so, but the effect should be measured.
+ */
+static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t *hint)
+{
+	b_blocknr_t split;
+
+	split = SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
+
+	if (!hint->formatted_node) {
+		hint->beg = split;
+		return;
+	}
+	hint->end = split - 1;
+}
+
+/*
+ * Pick search_start inside [beg, end) by hashing four bytes of the inode key:
+ * the directory id when the displace_based_on_dirid option is set, the
+ * object id otherwise.
+ */
+static inline void displace_large_file(reiserfs_blocknr_hint_t *hint)
+{
+	char * hash_in;
+
+	if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
+	else
+		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
+
+	hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
+}
+
+/*
+ * Hash a formatted node into [beg, end).  The hash input is four bytes taken
+ * from hint->key.k_dir_id when there is no inode, else from the inode key
+ * (k_dir_id or k_objectid, depending on displace_based_on_dirid).
+ */
+static inline void hash_formatted_node(reiserfs_blocknr_hint_t *hint)
+{
+	char * key_bytes;
+
+	if (hint->inode) {
+		if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+			key_bytes = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
+		else
+			key_bytes = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
+	} else {
+		key_bytes = (char*)&hint->key.k_dir_id;
+	}
+
+	hint->search_start = hint->beg + keyed_hash(key_bytes, 4) % (hint->end - hint->beg);
+}
+
+/* Nonzero exactly when hint->block equals the large_file_size allocation option. */
+static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *hint)
+{
+	struct super_block *s = hint->th->t_super;
+
+	return REISERFS_SB(s)->s_alloc_options.large_file_size == hint->block;
+}
+
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+/*
+ * Clear the transaction's displace_new_blocks flag and hash the object id of
+ * hint->key into [beg, end) to get the new search_start.
+ */
+static inline void displace_new_packing_locality (reiserfs_blocknr_hint_t *hint)
+{
+	char * oid_bytes = (char*)(&hint->key.k_objectid);
+
+	hint->th->displace_new_blocks = 0;
+	hint->search_start = hint->beg + keyed_hash(oid_bytes,4) % (hint->end - hint->beg);
+}
+#endif
+
+/*
+ * Hash the (cpu-order) directory id of the inode into the search range and
+ * move search_start forward to that position if it lies beyond the current
+ * value.  Returns 0 without touching the hint for formatted nodes or when
+ * there is no inode, 1 otherwise.
+ */
+static inline int old_hashed_relocation (reiserfs_blocknr_hint_t * hint)
+{
+	if (!hint->formatted_node && hint->inode != NULL) {
+		u32 dir_id = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
+		b_blocknr_t candidate;
+
+		candidate = hint->beg + (u32) keyed_hash(((char *) (&dir_id)), 4) % (hint->end - hint->beg - 1);
+		if (candidate > hint->search_start)
+			hint->search_start = candidate;
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Unhashed variant of the old relocation: the directory id modulo the size
+ * of the search range, offset by beg, becomes search_start if it is larger
+ * than the current value.  Returns 0 without touching the hint for formatted
+ * nodes or when there is no inode, 1 otherwise.
+ */
+static inline int old_way (reiserfs_blocknr_hint_t * hint)
+{
+	if (!hint->formatted_node && hint->inode != NULL) {
+		b_blocknr_t candidate;
+
+		candidate = hint->beg + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - hint->beg);
+		if (candidate > hint->search_start)
+			hint->search_start = candidate;
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Hash the directory id of hint->key to one of 100 slices, each
+ * hint->end / 100 blocks long.  If search_start does not already lie inside
+ * the chosen slice, move it to the start of that slice.
+ */
+static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint)
+{
+	b_blocknr_t slice_start;
+
+	slice_start = (keyed_hash((char*)(&hint->key.k_dir_id),4) % 100) * (hint->end / 100);
+
+	/* inside the slice means slice_start <= search_start < slice_start + length */
+	if ( !(slice_start <= hint->search_start && hint->search_start < slice_start + (hint->end / 100)))
+		hint->search_start = slice_start;
+}
+
+/*
+ * Choose where the bitmap search for new blocks should start and store the
+ * result in hint->search_start; hint->beg and hint->end are (re)initialised
+ * here as well.
+ *
+ * The policies are tried in a fixed order and several of them return
+ * early, so the order of the checks below matters.  amount_needed is not
+ * used by this function.
+ */
+static void determine_search_start(reiserfs_blocknr_hint_t *hint,
+ int amount_needed)
+{
+ struct super_block *s = hint->th->t_super;
+ int unfm_hint;
+
+ /* default search range: the whole device */
+ hint->beg = 0;
+ hint->end = SB_BLOCK_COUNT(s) - 1;
+
+ /* This is former border algorithm. Now with tunable border offset */
+ if (concentrating_formatted_nodes(s))
+ set_border_in_hint(s, hint);
+
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+ /* whenever we create a new directory, we displace it. At first we will
+ hash for location, later we might look for a moderately empty place for
+ it */
+ if (displacing_new_packing_localities(s)
+ && hint->th->displace_new_blocks) {
+ displace_new_packing_locality(hint);
+
+ /* we do not continue determine_search_start,
+ * if new packing locality is being displaced */
+ return;
+ }
+#endif
+
+ /* all persons should feel encouraged to add more special cases here and
+ * test them */
+
+ if (displacing_large_files(s) && !hint->formatted_node
+ && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
+ displace_large_file(hint);
+ return;
+ }
+
+ /* formatted nodes are placed purely by hash when the
+ hashed_formatted_nodes option is set */
+ if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) {
+ hash_formatted_node(hint);
+ return;
+ }
+
+ /* if none of our special cases is relevant, use the left neighbor in the
+ tree order of the new node we are allocating for. unfm_hint is 1 only
+ when a non-hole entry of an indirect item supplied the start */
+ unfm_hint = get_left_neighbor(hint);
+
+ /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
+ new blocks are displaced based on directory ID. Also, if suggested search_start
+ is less than last preallocated block, we start searching from it, assuming that
+ HDD dataflow is faster in forward direction */
+ if ( TEST_OPTION(old_way, s)) {
+ if (!hint->formatted_node) {
+ if ( !reiserfs_hashed_relocation(s))
+ old_way(hint);
+ else if (!reiserfs_no_unhashed_relocation(s))
+ old_hashed_relocation(hint);
+
+ if ( hint->inode && hint->search_start < REISERFS_I(hint->inode)->i_prealloc_block)
+ hint->search_start = REISERFS_I(hint->inode)->i_prealloc_block;
+ }
+ return;
+ }
+
+ /* This is an approach proposed by Hans */
+ if ( TEST_OPTION(hundredth_slices, s) && ! (displacing_large_files(s) && !hint->formatted_node)) {
+ hundredth_slices(hint);
+ return;
+ }
+
+ /* The remaining policies do not return early: each enabled one may
+ overwrite search_start, so the last one that applies wins */
+
+ /* old_hashed_relocation only works on unformatted */
+ if (!unfm_hint && !hint->formatted_node &&
+ TEST_OPTION(old_hashed_relocation, s))
+ {
+ old_hashed_relocation(hint);
+ }
+ /* new_hashed_relocation works with both formatted/unformatted nodes */
+ if ((!unfm_hint || hint->formatted_node) &&
+ TEST_OPTION(new_hashed_relocation, s))
+ {
+ new_hashed_relocation(hint);
+ }
+ /* dirid grouping works only on unformatted nodes */
+ if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s))
+ {
+ dirid_groups(hint);
+ }
+
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+ /* with this build option dirid grouping is applied to formatted nodes too */
+ if (hint->formatted_node && TEST_OPTION(dirid_groups,s))
+ {
+ dirid_groups(hint);
+ }
+#endif
+
+ /* oid grouping works only on unformatted nodes */
+ if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s))
+ {
+ oid_groups(hint);
+ }
+ return;
+}
+
+/*
+ * Set hint->prealloc_size.  It stays 0 unless this is an unformatted-node
+ * request with preallocation enabled for a regular file whose size is at
+ * least preallocmin blocks; then it becomes preallocsize - 1.
+ * Always returns CARRY_ON.
+ *
+ * Ideas still to be tried: make the minimum size a mount option and benchmark
+ * both ways; benchmark preallocating always and see what happens.
+ */
+static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
+{
+	hint->prealloc_size = 0;
+
+	if (hint->formatted_node || !hint->preallocate)
+		return CARRY_ON;
+
+	/* we preallocate blocks only for regular files, specific size */
+	if (!S_ISREG(hint->inode->i_mode))
+		return CARRY_ON;
+
+	if (hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocmin * hint->inode->i_sb->s_blocksize)
+		hint->prealloc_size = REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocsize - 1;
+
+	return CARRY_ON;
+}
+
+/*
+ * Allocate up to amount_needed blocks from the range [start, finish] without
+ * wrapping around the end of the disk.  Block numbers are stored into
+ * new_blocknrs[]; any surplus that scan_bitmap handed out beyond what was
+ * still needed is recorded as the inode's preallocation window.
+ * Returns the number of entries written to new_blocknrs.
+ *
+ * XXX I know it could be merged with upper-level function;
+ * but may be result function would be too complex.
+ */
+static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint,
+						   b_blocknr_t * new_blocknrs,
+						   b_blocknr_t start, b_blocknr_t finish,
+						   int min,
+						   int amount_needed, int prealloc_size)
+{
+	int still_needed = amount_needed;
+
+	while (still_needed > 0 && start <= finish) {
+		int found = scan_bitmap (hint->th, &start, finish, min,
+					 still_needed + prealloc_size, !hint->formatted_node,
+					 hint->block);
+
+		/* no new blocks allocated, return */
+		if (!found)
+			break;
+
+		/* fill free_blocknrs array first */
+		for (; still_needed > 0 && found > 0; still_needed--, found--)
+			*new_blocknrs++ = start++;
+
+		/* nothing left over for the prealloc. array: keep scanning */
+		if (found <= 0)
+			continue;
+
+		/* it means prealloc_size was greater than 0 and we do preallocation */
+		list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
+			 &SB_JOURNAL(hint->th->t_super)->j_prealloc_list);
+		REISERFS_I(hint->inode)->i_prealloc_block = start;
+		REISERFS_I(hint->inode)->i_prealloc_count = found;
+		break;
+	}
+
+	return (amount_needed - still_needed);
+}
+
+/*
+ * Fill new_blocknrs[] with amount_needed block numbers, searching the bitmap
+ * in up to three passes around hint->search_start, and set up the inode's
+ * preallocation window when preallocation applies.
+ *
+ * For unformatted nodes quota is charged up front for amount_needed (and for
+ * prealloc_size when preallocating); whatever was charged but not obtained is
+ * given back before returning.
+ *
+ * Returns CARRY_ON on success, QUOTA_EXCEEDED if the quota charge for
+ * amount_needed fails, NO_DISK_SPACE if all passes fail to find enough
+ * blocks (blocks found so far are freed again).
+ */
+static inline int blocknrs_and_prealloc_arrays_from_search_start
+ (reiserfs_blocknr_hint_t *hint, b_blocknr_t *new_blocknrs, int amount_needed)
+{
+ struct super_block *s = hint->th->t_super;
+ b_blocknr_t start = hint->search_start;
+ b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
+ int passno = 0;
+ int nr_allocated = 0;
+ int bigalloc = 0;
+
+ determine_prealloc_size(hint);
+ if (!hint->formatted_node) {
+ int quota_ret;
+#ifdef REISERQUOTA_DEBUG
+ reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid);
+#endif
+ quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
+ if (quota_ret) /* Quota exceeded? */
+ return QUOTA_EXCEEDED;
+ if (hint->preallocate && hint->prealloc_size ) {
+#ifdef REISERQUOTA_DEBUG
+ reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid);
+#endif
+ quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size);
+ /* if the prealloc quota charge fails, carry on without preallocation */
+ if (quota_ret)
+ hint->preallocate=hint->prealloc_size=0;
+ }
+ /* for unformatted nodes, force large allocations */
+ bigalloc = amount_needed;
+ }
+
+ /* Each iteration picks the [start, finish] window for the current pass
+ * and then, in the loop condition, tries to allocate from it. The loop
+ * ends once amount_needed blocks have been collected. */
+ do {
+ /* in bigalloc mode, nr_allocated should stay zero until
+ * the entire allocation is filled
+ */
+ if (unlikely(bigalloc && nr_allocated)) {
+ reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n",
+ bigalloc, nr_allocated);
+ /* reset things to a sane value */
+ bigalloc = amount_needed - nr_allocated;
+ }
+ /*
+ * try pass 0 and pass 1 looking for a nice big
+ * contiguous allocation. Then reset and look
+ * for anything you can find.
+ */
+ if (passno == 2 && bigalloc) {
+ passno = 0;
+ bigalloc = 0;
+ }
+ switch (passno++) {
+ case 0: /* Search from hint->search_start to end of disk */
+ start = hint->search_start;
+ finish = SB_BLOCK_COUNT(s) - 1;
+ break;
+ case 1: /* Search from hint->beg to hint->search_start */
+ start = hint->beg;
+ finish = hint->search_start;
+ break;
+ case 2: /* Last chance: Search from 0 to hint->beg */
+ start = 0;
+ finish = hint->beg;
+ break;
+ default: /* We've tried searching everywhere, not enough space */
+ /* Free the blocks */
+ if (!hint->formatted_node) {
+#ifdef REISERQUOTA_DEBUG
+ reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
+#endif
+ DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */
+ }
+ /* give back every block collected so far, last one first */
+ while (nr_allocated --)
+ reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);
+
+ return NO_DISK_SPACE;
+ }
+ /* the minimum contiguous run requested is bigalloc while it is
+ * non-zero, otherwise a single block */
+ } while ((nr_allocated += allocate_without_wrapping_disk (hint,
+ new_blocknrs + nr_allocated, start, finish,
+ bigalloc ? bigalloc : 1,
+ amount_needed - nr_allocated,
+ hint->prealloc_size))
+ < amount_needed);
+ if ( !hint->formatted_node &&
+ amount_needed + hint->prealloc_size >
+ nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
+ /* Some of preallocation blocks were not allocated */
+#ifdef REISERQUOTA_DEBUG
+ reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (failed prealloc) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid);
+#endif
+ DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed +
+ hint->prealloc_size - nr_allocated -
+ REISERFS_I(hint->inode)->i_prealloc_count);
+ }
+
+ return CARRY_ON;
+}
+
+/*
+ * Take block numbers from the inode's preallocation window, if it has one,
+ * and store them in new_blocknrs[].  When the window runs empty the inode is
+ * unlinked from the prealloc list.
+ * Returns the amount still needed after using the preallocated blocks.
+ */
+static int use_preallocated_list_if_available (reiserfs_blocknr_hint_t *hint,
+						b_blocknr_t *new_blocknrs, int amount_needed)
+{
+	struct inode * inode = hint->inode;
+
+	/* nothing preallocated for this inode */
+	if (REISERFS_I(inode)->i_prealloc_count <= 0)
+		return amount_needed;
+
+	while (amount_needed) {
+		*new_blocknrs ++ = REISERFS_I(inode)->i_prealloc_block ++;
+		amount_needed --;
+
+		/* window exhausted: drop the inode from the prealloc list */
+		if (-- REISERFS_I(inode)->i_prealloc_count <= 0) {
+			list_del(&REISERFS_I(inode)->i_prealloc_list);
+			break;
+		}
+	}
+
+	return amount_needed;
+}
+
+/*
+ * Allocate amount_needed block numbers into new_blocknrs[].
+ *
+ * Blocks are taken from the inode's preallocation window first (unformatted
+ * nodes with preallocation only); the remainder comes from a bitmap search
+ * that starts at the position chosen by determine_search_start().
+ *
+ * Returns CARRY_ON on success, NO_DISK_SPACE when the free-space check fails,
+ * or whatever error the bitmap allocation step reports.
+ */
+int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
+			       b_blocknr_t * new_blocknrs, int amount_needed,
+			       int reserved_by_us /* Amount of blocks we have
+						      already reserved */)
+{
+	struct super_block *s = hint->th->t_super;
+	int taken_from_prealloc = 0;
+	int ret;
+
+	/* Check if there is enough space, taking into account reserved space */
+	if ( SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
+	     amount_needed - reserved_by_us)
+		return NO_DISK_SPACE;
+
+	/* should this be if !hint->inode && hint->preallocate? */
+	/* do you mean hint->formatted_node can be removed ? - Zam */
+	/* hint->formatted_node cannot be removed because we try to access
+	   inode information here, and there is often no inode associated with
+	   metadata allocations - green */
+	if (!hint->formatted_node && hint->preallocate) {
+		int still_needed = use_preallocated_list_if_available
+			(hint, new_blocknrs, amount_needed);
+
+		/* all blocknrs we need we got from prealloc. list */
+		if (still_needed == 0)
+			return CARRY_ON;
+
+		taken_from_prealloc = amount_needed - still_needed;
+		new_blocknrs += taken_from_prealloc;
+		amount_needed = still_needed;
+	}
+
+	/* find search start and save it in hint structure */
+	determine_search_start(hint, amount_needed);
+	if (hint->search_start >= SB_BLOCK_COUNT(s))
+		hint->search_start = SB_BLOCK_COUNT(s) - 1;
+
+	/* allocation itself; fill new_blocknrs and preallocation arrays */
+	ret = blocknrs_and_prealloc_arrays_from_search_start
+		(hint, new_blocknrs, amount_needed);
+
+	/* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we
+	 * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second
+	 * variant) */
+	if (ret != CARRY_ON) {
+		while (taken_from_prealloc -- > 0)
+			reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1);
+	}
+	return ret;
+}
+
+/* These 2 functions are here to provide blocks reservation to the rest of kernel */
+/*
+ * Reserve @blocks blocks in the fs pointed to by @sb by adding them to the
+ * reserved_blocks counter under bitmap_lock.  Caller must make sure this many
+ * blocks are actually available on the FS.
+ */
+void reiserfs_claim_blocks_to_be_allocated(
+				struct super_block *sb, /* super block of
+							   filesystem where
+							   blocks should be
+							   reserved */
+				int blocks /* How much to reserve */
+				)
+{
+	/* a zero reservation needs neither the lock nor the update */
+	if ( blocks ) {
+		spin_lock(&REISERFS_SB(sb)->bitmap_lock);
+		REISERFS_SB(sb)->reserved_blocks += blocks;
+		spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
+	}
+}
+
+/*
+ * Unreserve @blocks blocks in the fs pointed to by @sb by subtracting them
+ * from the reserved_blocks counter under bitmap_lock.  The RFALSE check
+ * afterwards fires if the counter has gone negative.
+ */
+void reiserfs_release_claimed_blocks(
+				struct super_block *sb, /* super block of
+							   filesystem where
+							   blocks should be
+							   reserved */
+				int blocks /* How much to unreserve */
+				)
+{
+	/* a zero unreservation needs neither the lock nor the check */
+	if ( blocks ) {
+		spin_lock(&REISERFS_SB(sb)->bitmap_lock);
+		REISERFS_SB(sb)->reserved_blocks -= blocks;
+		spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
+		RFALSE( REISERFS_SB(sb)->reserved_blocks < 0, "amount of blocks reserved became zero?");
+	}
+}
+
+/*
+ * Estimate how many pages can still be written to the FS: free blocks minus
+ * reserved blocks, scaled from blocks to pages.  Never returns a negative
+ * value.  Used for reiserfs_file_write() purposes for now.
+ */
+int reiserfs_can_fit_pages ( struct super_block *sb /* superblock of filesystem
+							to estimate space */ )
+{
+	int pages;
+
+	spin_lock(&REISERFS_SB(sb)->bitmap_lock);
+	pages = (SB_FREE_BLOCKS(sb) - REISERFS_SB(sb)->reserved_blocks) >> ( PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+	spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
+
+	if (pages > 0)
+		return pages;
+	return 0;
+}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
new file mode 100644
index 00000000000..d1514a9b051
--- /dev/null
+++ b/fs/reiserfs/dir.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/config.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/stat.h>
+#include <linux/smp_lock.h>
+#include <linux/buffer_head.h>
+#include <asm/uaccess.h>
+
+extern struct reiserfs_key MIN_KEY;
+
+static int reiserfs_readdir (struct file *, void *, filldir_t);
+static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) ;
+
+/* File operations table for reiserfs directories: readdir and fsync are
+ implemented in this file, read and ioctl are handled by generic_read_dir
+ and reiserfs_ioctl */
+struct file_operations reiserfs_dir_operations = {
+ .read = generic_read_dir,
+ .readdir = reiserfs_readdir,
+ .fsync = reiserfs_dir_fsync,
+ .ioctl = reiserfs_ioctl,
+};
+
+/*
+ * fsync for directories: run reiserfs_commit_for_inode() on the directory
+ * inode while holding the reiserfs write lock.  Returns the negative error
+ * from the commit, otherwise 0.  filp and datasync are not used.
+ */
+static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) {
+	struct inode *dir = dentry->d_inode;
+	int rc;
+
+	reiserfs_write_lock(dir->i_sb);
+	rc = reiserfs_commit_for_inode(dir) ;
+	reiserfs_write_unlock(dir->i_sb) ;
+
+	return (rc < 0) ? rc : 0;
+}
+
+
+#define store_ih(where,what) copy_item_head (where, what)
+
+/*
+ * readdir for reiserfs directories.
+ *
+ * Starting at filp->f_pos (or DOT_OFFSET when f_pos is 0) the directory
+ * items are searched by key and every visible entry is handed to filldir.
+ * The whole operation runs under the reiserfs write lock. Whenever the
+ * item may have moved in the meantime (item_moved), the search is restarted
+ * from the current position via the research label.
+ *
+ * Returns 0 normally (including when filldir stops the iteration), -EIO if
+ * the search reports IO_ERROR, -ENOMEM if the temporary name buffer cannot
+ * be allocated.
+ */
+static int reiserfs_readdir (struct file * filp, void * dirent, filldir_t filldir)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */
+ INITIALIZE_PATH (path_to_entry);
+ struct buffer_head * bh;
+ int item_num, entry_num;
+ const struct reiserfs_key * rkey;
+ struct item_head * ih, tmp_ih;
+ int search_res;
+ char * local_buf;
+ loff_t next_pos;
+ char small_buf[32] ; /* avoid kmalloc if we can */
+ struct reiserfs_dir_entry de;
+ int ret = 0;
+
+ reiserfs_write_lock(inode->i_sb);
+
+ reiserfs_check_lock_depth(inode->i_sb, "readdir") ;
+
+ /* form key for search the next directory entry using f_pos field of
+ file structure */
+ make_cpu_key (&pos_key, inode, (filp->f_pos) ? (filp->f_pos) : DOT_OFFSET,
+ TYPE_DIRENTRY, 3);
+ next_pos = cpu_key_k_offset (&pos_key);
+
+ /* reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos);*/
+
+ path_to_entry.reada = PATH_READA;
+ while (1) {
+ research:
+ /* search the directory item, containing entry with specified key */
+ search_res = search_by_entry_key (inode->i_sb, &pos_key, &path_to_entry, &de);
+ if (search_res == IO_ERROR) {
+ // FIXME: we could just skip part of directory which could
+ // not be read
+ ret = -EIO;
+ goto out;
+ }
+ entry_num = de.de_entry_num;
+ bh = de.de_bh;
+ item_num = de.de_item_num;
+ ih = de.de_ih;
+ /* keep a private copy of the item head; item_moved() is later
+ called with this copy to detect that the item has moved */
+ store_ih (&tmp_ih, ih);
+
+ /* we must have found item, that is item of this directory, */
+ RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key),
+ "vs-9000: found item %h does not match to dir we readdir %K",
+ ih, &pos_key);
+ RFALSE( item_num > B_NR_ITEMS (bh) - 1,
+ "vs-9005 item_num == %d, item amount == %d",
+ item_num, B_NR_ITEMS (bh));
+
+ /* and entry must be not more than number of entries in the item */
+ RFALSE( I_ENTRY_COUNT (ih) < entry_num,
+ "vs-9010: entry number is too big %d (%d)",
+ entry_num, I_ENTRY_COUNT (ih));
+
+ if (search_res == POSITION_FOUND || entry_num < I_ENTRY_COUNT (ih)) {
+ /* go through all entries in the directory item beginning from the entry, that has been found */
+ struct reiserfs_de_head * deh = B_I_DEH (bh, ih) + entry_num;
+
+ for (; entry_num < I_ENTRY_COUNT (ih); entry_num ++, deh ++) {
+ int d_reclen;
+ char * d_name;
+ off_t d_off;
+ ino_t d_ino;
+
+ if (!de_visible (deh))
+ /* it is hidden entry */
+ continue;
+ d_reclen = entry_length (bh, ih, entry_num);
+ d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh);
+ /* if the last byte of the entry is 0, take the length of the
+ name from strlen instead of the stored entry length */
+ if (!d_name[d_reclen - 1])
+ d_reclen = strlen (d_name);
+
+ if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){
+ /* too big to send back to VFS */
+ continue ;
+ }
+
+ /* Ignore the .reiserfs_priv entry */
+ if (reiserfs_xattrs (inode->i_sb) &&
+ !old_format_only(inode->i_sb) &&
+ filp->f_dentry == inode->i_sb->s_root &&
+ REISERFS_SB(inode->i_sb)->priv_root &&
+ REISERFS_SB(inode->i_sb)->priv_root->d_inode &&
+ deh_objectid(deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) {
+ continue;
+ }
+
+ d_off = deh_offset (deh);
+ filp->f_pos = d_off ;
+ d_ino = deh_objectid (deh);
+ /* names of up to 32 bytes fit in small_buf; longer names
+ need a temporary allocation */
+ if (d_reclen <= 32) {
+ local_buf = small_buf ;
+ } else {
+ local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ;
+ if (!local_buf) {
+ pathrelse (&path_to_entry);
+ ret = -ENOMEM ;
+ goto out;
+ }
+ /* re-check after the allocation and restart the search
+ if the item has moved */
+ if (item_moved (&tmp_ih, &path_to_entry)) {
+ reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
+ goto research;
+ }
+ }
+ // Note, that we copy name to user space via temporary
+ // buffer (local_buf) because filldir will block if
+ // user space buffer is swapped out. At that time
+ // entry can move to somewhere else
+ memcpy (local_buf, d_name, d_reclen);
+ if (filldir (dirent, local_buf, d_reclen, d_off, d_ino,
+ DT_UNKNOWN) < 0) {
+ if (local_buf != small_buf) {
+ reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
+ }
+ goto end;
+ }
+ if (local_buf != small_buf) {
+ reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
+ }
+
+ // next entry should be looked for with such offset
+ next_pos = deh_offset (deh) + 1;
+
+ if (item_moved (&tmp_ih, &path_to_entry)) {
+ goto research;
+ }
+ } /* for */
+ }
+
+ if (item_num != B_NR_ITEMS (bh) - 1)
+ // end of directory has been reached
+ goto end;
+
+ /* item we went through is last item of node. Using right
+ delimiting key check is it directory end */
+ rkey = get_rkey (&path_to_entry, inode->i_sb);
+ if (! comp_le_keys (rkey, &MIN_KEY)) {
+ /* set pos_key to key, that is the smallest and greater
+ than key of the last entry in the item */
+ set_cpu_key_k_offset (&pos_key, next_pos);
+ continue;
+ }
+
+ if ( COMP_SHORT_KEYS (rkey, &pos_key)) {
+ // end of directory has been reached
+ goto end;
+ }
+
+ /* directory continues in the right neighboring block */
+ set_cpu_key_k_offset (&pos_key, le_key_k_offset (KEY_FORMAT_3_5, rkey));
+
+ } /* while */
+
+
+ end:
+ /* remember where the next readdir call has to continue */
+ filp->f_pos = next_pos;
+ pathrelse (&path_to_entry);
+ reiserfs_check_path(&path_to_entry) ;
+ out:
+ reiserfs_write_unlock(inode->i_sb);
+ return ret;
+}
+
+/*
+ * Compose in @body a directory item holding only the "." and ".." entries,
+ * using the V1 layout in which entry names are not aligned to a 4 byte
+ * boundary.  @body must provide EMPTY_DIR_SIZE_V1 bytes.
+ * The last four params are LE and are stored as given.
+ */
+void make_empty_dir_item_v1 (char * body, __u32 dirid, __u32 objid,
+			     __u32 par_dirid, __u32 par_objid)
+{
+	struct reiserfs_de_head * dot;
+	struct reiserfs_de_head * dotdot;
+
+	memset (body, 0, EMPTY_DIR_SIZE_V1);
+	dot = (struct reiserfs_de_head *)body;
+	dotdot = dot + 1;
+
+	/* direntry header of ".": its name sits at the very end of the item */
+	put_deh_offset( dot, DOT_OFFSET );
+	/* these two are from make_le_item_head, and are LE */
+	dot->deh_dir_id = dirid;
+	dot->deh_objectid = objid;
+	dot->deh_state = 0; /* Endian safe if 0 */
+	put_deh_location( dot, EMPTY_DIR_SIZE_V1 - strlen( "." ));
+	mark_de_visible(dot);
+
+	/* direntry header of "..": its name sits directly before "." */
+	put_deh_offset( dotdot, DOT_DOT_OFFSET);
+	/* key of ".." for the root directory */
+	/* these two are from the inode, and are LE */
+	dotdot->deh_dir_id = par_dirid;
+	dotdot->deh_objectid = par_objid;
+	dotdot->deh_state = 0; /* Endian safe if 0 */
+	put_deh_location( dotdot, deh_location( dot ) - strlen( ".." ) );
+	mark_de_visible(dotdot);
+
+	/* copy ".." and "." */
+	memcpy (body + deh_location( dot ), ".", 1);
+	memcpy (body + deh_location( dotdot ), "..", 2);
+}
+
+/*
+ * Compose in @body a directory item holding only the "." and ".." entries.
+ * Entry name lengths are rounded with ROUND_UP when the name locations are
+ * computed.  @body must provide EMPTY_DIR_SIZE bytes.
+ * The id parameters are stored as given, without byte-order conversion.
+ */
+void make_empty_dir_item (char * body, __u32 dirid, __u32 objid,
+			  __u32 par_dirid, __u32 par_objid)
+{
+	struct reiserfs_de_head * dot;
+	struct reiserfs_de_head * dotdot;
+
+	memset (body, 0, EMPTY_DIR_SIZE);
+	dot = (struct reiserfs_de_head *)body;
+	dotdot = dot + 1;
+
+	/* direntry header of ".": its name slot is the last one in the item */
+	put_deh_offset( dot, DOT_OFFSET );
+	/* these two are from make_le_item_head, and are LE */
+	dot->deh_dir_id = dirid;
+	dot->deh_objectid = objid;
+	dot->deh_state = 0; /* Endian safe if 0 */
+	put_deh_location( dot, EMPTY_DIR_SIZE - ROUND_UP( strlen( "." ) ) );
+	mark_de_visible(dot);
+
+	/* direntry header of "..": its name slot sits directly before "." */
+	put_deh_offset( dotdot, DOT_DOT_OFFSET );
+	/* key of ".." for the root directory */
+	/* these two are from the inode, and are LE */
+	dotdot->deh_dir_id = par_dirid;
+	dotdot->deh_objectid = par_objid;
+	dotdot->deh_state = 0; /* Endian safe if 0 */
+	put_deh_location( dotdot, deh_location( dot ) - ROUND_UP( strlen( ".." ) ) );
+	mark_de_visible(dotdot);
+
+	/* copy ".." and "." */
+	memcpy (body + deh_location( dot ), ".", 1);
+	memcpy (body + deh_location( dotdot ), "..", 2);
+}
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
new file mode 100644
index 00000000000..2118db2896c
--- /dev/null
+++ b/fs/reiserfs/do_balan.c
@@ -0,0 +1,1597 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+/* Now we have all buffers that must be used in balancing of the tree */
+/* Further calculations can not cause schedule(), and thus the buffer */
+/* tree will be stable until the balancing will be finished */
+/* balance the tree according to the analysis made before, */
+/* and using buffers obtained after all above. */
+
+
+/**
+ ** balance_leaf_when_delete
+ ** balance_leaf
+ ** do_balance
+ **
+ **/
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <linux/time.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/buffer_head.h>
+
+#ifdef CONFIG_REISERFS_CHECK
+
+struct tree_balance * cur_tb = NULL; /* detects whether more than one
+ copy of tb exists as a means
+ of checking whether schedule
+ is interrupting do_balance */
+#endif
+
+/* Mark buffer bh dirty in the journal, using the transaction handle stored
+ in tb and that handle's super block. The flag argument is not used. */
+inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
+ struct buffer_head * bh, int flag)
+{
+ journal_mark_dirty(tb->transaction_handle,
+ tb->transaction_handle->t_super, bh) ;
+}
+
+#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
+#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
+
+
+/* summary:
+ if deleting something ( tb->insert_size[0] < 0 )
+ return(balance_leaf_when_delete()); (flag d handled here)
+ else
+ if lnum is larger than 0 we put items into the left node
+ if rnum is larger than 0 we put items into the right node
+ if snum1 is larger than 0 we put items into the new node s1
+ if snum2 is larger than 0 we put items into the new node s2
+Note that all *num* count new items being created.
+
+It would be easier to read balance_leaf() if each of these summary
+lines was a separate procedure rather than being inlined. I think
+that there are many passages here and in balance_leaf_when_delete() in
+which two calls to one procedure can replace two passages, and it
+might save cache space and improve software maintenance costs to do so.
+
+Vladimir made the perceptive comment that we should offload most of
+the decision making in this function into fix_nodes/check_balance, and
+then create some sort of structure in tb that says what actions should
+be performed by do_balance.
+
+-Hans */
+
+
+
+/* Balance leaf node in case of delete or cut: insert_size[0] < 0
+ *
+ * lnum, rnum can have values >= -1
+ * -1 means that the neighbor must be joined with S
+ * 0 means that nothing should be done with the neighbor
+ * >0 means to shift entirely or partly the specified number of items to the neighbor
+ *
+ * flag must be M_DELETE or M_CUT; any other mode ends in reiserfs_panic.
+ * The item is first deleted/cut in S[0], then S[0] is merged into or split
+ * between its neighbors as lnum[0]/rnum[0] dictate.
+ *
+ * Returns -1 when the contents of L[0], S[0] and R[0] were all moved into
+ * R[0], and 0 in every other case.
+ */
+static int balance_leaf_when_delete (struct tree_balance * tb, int flag)
+{
+ struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path);
+ int item_pos = PATH_LAST_POSITION (tb->tb_path);
+ int pos_in_item = tb->tb_path->pos_in_item;
+ struct buffer_info bi;
+ int n;
+ struct item_head * ih;
+
+ RFALSE( tb->FR[0] && B_LEVEL (tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
+ "vs- 12000: level: wrong FR %z", tb->FR[0]);
+ RFALSE( tb->blknum[0] > 1,
+ "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
+ RFALSE( ! tb->blknum[0] && ! PATH_H_PPARENT(tb->tb_path, 0),
+ "PAP-12010: tree can not be empty");
+
+ ih = B_N_PITEM_HEAD (tbS0, item_pos);
+
+ /* Delete or truncate the item */
+
+ switch (flag) {
+ case M_DELETE: /* delete item in S[0] */
+
+ RFALSE( ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
+ "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
+ -tb->insert_size [0], ih);
+
+ bi.tb = tb;
+ bi.bi_bh = tbS0;
+ bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
+ bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
+ leaf_delete_items (&bi, 0, item_pos, 1, -1);
+
+ /* the first item of S[0] was deleted: the left delimiting key
+ has to be replaced */
+ if ( ! item_pos && tb->CFL[0] ) {
+ if ( B_NR_ITEMS(tbS0) ) {
+ replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0);
+ }
+ else {
+ /* S[0] is empty now; the key is taken from the parent,
+ but only when S[0] is at position 0 in it */
+ if ( ! PATH_H_POSITION (tb->tb_path, 1) )
+ replace_key(tb, tb->CFL[0],tb->lkey[0],PATH_H_PPARENT(tb->tb_path, 0),0);
+ }
+ }
+
+ RFALSE( ! item_pos && !tb->CFL[0],
+ "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], tb->L[0]);
+
+ break;
+
+ case M_CUT: { /* cut item in S[0] */
+ bi.tb = tb;
+ bi.bi_bh = tbS0;
+ bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
+ bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
+ if (is_direntry_le_ih (ih)) {
+
+ /* UFS unlink semantics are such that you can only delete one directory entry at a time. */
+ /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */
+ tb->insert_size[0] = -1;
+ leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]);
+
+ RFALSE( ! item_pos && ! pos_in_item && ! tb->CFL[0],
+ "PAP-12030: can not change delimiting key. CFL[0]=%p",
+ tb->CFL[0]);
+
+ /* the first entry of the first item was cut: replace the
+ left delimiting key */
+ if ( ! item_pos && ! pos_in_item && tb->CFL[0] ) {
+ replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0);
+ }
+ } else {
+ leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]);
+
+ RFALSE( ! ih_item_len(ih),
+ "PAP-12035: cut must leave non-zero dynamic length of item");
+ }
+ break;
+ }
+
+ default:
+ print_cur_tb ("12040");
+ reiserfs_panic (tb->tb_sb, "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)",
+ (flag == M_PASTE) ? "PASTE" : ((flag == M_INSERT) ? "INSERT" : "UNKNOWN"), flag);
+ }
+
+ /* the rule is that no shifting occurs unless by shifting a node can be freed */
+ n = B_NR_ITEMS(tbS0);
+ if ( tb->lnum[0] ) /* L[0] takes part in balancing */
+ {
+ if ( tb->lnum[0] == -1 ) /* L[0] must be joined with S[0] */
+ {
+ if ( tb->rnum[0] == -1 ) /* R[0] must be also joined with S[0] */
+ {
+ /* S[0] and R[0] have the same parent: merge everything into L[0] */
+ if ( tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0) )
+ {
+ /* all contents of all the 3 buffers will be in L[0] */
+ if ( PATH_H_POSITION (tb->tb_path, 1) == 0 && 1 < B_NR_ITEMS(tb->FR[0]) )
+ replace_key(tb, tb->CFL[0],tb->lkey[0],tb->FR[0],1);
+
+ leaf_move_items (LEAF_FROM_S_TO_L, tb, n, -1, NULL);
+ leaf_move_items (LEAF_FROM_R_TO_L, tb, B_NR_ITEMS(tb->R[0]), -1, NULL);
+
+ reiserfs_invalidate_buffer (tb, tbS0);
+ reiserfs_invalidate_buffer (tb, tb->R[0]);
+
+ return 0;
+ }
+ /* all contents of all the 3 buffers will be in R[0] */
+ leaf_move_items (LEAF_FROM_S_TO_R, tb, n, -1, NULL);
+ leaf_move_items (LEAF_FROM_L_TO_R, tb, B_NR_ITEMS(tb->L[0]), -1, NULL);
+
+ /* right_delimiting_key is correct in R[0] */
+ replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
+
+ reiserfs_invalidate_buffer (tb, tbS0);
+ reiserfs_invalidate_buffer (tb, tb->L[0]);
+
+ /* the only path that returns -1 */
+ return -1;
+ }
+
+ RFALSE( tb->rnum[0] != 0,
+ "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
+ /* all contents of L[0] and S[0] will be in L[0] */
+ leaf_shift_left(tb, n, -1);
+
+ reiserfs_invalidate_buffer (tb, tbS0);
+
+ return 0;
+ }
+ /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */
+
+ RFALSE( ( tb->lnum[0] + tb->rnum[0] < n ) ||
+ ( tb->lnum[0] + tb->rnum[0] > n+1 ),
+ "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent",
+ tb->rnum[0], tb->lnum[0], n);
+ RFALSE( ( tb->lnum[0] + tb->rnum[0] == n ) &&
+ (tb->lbytes != -1 || tb->rbytes != -1),
+ "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split",
+ tb->rbytes, tb->lbytes);
+ RFALSE( ( tb->lnum[0] + tb->rnum[0] == n + 1 ) &&
+ (tb->lbytes < 1 || tb->rbytes != -1),
+ "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split",
+ tb->rbytes, tb->lbytes);
+
+ leaf_shift_left (tb, tb->lnum[0], tb->lbytes);
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+ reiserfs_invalidate_buffer (tb, tbS0);
+
+ return 0;
+ }
+
+ /* L[0] does not take part; S[0] may still have to be joined with R[0] */
+ if ( tb->rnum[0] == -1 ) {
+ /* all contents of R[0] and S[0] will be in R[0] */
+ leaf_shift_right(tb, n, -1);
+ reiserfs_invalidate_buffer (tb, tbS0);
+ return 0;
+ }
+
+ /* neither neighbor takes part: nothing is shifted */
+ RFALSE( tb->rnum[0],
+ "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
+ return 0;
+}
+
+
+/*
+ * balance_leaf - carry out an insert or paste on the leaf level.
+ *
+ * If tb->insert_size[0] is negative the whole job is delegated to
+ * balance_leaf_when_delete() and its return value is passed through.
+ * Otherwise the function proceeds in four stages and returns 0:
+ *
+ *  1. if tb->lnum[0] > 0, shift items to the left neighbor L[0]; when the
+ *     affected item (or a part of it) falls into L[0] the new item / pasted
+ *     bytes are put there as well;
+ *  2. if tb->rnum[0] > 0, do the same towards the right neighbor R[0];
+ *  3. if tb->blknum[0] == 0, S[0] is invalidated and the function returns;
+ *     otherwise up to two new nodes are taken with get_FEB() and filled
+ *     from S[0]; their first keys and buffers are stored in insert_key[]
+ *     and insert_ptr[] for the next level;
+ *  4. whatever is left of the new item / pasted bytes goes into S[0].
+ *
+ * Only M_INSERT and M_PASTE are handled while shifting; any other flag ends
+ * in reiserfs_panic().  ih is modified temporarily when an inserted item is
+ * split between two nodes.
+ */
+static int balance_leaf (struct tree_balance * tb,
+ struct item_head * ih, /* item header of inserted item (this is on little endian) */
+ const char * body, /* body of inserted item or bytes to paste */
+ int flag, /* i - insert, d - delete, c - cut, p - paste
+ (see comment to do_balance) */
+ struct item_head * insert_key, /* in our processing of one level we sometimes determine what
+ must be inserted into the next higher level. This insertion
+ consists of a key or two keys and their corresponding
+ pointers */
+ struct buffer_head ** insert_ptr /* inserted node-ptrs for the next level */
+ )
+{
+ struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path);
+ int item_pos = PATH_LAST_POSITION (tb->tb_path); /* index into the array of item headers in S[0]
+ of the affected item */
+ struct buffer_info bi;
+ struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */
+ int snum[2]; /* number of items that will be placed
+ into S_new (includes partially shifted
+ items) */
+ int sbytes[2]; /* if an item is partially shifted into S_new then
+ if it is a directory item
+ it is the number of entries from the item that are shifted into S_new
+ else
+ it is the number of bytes from the item that are shifted into S_new
+ */
+ int n, i;
+ int ret_val;
+ int pos_in_item;
+ int zeros_num;
+
+ PROC_INFO_INC( tb -> tb_sb, balance_at[ 0 ] );
+
+ /* Make balance in case insert_size[0] < 0 */
+ if ( tb->insert_size[0] < 0 )
+ return balance_leaf_when_delete (tb, flag);
+
+ /* an insert without a body means the new item consists of zeros only */
+ zeros_num = 0;
+ if (flag == M_INSERT && body == 0)
+ zeros_num = ih_item_len( ih );
+
+ pos_in_item = tb->tb_path->pos_in_item;
+ /* for indirect item pos_in_item is measured in unformatted node
+ pointers. Recalculate to bytes */
+ if (flag != M_INSERT && is_indirect_le_ih (B_N_PITEM_HEAD (tbS0, item_pos)))
+ pos_in_item *= UNFM_P_SIZE;
+
+ if ( tb->lnum[0] > 0 ) {
+ /* Shift lnum[0] items from S[0] to the left neighbor L[0] */
+ if ( item_pos < tb->lnum[0] ) {
+ /* new item or its part falls to L[0], shift it too */
+ n = B_NR_ITEMS(tb->L[0]);
+
+ switch (flag) {
+ case M_INSERT: /* insert item into L[0] */
+
+ if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) {
+ /* part of new item falls into L[0] */
+ int new_item_len;
+ int version;
+
+ ret_val = leaf_shift_left (tb, tb->lnum[0]-1, -1);
+
+ /* Calculate item length to insert to S[0] */
+ new_item_len = ih_item_len(ih) - tb->lbytes;
+ /* Calculate and check item length to insert to L[0] */
+ put_ih_item_len(ih, ih_item_len(ih) - new_item_len );
+
+ RFALSE( ih_item_len(ih) <= 0,
+ "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
+ ih_item_len(ih));
+
+ /* Insert new item into L[0] */
+ bi.tb = tb;
+ bi.bi_bh = tb->L[0];
+ bi.bi_parent = tb->FL[0];
+ bi.bi_position = get_left_neighbor_position (tb, 0);
+ leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body,
+ zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num);
+
+ version = ih_version (ih);
+
+ /* Calculate key component, item length and body to insert into S[0] */
+ set_le_ih_k_offset( ih, le_ih_k_offset( ih ) + (tb->lbytes << (is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) );
+
+ put_ih_item_len( ih, new_item_len );
+ if ( tb->lbytes > zeros_num ) {
+ body += (tb->lbytes - zeros_num);
+ zeros_num = 0;
+ }
+ else
+ zeros_num -= tb->lbytes;
+
+ RFALSE( ih_item_len(ih) <= 0,
+ "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d",
+ ih_item_len(ih));
+ } else {
+ /* new item in whole falls into L[0] */
+ /* Shift lnum[0]-1 items to L[0] */
+ ret_val = leaf_shift_left(tb, tb->lnum[0]-1, tb->lbytes);
+ /* Insert new item into L[0] */
+ bi.tb = tb;
+ bi.bi_bh = tb->L[0];
+ bi.bi_parent = tb->FL[0];
+ bi.bi_position = get_left_neighbor_position (tb, 0);
+ leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, zeros_num);
+ tb->insert_size[0] = 0;
+ zeros_num = 0;
+ }
+ break;
+
+ case M_PASTE: /* append item in L[0] */
+
+ if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) {
+ /* we must shift the part of the appended item */
+ if ( is_direntry_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) {
+
+ RFALSE( zeros_num,
+ "PAP-12090: invalid parameter in case of a directory");
+ /* directory item */
+ if ( tb->lbytes > pos_in_item ) {
+ /* new directory entry falls into L[0] */
+ struct item_head * pasted;
+ int l_pos_in_item = pos_in_item;
+
+ /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */
+ ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
+ if ( ret_val && ! item_pos ) {
+ pasted = B_N_PITEM_HEAD(tb->L[0],B_NR_ITEMS(tb->L[0])-1);
+ l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes-1);
+ }
+
+ /* Append given directory entry to directory item */
+ bi.tb = tb;
+ bi.bi_bh = tb->L[0];
+ bi.bi_parent = tb->FL[0];
+ bi.bi_position = get_left_neighbor_position (tb, 0);
+ leaf_paste_in_buffer (&bi, n + item_pos - ret_val, l_pos_in_item,
+ tb->insert_size[0], body, zeros_num);
+
+ /* previous string prepared space for pasting new entry, following string pastes this entry */
+
+ /* when we have merge directory item, pos_in_item has been changed too */
+
+ /* paste new directory entry. 1 is entry number */
+ leaf_paste_entries (bi.bi_bh, n + item_pos - ret_val, l_pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]
+ );
+ tb->insert_size[0] = 0;
+ } else {
+ /* new directory item doesn't fall into L[0] */
+ /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */
+ leaf_shift_left (tb, tb->lnum[0], tb->lbytes);
+ }
+ /* Calculate new position to append in item body */
+ pos_in_item -= tb->lbytes;
+ }
+ else {
+ /* regular object */
+ RFALSE( tb->lbytes <= 0,
+ "PAP-12095: there is nothing to shift to L[0]. lbytes=%d",
+ tb->lbytes);
+ RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),
+ "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d",
+ ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)), pos_in_item);
+
+ if ( tb->lbytes >= pos_in_item ) {
+ /* appended item will be in L[0] in whole */
+ int l_n;
+
+ /* this bytes number must be appended to the last item of L[h] */
+ l_n = tb->lbytes - pos_in_item;
+
+ /* Calculate new insert_size[0] */
+ tb->insert_size[0] -= l_n;
+
+ RFALSE( tb->insert_size[0] <= 0,
+ "PAP-12105: there is nothing to paste into L[0]. insert_size=%d",
+ tb->insert_size[0]);
+ ret_val = leaf_shift_left(tb,tb->lnum[0],
+ ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)));
+ /* Append to body of item in L[0] */
+ bi.tb = tb;
+ bi.bi_bh = tb->L[0];
+ bi.bi_parent = tb->FL[0];
+ bi.bi_position = get_left_neighbor_position (tb, 0);
+ leaf_paste_in_buffer(
+ &bi,n + item_pos - ret_val,
+ ih_item_len( B_N_PITEM_HEAD(tb->L[0],n+item_pos-ret_val)),
+ l_n,body, zeros_num > l_n ? l_n : zeros_num
+ );
+ /* 0-th item in S0 can be only of DIRECT type when l_n != 0*/
+ {
+ int version;
+ int temp_l = l_n;
+
+ RFALSE (ih_item_len (B_N_PITEM_HEAD (tbS0, 0)),
+ "PAP-12106: item length must be 0");
+ RFALSE (comp_short_le_keys (B_N_PKEY (tbS0, 0),
+ B_N_PKEY (tb->L[0],
+ n + item_pos - ret_val)),
+ "PAP-12107: items must be of the same file");
+ if (is_indirect_le_ih(B_N_PITEM_HEAD (tb->L[0],
+ n + item_pos - ret_val))) {
+ temp_l = l_n << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT);
+ }
+ /* update key of first item in S0 */
+ version = ih_version (B_N_PITEM_HEAD (tbS0, 0));
+ set_le_key_k_offset (version, B_N_PKEY (tbS0, 0),
+ le_key_k_offset (version, B_N_PKEY (tbS0, 0)) + temp_l);
+ /* update left delimiting key */
+ set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]),
+ le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0])) + temp_l);
+ }
+
+ /* Calculate new body, position in item and insert_size[0] */
+ if ( l_n > zeros_num ) {
+ body += (l_n - zeros_num);
+ zeros_num = 0;
+ }
+ else
+ zeros_num -= l_n;
+ pos_in_item = 0;
+
+ RFALSE( comp_short_le_keys
+ (B_N_PKEY(tbS0,0),
+ B_N_PKEY(tb->L[0],B_NR_ITEMS(tb->L[0])-1)) ||
+
+ !op_is_left_mergeable
+ (B_N_PKEY (tbS0, 0), tbS0->b_size) ||
+ !op_is_left_mergeable
+ (B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]),
+ tbS0->b_size),
+ "PAP-12120: item must be merge-able with left neighboring item");
+ }
+ else /* only part of the appended item will be in L[0] */
+ {
+ /* Calculate position in item for append in S[0] */
+ pos_in_item -= tb->lbytes;
+
+ RFALSE( pos_in_item <= 0,
+ "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item);
+
+ /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
+ leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
+ }
+ }
+ }
+ else /* appended item will be in L[0] in whole */
+ {
+ struct item_head * pasted;
+
+ if ( ! item_pos && op_is_left_mergeable (B_N_PKEY (tbS0, 0), tbS0->b_size) )
+ { /* if we paste into first item of S[0] and it is left mergeable */
+ /* then increment pos_in_item by the size of the last item in L[0] */
+ pasted = B_N_PITEM_HEAD(tb->L[0],n-1);
+ if ( is_direntry_le_ih (pasted) )
+ pos_in_item += ih_entry_count(pasted);
+ else
+ pos_in_item += ih_item_len(pasted);
+ }
+
+ /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
+ ret_val = leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
+ /* Append to body of item in L[0] */
+ bi.tb = tb;
+ bi.bi_bh = tb->L[0];
+ bi.bi_parent = tb->FL[0];
+ bi.bi_position = get_left_neighbor_position (tb, 0);
+ leaf_paste_in_buffer (&bi, n + item_pos - ret_val, pos_in_item, tb->insert_size[0],
+ body, zeros_num);
+
+ /* if appended item is directory, paste entry */
+ pasted = B_N_PITEM_HEAD (tb->L[0], n + item_pos - ret_val);
+ if (is_direntry_le_ih (pasted))
+ leaf_paste_entries (
+ bi.bi_bh, n + item_pos - ret_val, pos_in_item, 1,
+ (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
+ );
+ /* if appended item is indirect item, put unformatted node into un list */
+ if (is_indirect_le_ih (pasted))
+ set_ih_free_space (pasted, 0);
+ tb->insert_size[0] = 0;
+ zeros_num = 0;
+ }
+ break;
+ default: /* cases d and c */
+ reiserfs_panic (tb->tb_sb, "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)",
+ (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
+ }
+ } else {
+ /* new item doesn't fall into L[0] */
+ leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
+ }
+ } /* tb->lnum[0] > 0 */
+
+ /* Calculate new item position */
+ item_pos -= ( tb->lnum[0] - (( tb->lbytes != -1 ) ? 1 : 0));
+
+ if ( tb->rnum[0] > 0 ) {
+ /* shift rnum[0] items from S[0] to the right neighbor R[0] */
+ n = B_NR_ITEMS(tbS0);
+ switch ( flag ) {
+
+ case M_INSERT: /* insert item */
+ if ( n - tb->rnum[0] < item_pos )
+ { /* new item or its part falls to R[0] */
+ if ( item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1 )
+ { /* part of new item falls into R[0] */
+ loff_t old_key_comp, old_len, r_zeros_number;
+ const char * r_body;
+ int version;
+ loff_t offset;
+
+ leaf_shift_right(tb,tb->rnum[0]-1,-1);
+
+ version = ih_version(ih);
+ /* Remember key component and item length */
+ old_key_comp = le_ih_k_offset( ih );
+ old_len = ih_item_len(ih);
+
+ /* Calculate key component and item length to insert into R[0] */
+ offset = le_ih_k_offset( ih ) + ((old_len - tb->rbytes )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0));
+ set_le_ih_k_offset( ih, offset );
+ put_ih_item_len( ih, tb->rbytes);
+ /* Insert part of the item into R[0] */
+ bi.tb = tb;
+ bi.bi_bh = tb->R[0];
+ bi.bi_parent = tb->FR[0];
+ bi.bi_position = get_right_neighbor_position (tb, 0);
+ if ( (old_len - tb->rbytes) > zeros_num ) {
+ r_zeros_number = 0;
+ r_body = body + (old_len - tb->rbytes) - zeros_num;
+ }
+ else {
+ r_body = body;
+ r_zeros_number = zeros_num - (old_len - tb->rbytes);
+ zeros_num -= r_zeros_number;
+ }
+
+ leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number);
+
+ /* Replace right delimiting key by first key in R[0] */
+ replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
+
+ /* Calculate key component and item length to insert into S[0] */
+ set_le_ih_k_offset( ih, old_key_comp );
+ put_ih_item_len( ih, old_len - tb->rbytes );
+
+ tb->insert_size[0] -= tb->rbytes;
+
+ }
+ else /* whole new item falls into R[0] */
+ {
+ /* Shift rnum[0]-1 items to R[0] */
+ ret_val = leaf_shift_right(tb,tb->rnum[0]-1,tb->rbytes);
+ /* Insert new item into R[0] */
+ bi.tb = tb;
+ bi.bi_bh = tb->R[0];
+ bi.bi_parent = tb->FR[0];
+ bi.bi_position = get_right_neighbor_position (tb, 0);
+ leaf_insert_into_buf (&bi, item_pos - n + tb->rnum[0] - 1, ih, body, zeros_num);
+
+ if ( item_pos - n + tb->rnum[0] - 1 == 0 ) {
+ replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
+
+ }
+ zeros_num = tb->insert_size[0] = 0;
+ }
+ }
+ else /* new item or part of it doesn't fall into R[0] */
+ {
+ leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
+ }
+ break;
+
+ case M_PASTE: /* append item */
+
+ if ( n - tb->rnum[0] <= item_pos ) /* pasted item or part of it falls to R[0] */
+ {
+ if ( item_pos == n - tb->rnum[0] && tb->rbytes != -1 )
+ { /* we must shift the part of the appended item */
+ if ( is_direntry_le_ih (B_N_PITEM_HEAD(tbS0, item_pos)))
+ { /* we append to directory item */
+ int entry_count;
+
+ RFALSE( zeros_num,
+ "PAP-12145: invalid parameter in case of a directory");
+ entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD(tbS0, item_pos));
+ if ( entry_count - tb->rbytes < pos_in_item )
+ /* new directory entry falls into R[0] */
+ {
+ int paste_entry_position;
+
+ RFALSE( tb->rbytes - 1 >= entry_count ||
+ ! tb->insert_size[0],
+ "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d",
+ tb->rbytes, entry_count);
+ /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */
+ leaf_shift_right(tb,tb->rnum[0],tb->rbytes - 1);
+ /* Paste given directory entry to directory item */
+ paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1;
+ bi.tb = tb;
+ bi.bi_bh = tb->R[0];
+ bi.bi_parent = tb->FR[0];
+ bi.bi_position = get_right_neighbor_position (tb, 0);
+ leaf_paste_in_buffer (&bi, 0, paste_entry_position,
+ tb->insert_size[0],body,zeros_num);
+ /* paste entry */
+ leaf_paste_entries (
+ bi.bi_bh, 0, paste_entry_position, 1, (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]
+ );
+
+ if ( paste_entry_position == 0 ) {
+ /* change delimiting keys */
+ replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
+ }
+
+ tb->insert_size[0] = 0;
+ pos_in_item++;
+ }
+ else /* new directory entry doesn't fall into R[0] */
+ {
+ leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
+ }
+ }
+ else /* regular object */
+ {
+ int n_shift, n_rem, r_zeros_number;
+ const char * r_body;
+
+ /* Calculate number of bytes which must be shifted from appended item */
+ if ( (n_shift = tb->rbytes - tb->insert_size[0]) < 0 )
+ n_shift = 0;
+
+ /* arguments follow the order named in the format string: item length first, then position */
+ RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD (tbS0, item_pos)),
+ "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d",
+ ih_item_len( B_N_PITEM_HEAD(tbS0,item_pos)), pos_in_item);
+
+ leaf_shift_right(tb,tb->rnum[0],n_shift);
+ /* Calculate number of bytes which must remain in body after appending to R[0] */
+ if ( (n_rem = tb->insert_size[0] - tb->rbytes) < 0 )
+ n_rem = 0;
+
+ {
+ int version;
+ unsigned long temp_rem = n_rem;
+
+ version = ih_version (B_N_PITEM_HEAD (tb->R[0],0));
+ if (is_indirect_le_key(version,B_N_PKEY(tb->R[0],0))){
+ temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits -
+ UNFM_P_SHIFT);
+ }
+ set_le_key_k_offset (version, B_N_PKEY(tb->R[0],0),
+ le_key_k_offset (version, B_N_PKEY(tb->R[0],0)) + temp_rem);
+ set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0]),
+ le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) + temp_rem);
+ }
+/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem;
+ k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/
+ do_balance_mark_internal_dirty (tb, tb->CFR[0], 0);
+
+ /* Append part of body into R[0] */
+ bi.tb = tb;
+ bi.bi_bh = tb->R[0];
+ bi.bi_parent = tb->FR[0];
+ bi.bi_position = get_right_neighbor_position (tb, 0);
+ if ( n_rem > zeros_num ) {
+ r_zeros_number = 0;
+ r_body = body + n_rem - zeros_num;
+ }
+ else {
+ r_body = body;
+ r_zeros_number = zeros_num - n_rem;
+ zeros_num -= r_zeros_number;
+ }
+
+ leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, r_body, r_zeros_number);
+
+ if (is_indirect_le_ih (B_N_PITEM_HEAD(tb->R[0],0))) {
+#if 0
+ RFALSE( n_rem,
+ "PAP-12160: paste more than one unformatted node pointer");
+#endif
+ set_ih_free_space (B_N_PITEM_HEAD(tb->R[0],0), 0);
+ }
+ tb->insert_size[0] = n_rem;
+ if ( ! n_rem )
+ pos_in_item ++;
+ }
+ }
+ else /* pasted item in whole falls into R[0] */
+ {
+ struct item_head * pasted;
+
+ ret_val = leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
+ /* append item in R[0] */
+ if ( pos_in_item >= 0 ) {
+ bi.tb = tb;
+ bi.bi_bh = tb->R[0];
+ bi.bi_parent = tb->FR[0];
+ bi.bi_position = get_right_neighbor_position (tb, 0);
+ leaf_paste_in_buffer(&bi,item_pos - n + tb->rnum[0], pos_in_item,
+ tb->insert_size[0],body, zeros_num);
+ }
+
+ /* paste new entry, if item is directory item */
+ pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]);
+ if (is_direntry_le_ih (pasted) && pos_in_item >= 0 ) {
+ leaf_paste_entries (
+ bi.bi_bh, item_pos - n + tb->rnum[0], pos_in_item, 1,
+ (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
+ );
+ if ( ! pos_in_item ) {
+
+ RFALSE( item_pos - n + tb->rnum[0],
+ "PAP-12165: directory item must be first item of node when pasting is in 0th position");
+
+ /* update delimiting keys */
+ replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
+ }
+ }
+
+ if (is_indirect_le_ih (pasted))
+ set_ih_free_space (pasted, 0);
+ zeros_num = tb->insert_size[0] = 0;
+ }
+ }
+ else /* new item doesn't fall into R[0] */
+ {
+ leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
+ }
+ break;
+ default: /* cases d and c */
+ reiserfs_panic (tb->tb_sb, "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)",
+ (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
+ }
+
+ } /* tb->rnum[0] > 0 */
+
+
+ RFALSE( tb->blknum[0] > 3,
+ "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
+ RFALSE( tb->blknum[0] < 0,
+ "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
+
+ /* if while adding to a node we discover that it is possible to split
+ it in two, and merge the left part into the left neighbor and the
+ right part into the right neighbor, eliminating the node */
+ if ( tb->blknum[0] == 0 ) { /* node S[0] is empty now */
+
+ RFALSE( ! tb->lnum[0] || ! tb->rnum[0],
+ "PAP-12190: lnum and rnum must not be zero");
+ /* if insertion was done before 0-th position in R[0], right
+ delimiting key of the tb->L[0]'s and left delimiting key are
+ not set correctly */
+ if (tb->CFL[0]) {
+ if (!tb->CFR[0])
+ reiserfs_panic (tb->tb_sb, "vs-12195: balance_leaf: CFR not initialized");
+ copy_key (B_N_PDELIM_KEY (tb->CFL[0], tb->lkey[0]), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]));
+ do_balance_mark_internal_dirty (tb, tb->CFL[0], 0);
+ }
+
+ reiserfs_invalidate_buffer(tb,tbS0);
+ return 0;
+ }
+
+
+ /* Fill new nodes that appear in place of S[0] */
+
+ /* I am told that this copying is because we need an array to enable
+ the looping code. -Hans */
+ snum[0] = tb->s1num;
+ snum[1] = tb->s2num;
+ sbytes[0] = tb->s1bytes;
+ sbytes[1] = tb->s2bytes;
+ for( i = tb->blknum[0] - 2; i >= 0; i-- ) {
+
+ RFALSE( !snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, snum[i]);
+
+ /* here we shift from S to S_new nodes */
+
+ S_new[i] = get_FEB(tb);
+
+ /* initialized block type and tree level */
+ set_blkh_level( B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL );
+
+
+ n = B_NR_ITEMS(tbS0);
+
+ switch (flag) {
+ case M_INSERT: /* insert item */
+
+ if ( n - snum[i] < item_pos )
+ { /* new item or its part falls to first new node S_new[i]*/
+ if ( item_pos == n - snum[i] + 1 && sbytes[i] != -1 )
+ { /* part of new item falls into S_new[i] */
+ int old_key_comp, old_len, r_zeros_number;
+ const char * r_body;
+ int version;
+
+ /* Move snum[i]-1 items from S[0] to S_new[i] */
+ leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, -1, S_new[i]);
+ /* Remember key component and item length */
+ version = ih_version (ih);
+ old_key_comp = le_ih_k_offset( ih );
+ old_len = ih_item_len(ih);
+
+ /* Calculate key component and item length to insert into S_new[i] */
+ set_le_ih_k_offset( ih,
+ le_ih_k_offset(ih) + ((old_len - sbytes[i] )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) );
+
+ put_ih_item_len( ih, sbytes[i] );
+
+ /* Insert part of the item into S_new[i] before 0-th item */
+ bi.tb = tb;
+ bi.bi_bh = S_new[i];
+ bi.bi_parent = NULL;
+ bi.bi_position = 0;
+
+ if ( (old_len - sbytes[i]) > zeros_num ) {
+ r_zeros_number = 0;
+ r_body = body + (old_len - sbytes[i]) - zeros_num;
+ }
+ else {
+ r_body = body;
+ r_zeros_number = zeros_num - (old_len - sbytes[i]);
+ zeros_num -= r_zeros_number;
+ }
+
+ leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number);
+
+ /* Calculate key component and item length to insert into S[i] */
+ set_le_ih_k_offset( ih, old_key_comp );
+ put_ih_item_len( ih, old_len - sbytes[i] );
+ tb->insert_size[0] -= sbytes[i];
+ }
+ else /* whole new item falls into S_new[i] */
+ {
+ /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */
+ leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, sbytes[i], S_new[i]);
+
+ /* Insert new item into S_new[i] */
+ bi.tb = tb;
+ bi.bi_bh = S_new[i];
+ bi.bi_parent = NULL;
+ bi.bi_position = 0;
+ leaf_insert_into_buf (&bi, item_pos - n + snum[i] - 1, ih, body, zeros_num);
+
+ zeros_num = tb->insert_size[0] = 0;
+ }
+ }
+
+ else /* new item or its part doesn't fall into S_new[i] */
+ {
+ leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
+ }
+ break;
+
+ case M_PASTE: /* append item */
+
+ if ( n - snum[i] <= item_pos ) /* pasted item or part of it falls to S_new[i] */
+ {
+ if ( item_pos == n - snum[i] && sbytes[i] != -1 )
+ { /* we must shift part of the appended item */
+ struct item_head * aux_ih;
+
+ RFALSE( ih, "PAP-12210: ih must be 0");
+
+ if ( is_direntry_le_ih (aux_ih = B_N_PITEM_HEAD(tbS0,item_pos))) {
+ /* we append to directory item */
+
+ int entry_count;
+
+ entry_count = ih_entry_count(aux_ih);
+
+ if ( entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count ) {
+ /* new directory entry falls into S_new[i] */
+
+ RFALSE( ! tb->insert_size[0],
+ "PAP-12215: insert_size is already 0");
+ RFALSE( sbytes[i] - 1 >= entry_count,
+ "PAP-12220: there are no so much entries (%d), only %d",
+ sbytes[i] - 1, entry_count);
+
+ /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */
+ leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i]-1, S_new[i]);
+ /* Paste given directory entry to directory item */
+ bi.tb = tb;
+ bi.bi_bh = S_new[i];
+ bi.bi_parent = NULL;
+ bi.bi_position = 0;
+ leaf_paste_in_buffer (&bi, 0, pos_in_item - entry_count + sbytes[i] - 1,
+ tb->insert_size[0], body,zeros_num);
+ /* paste new directory entry */
+ leaf_paste_entries (
+ bi.bi_bh, 0, pos_in_item - entry_count + sbytes[i] - 1,
+ 1, (struct reiserfs_de_head *)body, body + DEH_SIZE,
+ tb->insert_size[0]
+ );
+ tb->insert_size[0] = 0;
+ pos_in_item++;
+ } else { /* new directory entry doesn't fall into S_new[i] */
+ leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
+ }
+ }
+ else /* regular object */
+ {
+ int n_shift, n_rem, r_zeros_number;
+ const char * r_body;
+
+ RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)) ||
+ tb->insert_size[0] <= 0,
+ "PAP-12225: item too short or insert_size <= 0");
+
+ /* Calculate number of bytes which must be shifted from appended item */
+ n_shift = sbytes[i] - tb->insert_size[0];
+ if ( n_shift < 0 )
+ n_shift = 0;
+ leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]);
+
+ /* Calculate number of bytes which must remain in body after append to S_new[i] */
+ n_rem = tb->insert_size[0] - sbytes[i];
+ if ( n_rem < 0 )
+ n_rem = 0;
+ /* Append part of body into S_new[0] */
+ bi.tb = tb;
+ bi.bi_bh = S_new[i];
+ bi.bi_parent = NULL;
+ bi.bi_position = 0;
+
+ if ( n_rem > zeros_num ) {
+ r_zeros_number = 0;
+ r_body = body + n_rem - zeros_num;
+ }
+ else {
+ r_body = body;
+ r_zeros_number = zeros_num - n_rem;
+ zeros_num -= r_zeros_number;
+ }
+
+ leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0]-n_rem, r_body,r_zeros_number);
+ {
+ struct item_head * tmp;
+
+ tmp = B_N_PITEM_HEAD(S_new[i],0);
+ if (is_indirect_le_ih (tmp)) {
+ set_ih_free_space (tmp, 0);
+ set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) +
+ (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT)));
+ } else {
+ set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) +
+ n_rem );
+ }
+ }
+
+ tb->insert_size[0] = n_rem;
+ if ( ! n_rem )
+ pos_in_item++;
+ }
+ }
+ else
+ /* item falls wholly into S_new[i] */
+ {
+ int ret_val;
+ struct item_head * pasted;
+
+#ifdef CONFIG_REISERFS_CHECK
+ struct item_head * ih = B_N_PITEM_HEAD(tbS0,item_pos);
+
+ if ( ! is_direntry_le_ih(ih) && (pos_in_item != ih_item_len(ih) ||
+ tb->insert_size[0] <= 0) )
+ reiserfs_panic (tb->tb_sb, "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len");
+#endif /* CONFIG_REISERFS_CHECK */
+
+ ret_val = leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
+
+ RFALSE( ret_val,
+ "PAP-12240: unexpected value returned by leaf_move_items (%d)",
+ ret_val);
+
+ /* paste into item */
+ bi.tb = tb;
+ bi.bi_bh = S_new[i];
+ bi.bi_parent = NULL;
+ bi.bi_position = 0;
+ leaf_paste_in_buffer(&bi, item_pos - n + snum[i], pos_in_item, tb->insert_size[0], body, zeros_num);
+
+ pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]);
+ if (is_direntry_le_ih (pasted))
+ {
+ leaf_paste_entries (
+ bi.bi_bh, item_pos - n + snum[i], pos_in_item, 1,
+ (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
+ );
+ }
+
+ /* if we paste to indirect item update ih_free_space */
+ if (is_indirect_le_ih (pasted))
+ set_ih_free_space (pasted, 0);
+ zeros_num = tb->insert_size[0] = 0;
+ }
+ }
+
+ else /* pasted item doesn't fall into S_new[i] */
+ {
+ leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
+ }
+ break;
+ default: /* cases d and c */
+ reiserfs_panic (tb->tb_sb, "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)",
+ (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
+ }
+
+ /* hand the first key of the new node and the node itself to the next level */
+ memcpy (insert_key + i,B_N_PKEY(S_new[i],0),KEY_SIZE);
+ insert_ptr[i] = S_new[i];
+
+ RFALSE (!buffer_journaled (S_new [i]) || buffer_journal_dirty (S_new [i]) ||
+ buffer_dirty (S_new [i]),
+ "PAP-12247: S_new[%d] : (%b)", i, S_new[i]);
+ }
+
+ /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the
+ affected item which remains in S */
+ if ( 0 <= item_pos && item_pos < tb->s0num )
+ { /* if we must insert or append into buffer S[0] */
+
+ switch (flag)
+ {
+ case M_INSERT: /* insert item into S[0] */
+ bi.tb = tb;
+ bi.bi_bh = tbS0;
+ bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
+ bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
+ leaf_insert_into_buf (&bi, item_pos, ih, body, zeros_num);
+
+ /* If we insert the first key change the delimiting key */
+ if( item_pos == 0 ) {
+ if (tb->CFL[0]) /* can be 0 in reiserfsck */
+ replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0);
+
+ }
+ break;
+
+ case M_PASTE: { /* append item in S[0] */
+ struct item_head * pasted;
+
+ pasted = B_N_PITEM_HEAD (tbS0, item_pos);
+ /* when directory, may be new entry already pasted */
+ if (is_direntry_le_ih (pasted)) {
+ if ( pos_in_item >= 0 &&
+ pos_in_item <= ih_entry_count(pasted) ) {
+
+ RFALSE( ! tb->insert_size[0],
+ "PAP-12260: insert_size is 0 already");
+
+ /* prepare space */
+ bi.tb = tb;
+ bi.bi_bh = tbS0;
+ bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
+ bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
+ leaf_paste_in_buffer(&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num);
+
+ /* paste entry */
+ leaf_paste_entries (
+ bi.bi_bh, item_pos, pos_in_item, 1, (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]
+ );
+ if ( ! item_pos && ! pos_in_item ) {
+ RFALSE( !tb->CFL[0] || !tb->L[0],
+ "PAP-12270: CFL[0]/L[0] must be specified");
+ if (tb->CFL[0]) {
+ replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0);
+
+ }
+ }
+ tb->insert_size[0] = 0;
+ }
+ } else { /* regular object */
+ if ( pos_in_item == ih_item_len(pasted) ) {
+
+ RFALSE( tb->insert_size[0] <= 0,
+ "PAP-12275: insert size must not be %d",
+ tb->insert_size[0]);
+ bi.tb = tb;
+ bi.bi_bh = tbS0;
+ bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
+ bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
+ leaf_paste_in_buffer (&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num);
+
+ if (is_indirect_le_ih (pasted)) {
+#if 0
+ RFALSE( tb->insert_size[0] != UNFM_P_SIZE,
+ "PAP-12280: insert_size for indirect item must be %d, not %d",
+ UNFM_P_SIZE, tb->insert_size[0]);
+#endif
+ set_ih_free_space (pasted, 0);
+ }
+ tb->insert_size[0] = 0;
+ }
+
+#ifdef CONFIG_REISERFS_CHECK
+ else {
+ if ( tb->insert_size[0] ) {
+ print_cur_tb ("12285");
+ reiserfs_panic (tb->tb_sb, "PAP-12285: balance_leaf: insert_size must be 0 (%d)", tb->insert_size[0]);
+ }
+ }
+#endif /* CONFIG_REISERFS_CHECK */
+
+ }
+ } /* case M_PASTE: */
+ }
+ }
+
+#ifdef CONFIG_REISERFS_CHECK
+ if ( flag == M_PASTE && tb->insert_size[0] ) {
+ print_cur_tb ("12290");
+ reiserfs_panic (tb->tb_sb, "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", tb->insert_size[0]);
+ }
+#endif /* CONFIG_REISERFS_CHECK */
+
+ return 0;
+} /* Leaf level of the tree is balanced (end of balance_leaf) */
+
+
+
+/*
+ * Turn the node behind bi->bi_bh into an empty one: its item count becomes
+ * zero and its free space is set to MAX_CHILD_SIZE() of the buffer.  When
+ * bi->bi_parent is given, the size stored in the parent's child slot
+ * bi->bi_position is zeroed as well.
+ */
+void make_empty_node (struct buffer_info * bi)
+{
+	struct block_head * head;
+
+	RFALSE( bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
+
+	head = B_BLK_HEAD(bi->bi_bh);
+	set_blkh_nr_item( head, 0 );
+	set_blkh_free_space( head, MAX_CHILD_SIZE(bi->bi_bh) );
+
+	/* no parent means there is no child slot to update */
+	if (!bi->bi_parent)
+		return;
+
+	B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */
+}
+
+
+/*
+ * Get first empty buffer.
+ *
+ * Picks the first non-NULL entry of tb->FEB[], resets it with
+ * make_empty_node(), marks it uptodate and moves it from tb->FEB[] to the
+ * same index of tb->used[].  Calls reiserfs_panic() when tb->FEB[] holds
+ * no buffer at all.  Returns the buffer that was taken.
+ */
+struct buffer_head * get_FEB (struct tree_balance * tb)
+{
+	struct buffer_info bi;
+	struct buffer_head * feb;
+	int slot = 0;
+
+	/* look for the first slot that still holds a buffer */
+	while (slot < MAX_FEB_SIZE && tb->FEB[slot] == 0)
+		slot ++;
+
+	if (slot == MAX_FEB_SIZE)
+		reiserfs_panic(tb->tb_sb, "vs-12300: get_FEB: FEB list is empty");
+
+	feb = tb->FEB[slot];
+
+	bi.tb = tb;
+	bi.bi_bh = feb;
+	bi.bi_parent = NULL;
+	bi.bi_position = 0;
+	make_empty_node (&bi);
+	set_buffer_uptodate(feb);
+
+	/* the buffer is now in use: move it from the FEB list to the used list */
+	tb->FEB[slot] = NULL;
+	tb->used[slot] = feb;
+
+	return feb;
+}
+
+
+/*
+ * Remember bh in the first free slot of tb->thrown[] and take a reference
+ * on it.  Buffers are collected here instead of being freed on the spot
+ * because reiserfs_free_block has to be able to schedule.
+ *
+ * A warning is printed when bh is dirty, and another one when every slot
+ * of tb->thrown[] is already taken (bh is not stored in that case).
+ */
+static void store_thrown (struct tree_balance * tb, struct buffer_head * bh)
+{
+	int slot;
+
+	if (buffer_dirty (bh))
+		reiserfs_warning (tb->tb_sb, "store_thrown deals with dirty buffer");
+
+	slot = 0;
+	while (slot < sizeof (tb->thrown)/sizeof (tb->thrown[0])) {
+		if (!tb->thrown[slot]) {
+			tb->thrown[slot] = bh;
+			get_bh(bh) ; /* free_thrown puts this */
+			return;
+		}
+		slot ++;
+	}
+
+	reiserfs_warning (tb->tb_sb, "store_thrown: too many thrown buffers");
+}
+
+/*
+ * Walk tb->thrown[] and, for every buffer recorded there, drop the
+ * reference taken by store_thrown and free the buffer's block with
+ * reiserfs_free_block.  A warning is printed for a buffer that is dirty.
+ */
+static void free_thrown(struct tree_balance *tb)
+{
+	struct buffer_head * bh;
+	b_blocknr_t blocknr;
+	int i;
+
+	for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i++) {
+		bh = tb->thrown[i];
+		if (!bh)
+			continue;
+
+		/* read the block number before the reference is dropped */
+		blocknr = bh->b_blocknr;
+		if (buffer_dirty (bh))
+			reiserfs_warning (tb->tb_sb,
+					  "free_thrown deals with dirty buffer %d",
+					  blocknr);
+		brelse(bh) ; /* incremented in store_thrown */
+		reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
+	}
+}
+
+/*
+ * Mark the node in bh as free (level FREE_LEVEL, zero items), clear the
+ * buffer's dirty bit and queue the buffer on tb's thrown list.
+ */
+void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh)
+{
+    struct block_head * head = B_BLK_HEAD(bh);
+
+    set_blkh_level( head, FREE_LEVEL );
+    set_blkh_nr_item( head, 0 );
+
+    clear_buffer_dirty(bh);
+    store_thrown (tb, bh);
+}
+
+/*
+ * Replace the n_dest'th delimiting key in buffer dest by the n_src'th key
+ * of buffer src.  If src is at the items level the key is copied from the
+ * beginning of its n_src'th item head, otherwise from its n_src'th
+ * delimiting key.  dest is marked dirty afterwards.
+ */
+void replace_key (struct tree_balance * tb, struct buffer_head * dest, int n_dest,
+                  struct buffer_head * src, int n_src)
+{
+    const void * src_key;
+
+    RFALSE( dest == NULL || src == NULL,
+            "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
+            src, dest);
+    RFALSE( ! B_IS_KEYS_LEVEL (dest),
+            "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
+            dest);
+    RFALSE( n_dest < 0 || n_src < 0,
+            "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
+    RFALSE( n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
+            "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
+            n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
+
+    /* pick the location of the source key depending on the kind of node */
+    if (B_IS_ITEMS_LEVEL (src))
+        src_key = B_N_PITEM_HEAD(src,n_src);
+    else
+        src_key = B_N_PDELIM_KEY(src,n_src);
+
+    memcpy (B_N_PDELIM_KEY(dest,n_dest), src_key, KEY_SIZE);
+
+    do_balance_mark_internal_dirty (tb, dest, 0);
+}
+
+
+/*
+ * Return the position of the left neighbor of the level-h node of the
+ * path within its parent.  When the node sits at position 0 of its own
+ * parent, the result is the item count of tb->FL[h]; otherwise it is the
+ * node's position minus one.
+ */
+int get_left_neighbor_position (struct tree_balance * tb, int h)
+{
+    int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1);
+
+    RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FL[h] == 0,
+            "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
+            h, tb->FL[h], h, PATH_H_PPARENT (tb->tb_path, h));
+
+    return (Sh_position == 0) ? B_NR_ITEMS (tb->FL[h]) : Sh_position - 1;
+}
+
+
+/*
+ * Return the position of the right neighbor of the level-h node of the
+ * path within its parent.  When the node's position equals the item
+ * count of its own parent, the result is 0; otherwise it is the node's
+ * position plus one.
+ */
+int get_right_neighbor_position (struct tree_balance * tb, int h)
+{
+    int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1);
+
+    RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FR[h] == 0,
+            "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
+            h, PATH_H_PPARENT (tb->tb_path, h), h, tb->FR[h]);
+
+    return (Sh_position == B_NR_ITEMS (PATH_H_PPARENT (tb->tb_path, h))) ?
+        0 : Sh_position + 1;
+}
+
+
+#ifdef CONFIG_REISERFS_CHECK
+
+int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
+/*
+ * Debug check of an internal node: for a buffer that is in the tree,
+ * every child pointer (there are B_NR_ITEMS + 1 of them) is passed to
+ * is_reusable(); on the first one that is rejected the current tree
+ * balance is printed with the message mes and the filesystem panics.
+ * Buffers that are NULL or not in the tree are ignored.
+ */
+static void check_internal_node (struct super_block * s, struct buffer_head * bh, char * mes)
+{
+    int pos;
+
+    RFALSE( !bh, "PAP-12336: bh == 0");
+
+    if (!bh || !B_IS_IN_TREE (bh))
+        return;
+
+    RFALSE( !buffer_dirty (bh) &&
+            !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
+            "PAP-12337: buffer (%b) must be dirty", bh);
+
+    for (pos = 0; pos <= B_NR_ITEMS (bh); pos ++) {
+        struct disk_child * dc = B_N_CHILD (bh, pos);
+
+        if (is_reusable (s, dc_block_number(dc), 1))
+            continue;
+
+        print_cur_tb (mes);
+        reiserfs_panic (s, "PAP-12338: check_internal_node: invalid child pointer %y in %b", dc, bh);
+    }
+}
+
+
+/*
+ * Return 1 (after printing a warning tagged with the name in which) when
+ * bh is locked without being journal-prepared, or when bh is not in the
+ * tree.  Return 0 otherwise.
+ */
+static int locked_or_not_in_tree (struct buffer_head * bh, char * which)
+{
+    int locked = !buffer_journal_prepared (bh) && buffer_locked (bh);
+
+    if (!locked && B_IS_IN_TREE (bh))
+        return 0;
+
+    reiserfs_warning (NULL, "vs-12339: locked_or_not_in_tree: %s (%b)",
+                      which, bh);
+    return 1;
+}
+
+
+/*
+ * Debug check run before balancing starts.  Panics if cur_tb is already
+ * set.  Otherwise verifies, via locked_or_not_in_tree(), the buffers that
+ * balancing of the leaf level will touch and runs check_leaf() on the
+ * leaves.  Returns non-zero if any buffer failed the check.
+ */
+static int check_before_balancing (struct tree_balance * tb)
+{
+    int problems = 0;
+
+    if ( cur_tb ) {
+        reiserfs_panic (tb->tb_sb, "vs-12335: check_before_balancing: "
+                        "suspect that schedule occurred based on cur_tb not being null at this point in code. "
+                        "do_balance cannot properly handle schedule occurring while it runs.");
+    }
+
+    /* make sure the buffers we are going to modify are unlocked
+       (fix_nodes should already have prepared all of them for us) */
+
+    /* left neighbor and its parents, only if something is shifted left */
+    if ( tb->lnum[0] ) {
+        problems |= locked_or_not_in_tree (tb->L[0], "L[0]");
+        problems |= locked_or_not_in_tree (tb->FL[0], "FL[0]");
+        problems |= locked_or_not_in_tree (tb->CFL[0], "CFL[0]");
+        check_leaf (tb->L[0]);
+    }
+
+    /* right neighbor and its parents, only if something is shifted right */
+    if ( tb->rnum[0] ) {
+        problems |= locked_or_not_in_tree (tb->R[0], "R[0]");
+        problems |= locked_or_not_in_tree (tb->FR[0], "FR[0]");
+        problems |= locked_or_not_in_tree (tb->CFR[0], "CFR[0]");
+        check_leaf (tb->R[0]);
+    }
+
+    /* the leaf at the end of the path is always checked */
+    problems |= locked_or_not_in_tree (PATH_PLAST_BUFFER (tb->tb_path), "S[0]");
+    check_leaf (PATH_PLAST_BUFFER (tb->tb_path));
+
+    return problems;
+}
+
+
+/*
+ * Debug check run after balance_leaf(): for L[0], R[0] (when items were
+ * shifted to them) and for the leaf on the path (when it has a parent on
+ * the path), the free space recorded in the node must equal
+ * MAX_CHILD_SIZE of the node minus the dc_size stored in the parent's
+ * child slot.  Any mismatch ends in reiserfs_panic().
+ */
+static void check_after_balance_leaf (struct tree_balance * tb)
+{
+    struct buffer_head * parent;
+    struct buffer_head * leaf;
+    int left, right;
+
+    if (tb->lnum[0] &&
+        B_FREE_SPACE (tb->L[0]) !=
+        MAX_CHILD_SIZE (tb->L[0]) - dc_size(B_N_CHILD (tb->FL[0], get_left_neighbor_position (tb, 0)))) {
+        print_cur_tb ("12221");
+        reiserfs_panic (tb->tb_sb, "PAP-12355: check_after_balance_leaf: shift to left was incorrect");
+    }
+
+    if (tb->rnum[0] &&
+        B_FREE_SPACE (tb->R[0]) !=
+        MAX_CHILD_SIZE (tb->R[0]) - dc_size(B_N_CHILD (tb->FR[0], get_right_neighbor_position (tb, 0)))) {
+        print_cur_tb ("12222");
+        reiserfs_panic (tb->tb_sb, "PAP-12360: check_after_balance_leaf: shift to right was incorrect");
+    }
+
+    /* without a parent on the path there is nothing to compare S[0] with */
+    parent = PATH_H_PBUFFER(tb->tb_path,1);
+    if (!parent)
+        return;
+
+    leaf = PATH_H_PBUFFER(tb->tb_path,0);
+    left = B_FREE_SPACE (leaf);
+    right = MAX_CHILD_SIZE (leaf) -
+        dc_size(B_N_CHILD (parent, PATH_H_POSITION (tb->tb_path, 1)));
+    if (left == right)
+        return;
+
+    print_cur_tb ("12223");
+    reiserfs_warning (tb->tb_sb,
+                      "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
+                      "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
+                      left,
+                      MAX_CHILD_SIZE (leaf),
+                      parent,
+                      PATH_H_POSITION (tb->tb_path, 1),
+                      dc_size(B_N_CHILD (parent, PATH_H_POSITION (tb->tb_path, 1 )) ),
+                      right );
+    reiserfs_panic (tb->tb_sb, "PAP-12365: check_after_balance_leaf: S is incorrect");
+}
+
+
+/*
+ * Run check_leaf() on the three leaves involved in a balance: the left
+ * neighbor L[0], the right neighbor R[0] and the leaf at the end of the
+ * path.
+ */
+static void check_leaf_level (struct tree_balance * tb)
+{
+    struct buffer_head * left_leaf = tb->L[0];
+    struct buffer_head * right_leaf = tb->R[0];
+
+    check_leaf (left_leaf);
+    check_leaf (right_leaf);
+    check_leaf (PATH_PLAST_BUFFER (tb->tb_path));
+}
+
+/*
+ * Run check_internal_node() on every internal level that took part in
+ * the balance, i.e. for h = 1, 2, ... as long as tb->insert_size[h] is
+ * non-zero: the node on the path, plus L[h] and R[h] when lnum[h] or
+ * rnum[h] is set.
+ */
+static void check_internal_levels (struct tree_balance * tb)
+{
+    int h = 1;
+
+    while (tb->insert_size[h]) {
+        check_internal_node (tb->tb_sb, PATH_H_PBUFFER (tb->tb_path, h), "BAD BUFFER ON PATH");
+        if (tb->lnum[h])
+            check_internal_node (tb->tb_sb, tb->L[h], "BAD L");
+        if (tb->rnum[h])
+            check_internal_node (tb->tb_sb, tb->R[h], "BAD R");
+        h ++;
+    }
+}
+
+#endif
+
+
+
+
+
+
+/* Now we have all of the buffers that must be used in balancing of
+ the tree. We rely on the assumption that schedule() will not occur
+ while do_balance works. ( Only interrupt handlers are acceptable.)
+ We balance the tree according to the analysis made before this,
+ using buffers already obtained. For SMP support it will someday be
+ necessary to add ordered locking of tb. */
+
+/* Some interesting rules of balancing:
+
+ we delete a maximum of two nodes per level per balancing: we never
+ delete R, when we delete two of three nodes L, S, R then we move
+ them into R.
+
+ we only delete L if we are deleting two nodes, if we delete only
+ one node we delete S
+
+ if we shift leaves then we shift as much as we can: this is a
+ deliberate policy of extremism in node packing which results in
+ higher average utilization after repeated random balance operations
+ at the cost of more memory copies and more balancing as a result of
+ small insertions to full nodes.
+
+ if we shift internal nodes we try to evenly balance the node
+ utilization, with consequent less balancing at the cost of lower
+ utilization.
+
+ one could argue that the policy for directories in leaves should be
+ that of internal nodes, but we will wait until another day to
+ evaluate this.... It would be nice to someday measure and prove
+ these assumptions as to what is optimal....
+
+*/
+
+/*
+ * Hook called by do_balance() right before balancing begins.  Asserts
+ * (through RFALSE) that check_before_balancing() finds no problem and,
+ * with CONFIG_REISERFS_CHECK, records tb in cur_tb.
+ */
+static inline void do_balance_starts (struct tree_balance *tb)
+{
+    /* print_cur_tb() can be used here to look at the initial state of
+       struct tree_balance */
+
+    /* store_print_tb (tb); */
+
+    /* do not delete, just comment it out */
+/*  print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb,
+             "check");*/
+    RFALSE( check_before_balancing (tb), "PAP-12340: locked buffers in TB");
+#ifdef CONFIG_REISERFS_CHECK
+    cur_tb = tb;
+#endif
+}
+
+
+/*
+ * Hook called by do_balance() once all levels are balanced: runs the
+ * debug checks (CONFIG_REISERFS_CHECK only) and clears cur_tb, bumps the
+ * s_do_balance counter, releases the nodes held for the balance and
+ * finally frees the buffers collected on the thrown list.
+ */
+static inline void do_balance_completed (struct tree_balance * tb)
+{
+#ifdef CONFIG_REISERFS_CHECK
+    check_leaf_level (tb);
+    check_internal_levels (tb);
+    cur_tb = NULL;
+#endif
+
+    REISERFS_SB(tb->tb_sb)->s_do_balance ++;
+
+    /* release all nodes held to perform the balancing */
+    unfix_nodes(tb);
+
+    /* reiserfs_free_block is no longer schedule safe, so the buffers we
+    ** want freed were put on the thrown list during do_balance and are
+    ** only freed now
+    */
+    free_thrown(tb) ;
+}
+
+
+
+
+
+/*
+ * Balance the tree according to the analysis stored in tb.
+ *
+ *   tb   - tree_balance structure
+ *   ih   - item header of the inserted item
+ *   body - body of the inserted item or bytes to paste
+ *   flag - i - insert, d - delete, c - cut, p - paste
+ *
+ *          Cut means delete part of an item (includes removing an entry
+ *          from a directory).
+ *          Delete means delete whole item.
+ *          Insert means add a new item into the tree.
+ *          Paste means to append to the end of an existing file or to
+ *          insert a directory entry.
+ *
+ * Panics if the filesystem generation changed since tb was prepared.
+ * If tb->insert_size[0] is zero there is nothing to do: a warning is
+ * printed, the nodes are unfixed and the function returns.
+ */
+void do_balance (struct tree_balance * tb,
+                 struct item_head * ih,
+                 const char * body,
+                 int flag)
+{
+    int child_pos;                      /* position of a child node in its parent */
+    int h;                              /* level of the tree being processed */
+    /* while processing one level we sometimes determine what must be
+       inserted into the next higher level: a key or two keys ... */
+    struct item_head insert_key[2];
+    /* ... and the corresponding node pointers for the next level */
+    struct buffer_head *insert_ptr[2];
+
+    tb->tb_mode = flag;
+    tb->need_balance_dirty = 0;
+
+    if (FILESYSTEM_CHANGED_TB(tb)) {
+        reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ;
+    }
+
+    /* bail out when there is no real work to do */
+    if ( ! tb->insert_size[0] ) {
+        reiserfs_warning (tb->tb_sb,
+                          "PAP-12350: do_balance: insert_size == 0, mode == %c",
+                          flag);
+        unfix_nodes(tb);
+        return;
+    }
+
+    atomic_inc (&(fs_generation (tb->tb_sb)));
+    do_balance_starts (tb);
+
+    /* balance_leaf returns 0 except when L, R and S are combined into
+       one node; see balance_internal() for an explanation of this
+       line of code. */
+    child_pos = PATH_H_B_ITEM_ORDER (tb->tb_path, 0) +
+        balance_leaf (tb, ih, body, flag, insert_key, insert_ptr);
+
+#ifdef CONFIG_REISERFS_CHECK
+    check_after_balance_leaf (tb);
+#endif
+
+    /* now the internal levels, bottom up */
+    for ( h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++ )
+        child_pos = balance_internal (tb, h, child_pos, insert_key, insert_ptr);
+
+    do_balance_completed (tb);
+}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
new file mode 100644
index 00000000000..26950113af8
--- /dev/null
+++ b/fs/reiserfs/file.c
@@ -0,0 +1,1408 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+
+#include <linux/time.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/reiserfs_acl.h>
+#include <linux/reiserfs_xattr.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/quotaops.h>
+
+/*
+** We pack the tails of files on file close, not at the time they are written.
+** This implies an unnecessary copy of the tail and an unnecessary indirect item
+** insertion/balancing, for files that are written in one write.
+** It avoids unnecessary tail packings (balances) for files that are written in
+** multiple writes and are small enough to have tails.
+**
+** file_release is called by the VFS layer when the file is closed.  If
+** this is the last open file descriptor, and the file is
+** small enough to have a tail, and the tail is currently in an
+** unformatted node, the tail is converted back into a direct item.
+**
+** We use reiserfs_truncate_file to pack the tail, since it already has
+** all the conditions coded.
+**
+** Besides tail packing, the function discards the inode's preallocated
+** blocks (when REISERFS_PREALLOCATE is defined).  It returns 0 on success
+** or the error reported by the journal calls / reiserfs_truncate_file.
+*/
+static int reiserfs_file_release (struct inode * inode, struct file * filp)
+{
+
+    struct reiserfs_transaction_handle th ;
+    int err;
+    int jbegin_failure = 0;
+
+    if (!S_ISREG (inode->i_mode))
+        BUG ();
+
+    /* fast out for when nothing needs to be done */
+    if ((atomic_read(&inode->i_count) > 1 ||
+         !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
+         !tail_has_to_be_packed(inode)) &&
+        REISERFS_I(inode)->i_prealloc_count <= 0) {
+        return 0;
+    }
+
+    reiserfs_write_lock(inode->i_sb);
+    down (&inode->i_sem);
+    /* freeing preallocation only involves relogging blocks that
+     * are already in the current transaction.  preallocation gets
+     * freed at the end of each transaction, so it is impossible for
+     * us to log any additional blocks (including quota blocks)
+     */
+    err = journal_begin(&th, inode->i_sb, 1);
+    if (err) {
+        /* uh oh, we can't allow the inode to go away while there
+         * are still preallocation blocks pending.  Try to join the
+         * aborted transaction
+         */
+        jbegin_failure = err;
+        err = journal_join_abort(&th, inode->i_sb, 1);
+
+        if (err) {
+            /* hmpf, our choices here aren't good.  We can pin the inode
+             * which will disallow unmount from ever happening, we can
+             * do nothing, which will corrupt random memory on unmount,
+             * or we can forcibly remove the file from the preallocation
+             * list, which will leak blocks on disk.  Let's pin the inode
+             * and let the admin know what is going on.
+             */
+            igrab(inode);
+            /* the format has a %lu conversion, so the inode number
+             * must actually be passed */
+            reiserfs_warning(inode->i_sb, "pinning inode %lu because the "
+                             "preallocation can't be freed", inode->i_ino);
+            goto out;
+        }
+    }
+    reiserfs_update_inode_transaction(inode) ;
+
+#ifdef REISERFS_PREALLOCATE
+    reiserfs_discard_prealloc (&th, inode);
+#endif
+    err = journal_end(&th, inode->i_sb, 1);
+
+    /* copy back the error code from journal_begin */
+    if (!err)
+        err = jbegin_failure;
+
+    if (!err && atomic_read(&inode->i_count) <= 1 &&
+        (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
+        tail_has_to_be_packed (inode)) {
+        /* if regular file is released by last holder and it has been
+           appended (we append by unformatted node only) or its direct
+           item(s) had to be converted, then it may have to be
+           indirect2direct converted */
+        err = reiserfs_truncate_file(inode, 0) ;
+    }
+out:
+    up (&inode->i_sem);
+    reiserfs_write_unlock(inode->i_sb);
+    return err;
+}
+
+/*
+ * Truncate entry point with the signature taking only the inode:
+ * forwards to reiserfs_truncate_file() with its second argument set to 1
+ * (release_file() above passes 0) and drops the return value.
+ */
+static void reiserfs_vfs_truncate_file(struct inode *inode)
+{
+    reiserfs_truncate_file(inode, 1) ;
+}
+
+/*
+ * Sync a reiserfs file.
+ *
+ * Writes the mapping's buffers, commits the transaction(s) for the inode
+ * under the reiserfs write lock and, unless the commit returned 1, issues
+ * an explicit block device flush (presumably 1 means the commit already
+ * took care of the barrier -- confirm against reiserfs_commit_for_inode).
+ *
+ * Returns the negative commit result if there was one, -EIO if
+ * sync_mapping_buffers() failed, 0 otherwise.
+ *
+ * FIXME: sync_mapping_buffers() never has anything to sync.  Can
+ * be removed...
+ */
+static int reiserfs_sync_file(
+                              struct file * p_s_filp,
+                              struct dentry * p_s_dentry,
+                              int datasync
+                              ) {
+    struct inode * inode = p_s_dentry->d_inode;
+    int sync_err;
+    int commit_ret;
+
+    if (!S_ISREG(inode->i_mode))
+        BUG ();
+
+    sync_err = sync_mapping_buffers(inode->i_mapping) ;
+
+    reiserfs_write_lock(inode->i_sb);
+    commit_ret = reiserfs_commit_for_inode(inode);
+    reiserfs_write_unlock(inode->i_sb);
+
+    if (commit_ret != 1)
+        blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+    if (commit_ret < 0)
+        return commit_ret;
+    if (sync_err < 0)
+        return -EIO;
+    return 0;
+}
+
+/* I really do not want to play with memory shortage right now, so
+   to simplify the code, we are not going to write more than this many pages at
+   a time. This still should considerably improve performance compared to the 4k
+   at a time case. This is 32 pages of 4k size.
+   The expansion is fully parenthesized so that it stays a single operand
+   in whatever expression it is used in. */
+#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
+
+/*
+ * Allocates blocks for a file to fulfil a write request.
+ * Maps all unmapped but prepared pages from the list.
+ * Updates metadata with newly allocated block numbers as needed.
+ *
+ * The function takes the reiserfs write lock and opens a transaction on
+ * th.  On success it returns 0 with the write lock dropped and the
+ * transaction still open: the caller is responsible for closing the
+ * transaction and for logging the inode.  On failure a negative errno is
+ * returned (-ENOMEM, -EIO, -ENOSPC or whatever a callee reported); if th
+ * still carries a transaction id at that point, the transaction is ended
+ * here before returning.
+ */
+static int reiserfs_allocate_blocks_for_region(
+                struct reiserfs_transaction_handle *th,
+                struct inode *inode, /* Inode we work with */
+                loff_t pos, /* Writing position */
+                int num_pages, /* number of pages write going
+                                  to touch */
+                int write_bytes, /* amount of bytes to write */
+                struct page **prepared_pages, /* array of
+                                                 prepared pages
+                                               */
+                int blocks_to_allocate /* Amount of blocks we
+                                          need to allocate to
+                                          fit the data into file
+                                        */
+                )
+{
+    struct cpu_key key; // cpu key of item that we are going to deal with
+    struct item_head *ih; // pointer to item head that we are going to deal with
+    struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
+    __u32 * item; // pointer to item we are going to deal with
+    INITIALIZE_PATH(path); // path to item, that we are going to deal with.
+    b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored.
+    reiserfs_blocknr_hint_t hint; // hint structure for block allocator.
+    int res; // return value of various functions that we call; holds
+             // negative errno values, so it must be a signed type.
+    int curr_block; // current block used to keep track of unmapped blocks.
+    int i; // loop counter
+    int itempos; // position in item
+    unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in
+                                                       // first page
+    unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */
+    __u64 hole_size ; // amount of blocks for a file hole, if it needed to be created.
+    int modifying_this_item = 0; // Flag for items traversal code to keep track
+                                 // of the fact that we already prepared
+                                 // current block for journal
+    int will_prealloc = 0;
+    RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?");
+
+    /* only preallocate if this is a small write */
+    if (REISERFS_I(inode)->i_prealloc_count ||
+        (!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
+         blocks_to_allocate <
+         REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
+        will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
+
+    allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
+                               sizeof(b_blocknr_t), GFP_NOFS);
+    /* Nothing is locked and no transaction has been opened by this
+       function yet, so a failed allocation is simply reported to the
+       caller. */
+    if (!allocated_blocks)
+        return -ENOMEM;
+
+    /* First we compose a key to point at the writing position, we want to do
+       that outside of any locking region. */
+    make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/);
+
+    /* If we came here, it means we absolutely need to open a transaction,
+       since we need to allocate some blocks */
+    reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
+    res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); // Wish I knew if this number is enough
+    if (res)
+        goto error_exit;
+    reiserfs_update_inode_transaction(inode) ;
+
+    /* Look for the in-tree position of our write, need path for block allocator */
+    res = search_for_position_by_key(inode->i_sb, &key, &path);
+    if ( res == IO_ERROR ) {
+        res = -EIO;
+        goto error_exit;
+    }
+
+    /* Allocate blocks */
+    /* First fill in "hint" structure for block allocator */
+    hint.th = th; // transaction handle.
+    hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
+    hint.inode = inode; // Inode is needed by block allocator too.
+    hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
+    hint.key = key.on_disk_key; // on disk key of file.
+    hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already.
+    hint.formatted_node = 0; // We are allocating blocks for unformatted node.
+    hint.preallocate = will_prealloc;
+
+    /* Call block allocator to allocate blocks */
+    res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
+    if ( res != CARRY_ON ) {
+        if ( res == NO_DISK_SPACE ) {
+            /* We flush the transaction in case of no space. This way some
+               blocks might become free */
+            SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
+            res = restart_transaction(th, inode, &path);
+            if (res)
+                goto error_exit;
+
+            /* We might have scheduled, so search again */
+            res = search_for_position_by_key(inode->i_sb, &key, &path);
+            if ( res == IO_ERROR ) {
+                res = -EIO;
+                goto error_exit;
+            }
+
+            /* update changed info for hint structure. */
+            res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
+            if ( res != CARRY_ON ) {
+                res = -ENOSPC;
+                pathrelse(&path);
+                goto error_exit;
+            }
+        } else {
+            res = -ENOSPC;
+            pathrelse(&path);
+            goto error_exit;
+        }
+    }
+
+#ifdef __BIG_ENDIAN
+    // Too bad, I have not found any way to convert a given region from
+    // cpu format to little endian format
+    {
+        int i;
+        for ( i = 0; i < blocks_to_allocate ; i++)
+            allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]);
+    }
+#endif
+
+    /* Allocating blocks might well have scheduled and the tree might have
+       changed, let's search the tree again */
+    /* find where in the tree our write should go */
+    res = search_for_position_by_key(inode->i_sb, &key, &path);
+    if ( res == IO_ERROR ) {
+        res = -EIO;
+        goto error_exit_free_blocks;
+    }
+
+    bh = get_last_bh( &path ); // Get a bufferhead for last element in path.
+    ih = get_ih( &path ); // Get a pointer to last item head in path.
+    item = get_item( &path ); // Get a pointer to last item in path
+
+    /* Let's see what we have found */
+    if ( res != POSITION_FOUND ) { /* position not found, this means that we
+                                      might need to append file with holes
+                                      first */
+        // Since we are writing past the file's end, we need to find out if
+        // there is a hole that needs to be inserted before our writing
+        // position, and how many blocks it is going to cover (we need to
+        // populate pointers to file blocks representing the hole with zeros)
+
+        {
+            int item_offset = 1;
+            /*
+             * if ih is stat data, its offset is 0 and we don't want to
+             * add 1 to pos in the hole_size calculation
+             */
+            if (is_statdata_le_ih(ih))
+                item_offset = 0;
+            hole_size = (pos + item_offset -
+                         (le_key_k_offset( get_inode_item_key_version(inode),
+                                           &(ih->ih_key)) +
+                          op_bytes_number(ih, inode->i_sb->s_blocksize))) >>
+                        inode->i_sb->s_blocksize_bits;
+        }
+
+        if ( hole_size > 0 ) {
+            int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time.
+            /* area filled with zeroes, to supply as list of zero blocknumbers
+               We allocate it outside of loop just in case loop would spin for
+               several iterations. */
+            char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway.
+            if ( !zeros ) {
+                res = -ENOMEM;
+                goto error_exit_free_blocks;
+            }
+            memset ( zeros, 0, to_paste*UNFM_P_SIZE);
+            do {
+                to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE );
+                if ( is_indirect_le_ih(ih) ) {
+                    /* Ok, there is existing indirect item already. Need to append it */
+                    /* Calculate position past inserted item */
+                    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
+                    res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste);
+                    if ( res ) {
+                        kfree(zeros);
+                        goto error_exit_free_blocks;
+                    }
+                } else if ( is_statdata_le_ih(ih) ) {
+                    /* No existing item, create it */
+                    /* item head for new item */
+                    struct item_head ins_ih;
+
+                    /* create a key for our new item */
+                    make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3);
+
+                    /* Create new item head for our new item */
+                    make_le_item_head (&ins_ih, &key, key.version, 1,
+                                       TYPE_INDIRECT, to_paste*UNFM_P_SIZE,
+                                       0 /* free space */);
+
+                    /* Find where such item should live in the tree */
+                    res = search_item (inode->i_sb, &key, &path);
+                    if ( res != ITEM_NOT_FOUND ) {
+                        /* item should not exist, otherwise we have error */
+                        if ( res != -ENOSPC ) {
+                            reiserfs_warning (inode->i_sb,
+                                              "green-9008: search_by_key (%K) returned %d",
+                                              &key, res);
+                        }
+                        res = -EIO;
+                        kfree(zeros);
+                        goto error_exit_free_blocks;
+                    }
+                    res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros);
+                } else {
+                    reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
+                }
+                if ( res ) {
+                    kfree(zeros);
+                    goto error_exit_free_blocks;
+                }
+                /* Now we want to check if transaction is too full, and if it is
+                   we restart it. This will also free the path. */
+                if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+                    res = restart_transaction(th, inode, &path);
+                    if (res) {
+                        pathrelse (&path);
+                        kfree(zeros);
+                        goto error_exit;
+                    }
+                }
+
+                /* Well, need to recalculate path and stuff */
+                set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
+                res = search_for_position_by_key(inode->i_sb, &key, &path);
+                if ( res == IO_ERROR ) {
+                    res = -EIO;
+                    kfree(zeros);
+                    goto error_exit_free_blocks;
+                }
+                bh=get_last_bh(&path);
+                ih=get_ih(&path);
+                item = get_item(&path);
+                hole_size -= to_paste;
+            } while ( hole_size );
+            kfree(zeros);
+        }
+    }
+
+    // Go through existing indirect items first
+    // replace all zeroes with blocknumbers from list
+    // Note that if no corresponding item was found, by previous search,
+    // it means there are no existing in-tree representation for file area
+    // we are going to overwrite, so there is nothing to scan through for holes.
+    for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) {
+retry:
+
+        if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) {
+            /* We ran out of data in this indirect item, let's look for another
+               one. */
+            /* First if we are already modifying current item, log it */
+            if ( modifying_this_item ) {
+                journal_mark_dirty (th, inode->i_sb, bh);
+                modifying_this_item = 0;
+            }
+            /* Then set the key to look for a new indirect item (offset of old
+               item is added to old item length) */
+            set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize));
+            /* Search for position of new key in the tree. */
+            res = search_for_position_by_key(inode->i_sb, &key, &path);
+            if ( res == IO_ERROR) {
+                res = -EIO;
+                goto error_exit_free_blocks;
+            }
+            bh=get_last_bh(&path);
+            ih=get_ih(&path);
+            item = get_item(&path);
+            itempos = path.pos_in_item;
+            continue; // loop to check all kinds of conditions and so on.
+        }
+        /* Ok, we have correct position in item now, so let's see if it is
+           representing file hole (blocknumber is zero) and fill it if needed */
+        if ( !item[itempos] ) {
+            /* Ok, a hole. Now we need to check if we already prepared this
+               block to be journaled */
+            while ( !modifying_this_item ) { // loop until succeed
+                /* Well, this item is not journaled yet, so we must prepare
+                   it for journal first, before we can change it */
+                struct item_head tmp_ih; // We copy item head of found item,
+                                         // here to detect if fs changed under
+                                         // us while we were preparing for
+                                         // journal.
+                int fs_gen; // We store fs generation here to find if someone
+                            // changes fs under our feet
+
+                copy_item_head (&tmp_ih, ih); // Remember itemhead
+                fs_gen = get_generation (inode->i_sb); // remember fs generation
+                reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing.
+                if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
+                    // Sigh, fs was changed under us, we need to look for new
+                    // location of item we are working with
+
+                    /* unmark prepared area as journaled and search for its
+                       new position */
+                    reiserfs_restore_prepared_buffer(inode->i_sb, bh);
+                    res = search_for_position_by_key(inode->i_sb, &key, &path);
+                    if ( res == IO_ERROR) {
+                        res = -EIO;
+                        goto error_exit_free_blocks;
+                    }
+                    bh=get_last_bh(&path);
+                    ih=get_ih(&path);
+                    item = get_item(&path);
+                    itempos = path.pos_in_item;
+                    goto retry;
+                }
+                modifying_this_item = 1;
+            }
+            item[itempos] = allocated_blocks[curr_block]; // Assign new block
+            curr_block++;
+        }
+        itempos++;
+    }
+
+    if ( modifying_this_item ) { // We need to log last-accessed block, if it
+                                 // was modified, but not logged yet.
+        journal_mark_dirty (th, inode->i_sb, bh);
+    }
+
+    if ( curr_block < blocks_to_allocate ) {
+        // Oh, well need to append to indirect item, or to create indirect item
+        // if there weren't any
+        if ( is_indirect_le_ih(ih) ) {
+            // Existing indirect item - append. First calculate key for append
+            // position. We do not need to recalculate path as it should
+            // already point to correct place.
+            make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
+            res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
+            if ( res ) {
+                goto error_exit_free_blocks;
+            }
+        } else if (is_statdata_le_ih(ih) ) {
+            // Last found item was statdata. That means we need to create indirect item.
+            struct item_head ins_ih; /* itemhead for new item */
+
+            /* create a key for our new item */
+            make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one,
+                                                             // because that's
+                                                             // where first
+                                                             // indirect item
+                                                             // begins
+            /* Create new item head for our new item */
+            make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT,
+                               (blocks_to_allocate-curr_block)*UNFM_P_SIZE,
+                               0 /* free space */);
+            /* Find where such item should live in the tree */
+            res = search_item (inode->i_sb, &key, &path);
+            if ( res != ITEM_NOT_FOUND ) {
+                /* Well, if we have found such item already, or some error
+                   occurred, we need to warn user and return error */
+                if ( res != -ENOSPC ) {
+                    reiserfs_warning (inode->i_sb,
+                                      "green-9009: search_by_key (%K) "
+                                      "returned %d", &key, res);
+                }
+                res = -EIO;
+                goto error_exit_free_blocks;
+            }
+            /* Insert item into the tree with the data as its body */
+            res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block));
+        } else {
+            reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
+        }
+    }
+
+    // the caller is responsible for closing the transaction
+    // unless we return an error, they are also responsible for logging
+    // the inode.
+    //
+    pathrelse(&path);
+    /*
+     * cleanup preallocation from previous writes
+     * if this is a partial block write
+     */
+    if (write_bytes & (inode->i_sb->s_blocksize -1))
+        reiserfs_discard_prealloc(th, inode);
+    reiserfs_write_unlock(inode->i_sb);
+
+    // go through all the pages/buffers and map the buffers to newly allocated
+    // blocks (so that system knows where to write these pages later).
+    curr_block = 0;
+    for ( i = 0; i < num_pages ; i++ ) {
+        struct page *page=prepared_pages[i]; //current page
+        struct buffer_head *head = page_buffers(page);// first buffer for a page
+        int block_start, block_end; // in-page offsets for buffers.
+
+        if (!page_buffers(page))
+            reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???");
+
+        /* For each buffer in page */
+        for(bh = head, block_start = 0; bh != head || !block_start;
+            block_start=block_end, bh = bh->b_this_page) {
+            if (!bh)
+                reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?");
+            block_end = block_start+inode->i_sb->s_blocksize;
+            if (i == 0 && block_end <= from )
+                /* if this buffer is before requested data to map, skip it */
+                continue;
+            if (i == num_pages - 1 && block_start >= to)
+                /* If this buffer is after requested data to map, abort
+                   processing of current page */
+                break;
+
+            if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
+                map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
+                curr_block++;
+                set_buffer_new(bh);
+            }
+        }
+    }
+
+    RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird");
+
+    kfree(allocated_blocks);
+    return 0;
+
+// Need to deal with transaction here.
+error_exit_free_blocks:
+    pathrelse(&path);
+    // free blocks
+    for( i = 0; i < blocks_to_allocate; i++ )
+        reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1);
+
+error_exit:
+    if (th->t_trans_id) {
+        int err;
+        // update any changes we made to blk count
+        reiserfs_update_sd(th, inode);
+        err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS);
+        if (err)
+            res = err;
+    }
+    reiserfs_write_unlock(inode->i_sb);
+    kfree(allocated_blocks);
+
+    return res;
+}
+
+/*
+ * Give back pages obtained from reiserfs_prepare_file_region_for_write.
+ * For each of the first @num_pages entries of @prepared_pages, in array
+ * order, this calls try_to_free_buffers(), unlock_page() and
+ * page_cache_release() on the page.
+ */
+static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */
+				     size_t num_pages /* amount of pages */)
+{
+	struct page **slot = prepared_pages; // walks the array front to back
+
+	while (num_pages--) {
+		struct page *pg = *slot++;
+
+		try_to_free_buffers(pg);
+		unlock_page(pg);
+		page_cache_release(pg);
+	}
+}
+
+/*
+ * Copy @write_bytes bytes of user data from @buf into the pages listed in
+ * @prepared_pages.  The first page is filled starting at the in-page offset
+ * of @pos, every following page from offset 0.  For each page we call
+ * fault_in_pages_readable() on the source, kmap() the page, fill it with
+ * __copy_from_user(), flush_dcache_page() it and kunmap() it again.
+ *
+ * Returns 0 if every chunk was copied, or -EFAULT as soon as
+ * __copy_from_user() reports a non-zero result (remaining pages are then
+ * left untouched).
+ */
+static int reiserfs_copy_from_user_to_file_region(
+	loff_t pos, /* In-file position */
+	int num_pages, /* Number of pages affected */
+	int write_bytes, /* Amount of bytes to write */
+	struct page **prepared_pages, /* pointer to array to prepared pages */
+	const char __user *buf /* Pointer to user-supplied data */
+	)
+{
+	int in_page = (pos & (PAGE_CACHE_SIZE-1)); // offset at which the current page is entered
+	int idx; // index of the page being filled
+
+	for (idx = 0; idx < num_pages; idx++) {
+		struct page *target = prepared_pages[idx];
+		/* stay inside the page and inside what is left to write */
+		size_t chunk = min_t(size_t,PAGE_CACHE_SIZE-in_page,write_bytes);
+		long not_copied; // status of copy_from_user.
+
+		fault_in_pages_readable( buf, chunk);
+
+		kmap(target);
+		not_copied = __copy_from_user(page_address(target)+in_page, buf, chunk);
+		/* Flush processor's dcache for this page */
+		flush_dcache_page(target);
+		kunmap(target);
+
+		if (not_copied)
+			return -EFAULT; // Was there a fault? abort.
+
+		buf += chunk;
+		write_bytes -= chunk;
+		in_page = 0; // only the first page starts mid-page
+	}
+
+	return 0;
+}
+
+/* taken fs/buffer.c:__block_commit_write
+ *
+ * Commit the byte range [@from, @to) of @page after data has been put there.
+ * Every buffer of the page gets its BH_New bit cleared.  Buffers that overlap
+ * the range are marked uptodate and then either
+ *   - logged (reiserfs_prepare_for_journal() + journal_mark_dirty()) when
+ *     reiserfs_file_data_log(inode) is non-zero, inside a transaction of
+ *     bh_per_page + 1 blocks that is begun and ended here, or
+ *   - marked dirty if they were not dirty yet, and in that case also put on
+ *     the ordered list when reiserfs_data_ordered() is set and the buffer
+ *     was BH_New or the page index is >= the page index of i_size.
+ * Buffers outside the range are only inspected: one that is not uptodate
+ * keeps the page from being marked uptodate at the end.
+ *
+ * Returns 0, or the error returned by journal_begin()/journal_end().
+ *
+ * NOTE(review): when journal_begin() fails we jump to drop_write_lock, which
+ * skips the buffer loop; partial is still 0 at that point, so
+ * SetPageUptodate(page) is executed anyway - confirm this is intended.
+ */
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+ unsigned from, unsigned to)
+{
+ unsigned block_start, block_end; /* in-page byte range of the current buffer */
+ int partial = 0; /* set when a buffer outside [from, to) is not uptodate */
+ unsigned blocksize;
+ struct buffer_head *bh, *head;
+ unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; /* page index of i_size */
+ int new; /* BH_New state of the current buffer before it is cleared */
+ int logit = reiserfs_file_data_log(inode); /* non-zero: journal the data buffers */
+ struct super_block *s = inode->i_sb;
+ int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+ struct reiserfs_transaction_handle th;
+ int ret = 0;
+
+ th.t_trans_id = 0;
+ blocksize = 1 << inode->i_blkbits;
+
+ if (logit) {
+ reiserfs_write_lock(s);
+ ret = journal_begin(&th, s, bh_per_page + 1);
+ if (ret)
+ goto drop_write_lock; /* lands inside the if (logit) block after the loop */
+ reiserfs_update_inode_transaction(inode);
+ }
+ for(bh = head = page_buffers(page), block_start = 0;
+ bh != head || !block_start; /* b_this_page is circular: stop when back at head */
+ block_start=block_end, bh = bh->b_this_page)
+ {
+
+ new = buffer_new(bh);
+ clear_buffer_new(bh);
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) { /* buffer entirely outside the range */
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ } else {
+ set_buffer_uptodate(bh);
+ if (logit) {
+ reiserfs_prepare_for_journal(s, bh, 1);
+ journal_mark_dirty(&th, s, bh);
+ } else if (!buffer_dirty(bh)) {
+ mark_buffer_dirty(bh);
+ /* do data=ordered on any page past the end
+ * of file and any buffer marked BH_New.
+ */
+ if (reiserfs_data_ordered(inode->i_sb) &&
+ (new || page->index >= i_size_index)) {
+ reiserfs_add_ordered_list(inode, bh);
+ }
+ }
+ }
+ }
+ if (logit) {
+ ret = journal_end(&th, s, bh_per_page + 1);
+drop_write_lock:
+ reiserfs_write_unlock(s);
+ }
+ /*
+ * If this is a partial write which happened to make all buffers
+ * uptodate then we can optimize away a bogus readpage() for
+ * the next read(). Here we 'discover' whether the page went
+ * uptodate as a result of this (potentially partial) write.
+ */
+ if (!partial)
+ SetPageUptodate(page);
+ return ret;
+}
+
+
+/*
+ * Second half of a region write, run once the user data sits in
+ * @prepared_pages.  This was separated from the actual copying because we
+ * might want to allocate block numbers in-between.
+ *
+ *  1. Each page's written byte range is committed with
+ *     reiserfs_commit_page().  An error does not stop the loop; the most
+ *     recent non-zero status is what we return.
+ *  2. If the region ends beyond inode->i_size, i_size is raised, the
+ *     "pack on close" flag is recomputed and the inode is pushed out:
+ *     reiserfs_update_sd() when @th has a running transaction, the
+ *     superblock's ->dirty_inode() otherwise.
+ *  3. If @th has a running transaction it is ended here (after a stat data
+ *     update if step 2 did not do one).  th->t_trans_id is zeroed in any
+ *     case.
+ *  4. Every page is unlocked, marked accessed and released.
+ */
+static int reiserfs_submit_file_region_for_write(
+	struct reiserfs_transaction_handle *th,
+	struct inode *inode,
+	loff_t pos, /* Writing position offset */
+	size_t num_pages, /* Number of pages to write */
+	size_t write_bytes, /* number of bytes to write */
+	struct page **prepared_pages /* list of pages */
+	)
+{
+	int region_bytes = write_bytes; // full length of the region, needed after the loop
+	int page_off = (pos & (PAGE_CACHE_SIZE-1)); // Writing offset in the current page.
+	int result = 0; // what we hand back to the caller
+	int rc; // status of the individual calls
+	int sd_written = 0; // set once the stat data was pushed for a size change
+	int n; // page index
+
+	for (n = 0; n < num_pages; n++) {
+		// How many bytes of this page belong to the write
+		int chunk = min_t(int,PAGE_CACHE_SIZE-page_off,write_bytes);
+
+		rc = reiserfs_commit_page(inode, prepared_pages[n], page_off, page_off+chunk);
+		if (rc)
+			result = rc; // keep going, all pages get submitted; the
+				     // error is only reported on exit.
+		write_bytes -= chunk;
+		page_off = 0; // pages after the first are written from their start
+	}
+
+	/* now that we've gotten all the ordered buffers marked dirty,
+	 * we can safely update i_size and close any running transaction
+	 */
+	if ( pos + region_bytes > inode->i_size) {
+		inode->i_size = pos + region_bytes; // Set new size
+
+		/* If the file has grown so much that tail packing is no
+		 * longer possible, reset "need to pack" flag */
+		if ( (have_large_tails (inode->i_sb) &&
+		      inode->i_size > i_block_size (inode)*4) ||
+		     (have_small_tails (inode->i_sb) &&
+		      inode->i_size > i_block_size(inode)) ) {
+			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
+		} else if ( (have_large_tails (inode->i_sb) &&
+			     inode->i_size < i_block_size (inode)*4) ||
+			    (have_small_tails (inode->i_sb) &&
+			     inode->i_size < i_block_size(inode)) ) {
+			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
+		}
+
+		if (!th->t_trans_id) {
+			inode->i_sb->s_op->dirty_inode(inode);
+		} else {
+			reiserfs_write_lock(inode->i_sb);
+			reiserfs_update_sd(th, inode); // And update on-disk metadata
+			reiserfs_write_unlock(inode->i_sb);
+		}
+
+		sd_written = 1;
+	}
+
+	if (th->t_trans_id) {
+		reiserfs_write_lock(inode->i_sb);
+		if (!sd_written)
+			reiserfs_update_sd(th, inode);
+		rc = journal_end(th, th->t_super, th->t_blocks_allocated);
+		if (rc)
+			result = rc;
+		reiserfs_write_unlock(inode->i_sb);
+	}
+	th->t_trans_id = 0;
+
+	/*
+	 * we have to unlock the pages after updating i_size, otherwise
+	 * we race with writepage
+	 */
+	for (n = 0; n < num_pages; n++) {
+		struct page *done = prepared_pages[n];
+
+		unlock_page(done);
+		mark_page_accessed(done);
+		page_cache_release(done);
+	}
+
+	return result;
+}
+
+/*
+ * Check whether a write of @write_bytes bytes at @pos may touch the file's
+ * tail, and if so have the tail converted to an unformatted node.
+ *
+ * Files of at least four blocks are not looked at (this embodies a
+ * dependency on a particular tail policy).  Otherwise we look up the item
+ * for offset pos+write_bytes+1; when the item found is a direct item we call
+ * generic_cont_expand() with that item's key offset, which is expected to
+ * end up in reiserfs_get_block() doing the conversion.
+ *
+ * Returns 0 if nothing had to be done, -EIO if the tree search reported
+ * IO_ERROR, otherwise whatever generic_cont_expand() returned.
+ */
+static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */
+						loff_t pos, /* Writing position */
+						int write_bytes /* amount of bytes to write */
+						)
+{
+	INITIALIZE_PATH(path); // needed for search_for_position
+	struct cpu_key last_byte_key; // Key that would represent last touched writing byte.
+	struct item_head *found; // item header the search ended on
+	int tail_offset; // key offset of the tail; int is enough because
+			 // tails are created only for small files.
+	int ret = 0;
+
+	/* such big files do not have tails, nothing to look for */
+	if ( inode->i_size >= inode->i_sb->s_blocksize*4 )
+		return 0;
+
+	reiserfs_write_lock(inode->i_sb);
+
+	/* find the item containing the last byte to be written, or if
+	 * writing past the end of the file then the last item of the
+	 * file (and then we check its type). */
+	make_cpu_key (&last_byte_key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/);
+	if ( search_for_position_by_key(inode->i_sb, &last_byte_key, &path) == IO_ERROR ) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	found = get_ih(&path);
+	if ( !is_direct_le_ih(found) ) {
+		/* not a tail ("direct" item): nothing to unpack */
+		pathrelse(&path);
+		goto unlock;
+	}
+
+	/* Closest item is the file tail.  Read its offset while the path is
+	 * still held, drop the path, then let generic_cont_expand do the work. */
+	tail_offset = le_key_k_offset(get_inode_item_key_version(inode), &(found->ih_key));
+	pathrelse(&path);
+	ret = generic_cont_expand( inode, tail_offset);
+
+unlock:
+	reiserfs_write_unlock(inode->i_sb);
+	return ret;
+}
+
+/* This function locks pages starting from @pos for @inode.
+ @num_pages pages are locked and stored in
+ @prepared_pages array. Also buffers are allocated for these pages.
+ First and last page of the region is read if it is overwritten only
+ partially. If last page did not exist before write (file hole or file
+ append), it is zeroed, then.
+ Returns number of unallocated blocks that should be allocated to cover
+ new file data.
+ On failure a negative errno is returned instead: -EFAULT if @num_pages
+ is zero, -ENOMEM if a page could not be grabbed, -EIO if reading a
+ partially overwritten buffer failed. In the -ENOMEM and -EIO cases the
+ pages grabbed so far are released with reiserfs_unprepare_pages(), so
+ the caller must not touch @prepared_pages after an error. */
+static int reiserfs_prepare_file_region_for_write(
+ struct inode *inode /* Inode of the file */,
+ loff_t pos, /* position in the file */
+ size_t num_pages, /* number of pages to
+ prepare */
+ size_t write_bytes, /* Amount of bytes to be
+ overwritten from
+ @pos */
+ struct page **prepared_pages /* pointer to array
+ where to store
+ prepared pages */
+ )
+{
+ int res=0; // Return values of different functions we call.
+ unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages.
+ int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page
+ int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
+ /* in-page offset just past the last modified
+ byte in last page (1..PAGE_CACHE_SIZE) */
+ struct address_space *mapping = inode->i_mapping; // Pages are mapped here.
+ int i; // Simple counter
+ int blocks = 0; /* Return value (blocks that should be allocated) */
+ struct buffer_head *bh, *head; // Current bufferhead and first bufferhead
+ // of a page.
+ unsigned block_start, block_end; // Starting and ending offsets of current
+ // buffer in the page.
+ struct buffer_head *wait[2], **wait_bh=wait; // Buffers we issued reads
+ // for and have to wait on.
+ // Note how we have
+ // at most 2 buffers, this is
+ // because we at most may
+ // partially overwrite two
+ // buffers of the region: one at the beginning of write area
+ // and one at the end.
+ // Everything in the middle gets overwritten totally.
+
+ struct cpu_key key; // cpu key of item that we are going to deal with
+ struct item_head *ih = NULL; // pointer to item head that we are going to deal with
+ struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with
+ INITIALIZE_PATH(path); // path to item, that we are going to deal with.
+ __u32 * item=NULL; // pointer to item we are going to deal with
+ int item_pos=-1; /* Position in indirect item */
+
+
+ if ( num_pages < 1 ) {
+ reiserfs_warning (inode->i_sb,
+ "green-9001: reiserfs_prepare_file_region_for_write "
+ "called with zero number of pages to process");
+ return -EFAULT;
+ }
+
+ /* We have 2 loops for pages. In first loop we grab and lock the pages, so
+ that nobody would touch these until we release the pages. Then
+ we'd start to deal with mapping buffers to blocks. */
+ for ( i = 0; i < num_pages; i++) {
+ prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page
+ if ( !prepared_pages[i]) {
+ res = -ENOMEM;
+ goto failed_page_grabbing; /* releases pages 0..i-1 */
+ }
+ if (!page_has_buffers(prepared_pages[i]))
+ create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0);
+ }
+
+ /* Let's count amount of blocks for a case where all the blocks
+ overwritten are new (we will subtract already allocated blocks later)*/
+ if ( num_pages > 2 )
+ /* These are fully overwritten pages, so all the blocks in
+ these pages are counted as needed to be allocated */
+ blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ /* count blocks needed for first page (possibly partially written) */
+ blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) +
+ !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */
+
+ /* Now we account for last page. If last page == first page (we
+ overwrite only one page), we subtract all the blocks past the
+ last writing position in a page out of already calculated number
+ of blocks */
+ blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) -
+ ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
+ /* Note how we do not roundup here since partial blocks still
+ should be allocated */
+
+ /* Now if all the write area lies past the file end, no point in
+ mapping blocks, since there is none, so we just zero out remaining
+ parts of first and last pages in write area (if needed) */
+ if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) { /* first page starts beyond i_size */
+ if ( from != 0 ) {/* First page needs to be partially zeroed */
+ char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
+ memset(kaddr, 0, from);
+ kunmap_atomic( kaddr, KM_USER0);
+ }
+ if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */
+ char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
+ memset(kaddr+to, 0, PAGE_CACHE_SIZE - to);
+ kunmap_atomic( kaddr, KM_USER0);
+ }
+
+ /* Since all blocks are new - use already calculated value */
+ return blocks;
+ }
+
+ /* Well, since we write somewhere into the middle of a file, there is
+ possibility we are writing over some already allocated blocks, so
+ let's map these blocks and subtract number of such blocks out of blocks
+ we need to allocate (calculated above) */
+ /* Mask write position to start on blocksize, we do it out of the
+ loop for performance reasons.
+ NOTE(review): pos stays rounded down for the rest of the function,
+ including the last-page test further below - confirm that is meant. */
+ pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
+ /* Set cpu key to the starting position in a file (on left block boundary);
+ pos is already block aligned here, so the mask below changes nothing */
+ make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/);
+
+ reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key()
+ for ( i = 0; i < num_pages ; i++ ) {
+
+ head = page_buffers(prepared_pages[i]);
+ /* For each buffer in the page */
+ for(bh = head, block_start = 0; bh != head || !block_start;
+ block_start=block_end, bh = bh->b_this_page) {
+ if (!bh)
+ reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
+ /* Find where this buffer ends */
+ block_end = block_start+inode->i_sb->s_blocksize;
+ if (i == 0 && block_end <= from )
+ /* if this buffer is before requested data to map, skip it*/
+ continue;
+
+ if (i == num_pages - 1 && block_start >= to) {
+ /* If this buffer is after requested data to map, abort
+ processing of current page */
+ break;
+ }
+
+ if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) {
+ /* This is optimisation for a case where buffer is mapped
+ and have blocknumber assigned. In case significant amount
+ of such buffers are present, we may avoid some amount
+ of search_by_key calls.
+ Probably it would be possible to move parts of this code
+ out of BKL, but I afraid that would overcomplicate code
+ without any noticeable benefit.
+ */
+ item_pos++; /* keep the position in the current item in step with the key */
+ /* Update the key */
+ set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
+ blocks--; // Decrease the amount of blocks that need to be
+ // allocated
+ continue; // Go to the next buffer
+ }
+
+ if ( !itembuf || /* if first iteration */
+ item_pos >= ih_item_len(ih)/UNFM_P_SIZE)
+ { /* or if we progressed past the
+ current unformatted_item */
+ /* Try to find next item */
+ res = search_for_position_by_key(inode->i_sb, &key, &path);
+ /* Abort if no more items */
+ if ( res != POSITION_FOUND ) {
+ /* make sure later loops don't use this item */
+ itembuf = NULL;
+ item = NULL;
+ break; /* leaves the buffer loop only; the next page searches again because itembuf is NULL */
+ }
+
+ /* Update information about current indirect item */
+ itembuf = get_last_bh( &path );
+ ih = get_ih( &path );
+ item = get_item( &path );
+ item_pos = path.pos_in_item;
+
+ RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected");
+ }
+
+ /* See if there is some block associated with the file
+ at that position, map the buffer to this block */
+ if ( get_block_num(item,item_pos) ) {
+ map_bh(bh, inode->i_sb, get_block_num(item,item_pos));
+ blocks--; // Decrease the amount of blocks that need to be
+ // allocated
+ }
+ item_pos++;
+ /* Update the key */
+ set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
+ }
+ }
+ pathrelse(&path); // Free the path
+ reiserfs_write_unlock(inode->i_sb);
+
+ /* Now zero out unmapped buffers for the first and last pages of
+ write area or issue read requests if page is mapped. */
+ /* First page, see if it is not uptodate */
+ if ( !PageUptodate(prepared_pages[0]) ) {
+ head = page_buffers(prepared_pages[0]);
+
+ /* For each buffer in page */
+ for(bh = head, block_start = 0; bh != head || !block_start;
+ block_start=block_end, bh = bh->b_this_page) {
+
+ if (!bh)
+ reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
+ /* Find where this buffer ends */
+ block_end = block_start+inode->i_sb->s_blocksize;
+ if ( block_end <= from )
+ /* if this buffer is before requested data to map, skip it*/
+ continue;
+ if ( block_start < from ) { /* Aha, our partial buffer */
+ if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
+ issue READ request for it to
+ not lose data */
+ ll_rw_block(READ, 1, &bh);
+ *wait_bh++=bh; /* waited for at the end of the function */
+ } else { /* Not mapped, zero it */
+ char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
+ memset(kaddr+block_start, 0, from-block_start);
+ kunmap_atomic( kaddr, KM_USER0);
+ set_buffer_uptodate(bh);
+ }
+ }
+ }
+ }
+
+ /* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
+ if ( !PageUptodate(prepared_pages[num_pages-1]) ||
+ ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) {
+ head = page_buffers(prepared_pages[num_pages-1]);
+
+ /* for each buffer in page */
+ for(bh = head, block_start = 0; bh != head || !block_start;
+ block_start=block_end, bh = bh->b_this_page) {
+
+ if (!bh)
+ reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
+ /* Find where this buffer ends */
+ block_end = block_start+inode->i_sb->s_blocksize;
+ if ( block_start >= to )
+ /* if this buffer is after requested data to map, stop here */
+ break;
+ if ( block_end > to ) { /* Aha, our partial buffer */
+ if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
+ issue READ request for it to
+ not lose data */
+ ll_rw_block(READ, 1, &bh);
+ *wait_bh++=bh; /* waited for at the end of the function */
+ } else { /* Not mapped, zero it */
+ char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
+ memset(kaddr+to, 0, block_end-to);
+ kunmap_atomic( kaddr, KM_USER0);
+ set_buffer_uptodate(bh);
+ }
+ }
+ }
+ }
+
+ /* Wait for read requests we made to happen, if necessary */
+ while(wait_bh > wait) {
+ wait_on_buffer(*--wait_bh);
+ if (!buffer_uptodate(*wait_bh)) {
+ res = -EIO;
+ goto failed_read;
+ }
+ }
+
+ return blocks;
+failed_page_grabbing:
+ num_pages = i; /* only the pages grabbed so far are released */
+failed_read:
+ reiserfs_unprepare_pages(prepared_pages, num_pages);
+ return res;
+}
+
+/* Write @count bytes at position @ppos in a file indicated by @file
+   from the buffer @buf.
+
+   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
+   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
+   written for (ext2/3).  This is for several reasons:
+
+   * It has no understanding of any filesystem specific optimizations.
+
+   * It enters the filesystem repeatedly for each page that is written.
+
+   * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
+   * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
+   * to reiserfs which allows for fewer tree traversals.
+
+   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
+
+   * Asking the block allocation code for blocks one at a time is slightly less efficient.
+
+   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
+   use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
+   things right finally.
+
+   Future Features: providing search_by_key with hints.
+
+   Return value: for O_DIRECT files, the result of generic_file_write() or the error of one of the journal
+   operations wrapped around it.  Otherwise the number of bytes written if at least one chunk was written, else
+   the status of the step that stopped us (0 when there was nothing to write).
+
+   The region is written in chunks of at most REISERFS_WRITE_PAGES_AT_A_TIME pages.  For every chunk we claim
+   blocks, prepare and lock the pages, allocate the blocks that are missing, copy the user data and finally
+   submit the pages.
+
+*/
+static ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */
+				    const char __user *buf, /* pointer to user supplied data
+(in userspace) */
+				    size_t count, /* amount of bytes to write */
+				    loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to
+						  * new current position before returning. */ )
+{
+	size_t already_written = 0; // Number of bytes already written to the file.
+	loff_t pos; // Current position in the file.
+	ssize_t res; // return value of various functions that we call.
+	int err = 0;
+	struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
+	/* To simplify coding at this time, we store
+	   locked pages in array for now */
+	struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
+	struct reiserfs_transaction_handle th;
+	th.t_trans_id = 0;
+
+	if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment
+		ssize_t result, after_file_end = 0;
+		if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
+			/* If we are appending a file, we need to put this savelink in here.
+			   If we will crash while doing direct io, finish_unfinished will
+			   cut the garbage from the file end. */
+			reiserfs_write_lock(inode->i_sb);
+			err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
+			if (err) {
+				reiserfs_write_unlock (inode->i_sb);
+				return err;
+			}
+			reiserfs_update_inode_transaction(inode);
+			add_save_link (&th, inode, 1 /* Truncate */);
+			after_file_end = 1;
+			err = journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
+			reiserfs_write_unlock(inode->i_sb);
+			if (err)
+				return err;
+		}
+		result = generic_file_write(file, buf, count, ppos);
+
+		if ( after_file_end ) { /* Now update i_size and remove the savelink */
+			struct reiserfs_transaction_handle th;
+			reiserfs_write_lock(inode->i_sb);
+			err = journal_begin(&th, inode->i_sb, 1);
+			if (err) {
+				reiserfs_write_unlock (inode->i_sb);
+				return err;
+			}
+			reiserfs_update_inode_transaction(inode);
+			reiserfs_update_sd(&th, inode);
+			err = journal_end(&th, inode->i_sb, 1);
+			if (err) {
+				reiserfs_write_unlock (inode->i_sb);
+				return err;
+			}
+			err = remove_save_link (inode, 1/* truncate */);
+			reiserfs_write_unlock(inode->i_sb);
+			if (err)
+				return err;
+		}
+
+		return result;
+	}
+
+	if ( unlikely((ssize_t) count < 0 ))
+		return -EINVAL;
+
+	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
+		return -EFAULT;
+
+	down(&inode->i_sem); // locks the entire file for just us
+
+	pos = *ppos;
+
+	/* Check if we can write to specified region of file, file
+	   is not overly big and this kind of stuff. Adjust pos and
+	   count, if needed */
+	res = generic_write_checks(file, &pos, &count, 0);
+	if (res)
+		goto out;
+
+	if ( count == 0 )
+		goto out;
+
+	res = remove_suid(file->f_dentry);
+	if (res)
+		goto out;
+
+	inode_update_time(inode, 1); /* Both mtime and ctime */
+
+	// Ok, we are done with all the checks.
+
+	// Now we should start real work
+
+	/* If we are going to write past the file's packed tail or if we are going
+	   to overwrite part of the tail, we need that tail to be converted into
+	   unformatted node */
+	res = reiserfs_check_for_tail_and_convert( inode, pos, count);
+	if (res)
+		goto out;
+
+	/* res is 0 whenever the loop body is entered: every failing step below
+	   leaves the loop with break */
+	while ( count > 0) {
+		/* This is the main loop in which we running until some error occures
+		   or until we write all of the data. */
+		size_t num_pages;/* amount of pages we are going to write this iteration */
+		size_t write_bytes; /* amount of bytes to write during this iteration */
+		int blocks_to_allocate; /* how much blocks we need to allocate for this iteration.
+					   Must be signed: reiserfs_prepare_file_region_for_write
+					   reports failure as a negative errno. */
+
+		/* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/
+		num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
+									 pages */
+			    ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT);
+						/* convert size to amount of
+						   pages */
+		reiserfs_write_lock(inode->i_sb);
+		if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
+			|| num_pages > reiserfs_can_fit_pages(inode->i_sb) ) {
+			/* If we were asked to write more data than we want to or if there
+			   is not that much space, then we shorten amount of data to write
+			   for this iteration. */
+			num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb));
+			/* Also we should not forget to set size in bytes accordingly */
+			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
+					(pos & (PAGE_CACHE_SIZE-1));
+					/* If position is not on the
+					   start of the page, we need
+					   to subtract the offset
+					   within page */
+		} else
+			write_bytes = count;
+
+		/* reserve the blocks to be allocated later, so that later on
+		   we still have the space to write the blocks to */
+		reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
+		reiserfs_write_unlock(inode->i_sb);
+
+		if ( !num_pages ) { /* If we do not have enough space even for a single page */
+			/* NOTE(review): the right-hand side masks i_size with
+			   blocksize-1, i.e. compares pos with the offset of i_size
+			   inside its block - confirm this is the intended test for
+			   "writing past the file end". */
+			if ( pos > (inode->i_size & (inode->i_sb->s_blocksize-1))) {
+				res = -ENOSPC; // In case we are writing past the file end,
+				break;	       // report -ENOSPC and stop.
+			}
+			// Otherwise we are possibly overwriting the file, so
+			// let's set write size to be equal or less than blocksize.
+			// This way we get it correctly for file holes.
+			// But overwriting files on absolutelly full volumes would not
+			// be very efficient. Well, people are not supposed to fill
+			// 100% of disk space anyway.
+			// res is deliberately left at 0 here: if no block has to be
+			// allocated below, the overwrite must be allowed to proceed.
+			write_bytes = min_t(size_t, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1)));
+			num_pages = 1;
+			// No blocks were claimed before, so do it now.
+			reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits));
+		}
+
+		/* Prepare for writing into the region, read in all the
+		   partially overwritten pages, if needed. And lock the pages,
+		   so that nobody else can access these until we are done.
+		   We get number of actual blocks needed as a result.*/
+		blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages);
+		if ( blocks_to_allocate < 0 ) {
+			/* the pages were already released by the callee */
+			res = blocks_to_allocate;
+			reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
+			break;
+		}
+
+		/* First we correct our estimate of how many blocks we need */
+		reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate );
+
+		if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
+			/* Fill in all the possible holes and append the file if needed */
+			res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
+		}
+
+		/* well, we have allocated the blocks, so it is time to free
+		   the reservation we made earlier. */
+		reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate);
+		if ( res ) {
+			reiserfs_unprepare_pages(prepared_pages, num_pages);
+			break;
+		}
+
+/* NOTE that allocating blocks and filling blocks can be done in reverse order
+   and probably we would do that just to get rid of garbage in files after a
+   crash */
+
+		/* Copy data from user-supplied buffer to file's pages */
+		res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf);
+		if ( res ) {
+			reiserfs_unprepare_pages(prepared_pages, num_pages);
+			break;
+		}
+
+		/* Send the pages to disk and unlock them. */
+		res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
+							    write_bytes,prepared_pages);
+		if ( res )
+			break;
+
+		already_written += write_bytes;
+		buf += write_bytes;
+		*ppos = pos += write_bytes;
+		count -= write_bytes;
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+	}
+
+	/* this is only true on error */
+	if (th.t_trans_id) {
+		reiserfs_write_lock(inode->i_sb);
+		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
+		reiserfs_write_unlock(inode->i_sb);
+		if (err) {
+			res = err;
+			goto out;
+		}
+	}
+
+	if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
+		res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
+
+	up(&inode->i_sem);
+	reiserfs_async_progress_wait(inode->i_sb);
+	return (already_written != 0)?already_written:res;
+
+out:
+	up(&inode->i_sem); // unlock the file on exit.
+	return res;
+}
+
+static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf,
+				  size_t count, loff_t pos)
+{
+	/* Nothing reiserfs specific here: the request is forwarded unchanged
+	 * to generic_file_aio_write() and its result is handed back as is. */
+	ssize_t written;
+
+	written = generic_file_aio_write(iocb, buf, count, pos);
+	return written;
+}
+
+
+
+struct file_operations reiserfs_file_operations = { /* file_operations table for reiserfs; only .write and .aio_write point at functions defined in this file */
+ .read = generic_file_read,
+ .write = reiserfs_file_write, /* region-at-a-time writer defined above */
+ .ioctl = reiserfs_ioctl,
+ .mmap = generic_file_mmap,
+ .release = reiserfs_file_release,
+ .fsync = reiserfs_sync_file,
+ .sendfile = generic_file_sendfile,
+ .aio_read = generic_file_aio_read,
+ .aio_write = reiserfs_aio_write, /* forwards to generic_file_aio_write() */
+};
+
+
+struct inode_operations reiserfs_file_inode_operations = { /* inode_operations table; every handler named here is defined outside this chunk */
+ .truncate = reiserfs_vfs_truncate_file,
+ .setattr = reiserfs_setattr,
+ .setxattr = reiserfs_setxattr, /* extended attribute entry points */
+ .getxattr = reiserfs_getxattr,
+ .listxattr = reiserfs_listxattr,
+ .removexattr = reiserfs_removexattr,
+ .permission = reiserfs_permission,
+};
+
+
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
new file mode 100644
index 00000000000..e4f64be9e15
--- /dev/null
+++ b/fs/reiserfs/fix_node.c
@@ -0,0 +1,2518 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+/**
+ ** old_item_num
+ ** old_entry_num
+ ** set_entry_sizes
+ ** create_virtual_node
+ ** check_left
+ ** check_right
+ ** directory_part_size
+ ** get_num_ver
+ ** set_parameters
+ ** is_leaf_removable
+ ** are_leaves_removable
+ ** get_empty_nodes
+ ** get_lfree
+ ** get_rfree
+ ** is_left_neighbor_in_cache
+ ** decrement_key
+ ** get_far_parent
+ ** get_parents
+ ** can_node_be_removed
+ ** ip_check_balance
+ ** dc_check_balance_internal
+ ** dc_check_balance_leaf
+ ** dc_check_balance
+ ** check_balance
+ ** get_direct_parent
+ ** get_neighbors
+ ** fix_nodes
+ **
+ **
+ **/
+
+
+#include <linux/config.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/buffer_head.h>
+
+
+/* To make any changes in the tree we find a node, that contains item
+ to be changed/deleted or position in the node we insert a new item
+ to. We call this node S. To do balancing we need to decide what we
+ will shift to left/right neighbor, or to a new node, where new item
+ will be etc. To make this analysis simpler we build virtual
+ node. Virtual node is an array of items, that will replace items of
+ node S. (For instance if we are going to delete an item, virtual
+ node does not contain it). Virtual node keeps information about
+ item sizes and types, mergeability of first and last items, sizes
+ of all entries in directory item. We use this array of items when
+ calculating what we can shift to neighbors and how many nodes we
+ have to have if we do not do any shifting, if we shift to left/right
+ neighbor or to both. */
+
+
+/* Map an item number of the virtual node (new_num) back to the number the
+ same item has in the source buffer. Items located before the affected
+ item, and all items in paste/cut mode, keep their number; behind an
+ inserted item the old number is one less, behind a deleted item it is
+ one more. */
+static inline int old_item_num (int new_num, int affected_item_num, int mode)
+{
+ /* everything in front of the affected item is not renumbered */
+ if (new_num < affected_item_num)
+ return new_num;
+
+ /* paste and cut do not change the number of items */
+ if (mode == M_CUT || mode == M_PASTE)
+ return new_num;
+
+ if (mode != M_INSERT) {
+ /* the only mode left is delete: the source node has one more item */
+ RFALSE( mode != M_DELETE,
+ "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode);
+ return new_num + 1;
+ }
+
+ /* insert mode: the source node has one item less */
+ RFALSE( new_num == 0,
+ "vs-8005: for INSERT mode and item number of inserted item");
+ return new_num - 1;
+}
+
+/* Fill in the virtual node tb->tb_vn for level h of the balance.
+ *
+ * vn_size is always set to the used space of S[h] plus tb->insert_size[h].
+ * For internal levels (h != 0) only vn_nr_item is derived from that size and
+ * the function returns. For the leaf level an array of virtual items is
+ * placed right behind the virtual_node structure, one per item the node
+ * will contain after the operation, and each item type's op_create_vi() is
+ * asked to describe the item in the area starting at vn_free_ptr. Finally
+ * the left/right mergeability flags of the first/last virtual item are set.
+ *
+ * Panics ("vs-8030") if vn_free_ptr runs past tb->vn_buf + tb->vn_buf_size.
+ */
+static void create_virtual_node (struct tree_balance * tb, int h)
+{
+ struct item_head * ih;
+ struct virtual_node * vn = tb->tb_vn;
+ int new_num;
+ struct buffer_head * Sh; /* this comes from tb->S[h] */
+
+ Sh = PATH_H_PBUFFER (tb->tb_path, h);
+
+ /* size of changed node */
+ vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h];
+
+ /* for internal nodes the array of virtual items is not created */
+ if (h) {
+ vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
+ return;
+ }
+
+ /* number of items in virtual node */
+ vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0);
+
+ /* first virtual item: the array starts right behind the virtual_node
+ structure; it is zeroed and vn_free_ptr is moved past it */
+ vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
+ memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item));
+ vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item);
+
+
+ /* first item in the node */
+ ih = B_N_PITEM_HEAD (Sh, 0);
+
+ /* define the mergeability for 0-th item (if it is not being deleted) */
+ if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
+ vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
+
+ /* go through all items that remain in the virtual node (except for the new (inserted) one) */
+ for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) {
+ int j;
+ struct virtual_item * vi = vn->vn_vi + new_num;
+ int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1);
+
+
+ /* the inserted item has no counterpart in Sh; it is set up after the loop */
+ if (is_affected && vn->vn_mode == M_INSERT)
+ continue;
+
+ /* get item number in source node */
+ j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode);
+
+ vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
+ vi->vi_ih = ih + j;
+ vi->vi_item = B_I_PITEM (Sh, ih + j);
+ vi->vi_uarea = vn->vn_free_ptr;
+
+ // FIXME: the overflow check below runs only after op_create_vi
+ // returned, i.e. after the item operation has used the memory
+ vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]);
+ if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
+ reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: "
+ "virtual node space consumed");
+
+ if (!is_affected)
+ /* this is not being changed */
+ continue;
+
+ if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
+ /* the affected item grows/shrinks by insert_size[0] */
+ vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
+ vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
+ }
+ }
+
+
+ /* virtual inserted item is not defined yet */
+ if (vn->vn_mode == M_INSERT) {
+ struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num;
+
+ RFALSE( vn->vn_ins_ih == 0,
+ "vs-8040: item header of inserted item is not specified");
+ vi->vi_item_len = tb->insert_size[0];
+ vi->vi_ih = vn->vn_ins_ih;
+ vi->vi_item = vn->vn_data;
+ vi->vi_uarea = vn->vn_free_ptr;
+
+ /* NOTE(review): the return value is not added to vn_free_ptr here,
+ unlike in the loop above; nothing in this function uses
+ vn_free_ptr afterwards */
+ op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]);
+ }
+
+ /* set right merge flag: we take right delimiting key and check whether it is a mergeable item */
+ if (tb->CFR[0]) {
+ struct reiserfs_key * key;
+
+ key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]);
+ if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE ||
+ vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1))
+ vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE;
+
+#ifdef CONFIG_REISERFS_CHECK
+ if (op_is_left_mergeable (key, Sh->b_size) &&
+ !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) {
+ /* we delete last item and it could be merged with right neighbor's first item */
+ if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) &&
+ I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) {
+ /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
+ print_block (Sh, 0, -1, -1);
+ reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
+ key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE);
+ } else
+ /* we can delete directory item, that has only one directory entry in it */
+ ;
+ }
+#endif
+
+ }
+}
+
+
+/* Using the virtual node, work out how much of S[h] can be shifted to the
+ * left neighbor when that neighbor has cur_free bytes of free space.
+ *
+ * Internal level (h > 0): tb->lnum[h] is simply the number of
+ * (DC_SIZE + KEY_SIZE) units that fit into cur_free.
+ * Leaf level: tb->lnum[0] receives the number of items that can move
+ * (a partially moved item is counted too) and tb->lbytes the number of
+ * units of the boundary item as reported by op_check_left(), or -1 when
+ * no item is split.
+ */
+static void check_left (struct tree_balance * tb, int h, int cur_free)
+{
+ int i;
+ struct virtual_node * vn = tb->tb_vn;
+ struct virtual_item * vi;
+ int d_size, ih_size;
+
+ RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
+
+ /* internal level */
+ if (h > 0) {
+ tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
+ return;
+ }
+
+ /* leaf level */
+
+ if (!cur_free || !vn->vn_nr_item) {
+ /* no free space or nothing to move */
+ tb->lnum[h] = 0;
+ tb->lbytes = -1;
+ return;
+ }
+
+ RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
+ "vs-8055: parent does not exist or invalid");
+
+ vi = vn->vn_vi;
+ /* if the first item merges with the last item of L[0], its header is not needed there */
+ if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
+ /* all contents of S[0] fits into L[0] */
+
+ RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
+ "vs-8055: invalid mode or balance condition failed");
+
+ tb->lnum[0] = vn->vn_nr_item;
+ tb->lbytes = -1;
+ return;
+ }
+
+
+ d_size = 0, ih_size = IH_SIZE;
+
+ /* first item may be merged with last item in left neighbor */
+ if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
+ d_size = -((int)IH_SIZE), ih_size = 0;
+
+ tb->lnum[0] = 0;
+ /* the loop step resets ih_size/d_size, so the merge discount above
+ applies to the first item only */
+ for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) {
+ d_size += vi->vi_item_len;
+ if (cur_free >= d_size) {
+ /* the item can be shifted entirely */
+ cur_free -= d_size;
+ tb->lnum[0] ++;
+ continue;
+ }
+
+ /* the item cannot be shifted entirely, try to split it */
+ /* check whether L[0] can hold ih and at least one byte of the item body */
+ if (cur_free <= ih_size) {
+ /* cannot shift even a part of the current item */
+ tb->lbytes = -1;
+ return;
+ }
+ cur_free -= ih_size;
+
+ tb->lbytes = op_check_left (vi, cur_free, 0, 0);
+ if (tb->lbytes != -1)
+ /* count partially shifted item */
+ tb->lnum[0] ++;
+
+ break;
+ }
+
+ return;
+}
+
+
+/* Using the virtual node, work out how much of S[h] can be shifted to the
+ * right neighbor when that neighbor has cur_free bytes of free space.
+ *
+ * Internal level (h > 0): tb->rnum[h] is the number of
+ * (DC_SIZE + KEY_SIZE) units that fit into cur_free.
+ * Leaf level: items are examined from the last one backwards;
+ * tb->rnum[0] receives the number of items that can move (a partially
+ * moved item is counted too) and tb->rbytes the number of units of the
+ * boundary item as reported by op_check_right(), or -1 when no item is
+ * split.
+ */
+static void check_right (struct tree_balance * tb, int h, int cur_free)
+{
+ int i;
+ struct virtual_node * vn = tb->tb_vn;
+ struct virtual_item * vi;
+ int d_size, ih_size;
+
+ RFALSE( cur_free < 0, "vs-8070: cur_free < 0");
+
+ /* internal level */
+ if (h > 0) {
+ tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
+ return;
+ }
+
+ /* leaf level */
+
+ if (!cur_free || !vn->vn_nr_item) {
+ /* no free space or nothing to move */
+ tb->rnum[h] = 0;
+ tb->rbytes = -1;
+ return;
+ }
+
+ RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
+ "vs-8075: parent does not exist or invalid");
+
+ vi = vn->vn_vi + vn->vn_nr_item - 1;
+ /* if the last item merges with the first item of R[0], its header is not needed there */
+ if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
+ /* all contents of S[0] fits into R[0] */
+
+ RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
+ "vs-8080: invalid mode or balance condition failed");
+
+ tb->rnum[h] = vn->vn_nr_item;
+ tb->rbytes = -1;
+ return;
+ }
+
+ d_size = 0, ih_size = IH_SIZE;
+
+ /* last item may be merged with first item in right neighbor */
+ if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
+ d_size = -(int)IH_SIZE, ih_size = 0;
+
+ tb->rnum[0] = 0;
+ /* the loop step resets d_size/ih_size, so the merge discount above
+ applies to the last item only */
+ for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) {
+ d_size += vi->vi_item_len;
+ if (cur_free >= d_size) {
+ /* the item can be shifted entirely */
+ cur_free -= d_size;
+ tb->rnum[0] ++;
+ continue;
+ }
+
+ /* check whether R[0] can hold ih and at least one byte of the item body */
+ if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */
+ tb->rbytes = -1;
+ return;
+ }
+
+ /* R[0] can hold the header of the item and at least one byte of its body */
+ cur_free -= ih_size; /* cur_free is still > 0 */
+
+ tb->rbytes = op_check_right (vi, cur_free);
+ if (tb->rbytes != -1)
+ /* count partially shifted item */
+ tb->rnum[0] ++;
+
+ break;
+ }
+
+ return;
+}
+
+
+/*
+ * Calculate how many nodes are needed to hold the part of the virtual node
+ * that stays in S[h] after shifting to the neighbors.
+ *
+ * mode - balancing mode; asserted to be M_INSERT or M_PASTE
+ * tb - tree_balance structure
+ * h - level of the node
+ * from - number of items, which are shifted to left neighbor entirely
+ * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
+ * to - number of items, which are shifted to right neighbor entirely
+ * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor
+ * snum012 - output: [0-2] are incremented by the number of items that go
+ * to S[0], the first new node and the second new node;
+ * [3] and [4] are set to s1bytes and s2bytes (-1 if unused)
+ * flow - non-zero if items may be split between nodes
+ *
+ * Returns the number of nodes needed.
+ */
+static int get_num_ver (int mode, struct tree_balance * tb, int h,
+ int from, int from_bytes,
+ int to, int to_bytes,
+ short * snum012, int flow
+ )
+{
+ int i;
+ int cur_free;
+ // int bytes;
+ int units;
+ struct virtual_node * vn = tb->tb_vn;
+ // struct virtual_item * vi;
+
+ int total_node_size, max_node_size, current_item_size;
+ int needed_nodes;
+ int start_item, /* position of item we start filling node from */
+ end_item, /* position of item we finish filling node by */
+ start_bytes,/* number of first bytes (entries for directory) of start_item-th item
+ we do not include into node that is being filled */
+ end_bytes; /* number of last bytes (entries for directory) of end_item-th item
+ we do not include into node that is being filled */
+ int split_item_positions[2]; /* these are positions in virtual item of
+ items, that are split between S[0] and
+ S1new and S1new and S2new */
+
+ split_item_positions[0] = -1;
+ split_item_positions[1] = -1;
+
+ /* We only create additional nodes if we are in insert or paste mode
+ or we are in replace mode at the internal level. If h is 0 and
+ the mode is M_REPLACE then in fix_nodes we change the mode to
+ paste or insert before we get here in the code. */
+ RFALSE( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
+ "vs-8100: insert_size < 0 in overflow");
+
+ max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h));
+
+ /* snum012 [0-2] - number of items, that lay
+ to S[0], first new node and second new node */
+ snum012[3] = -1; /* s1bytes */
+ snum012[4] = -1; /* s2bytes */
+
+ /* internal level */
+ if (h > 0) {
+ i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
+ if (i == max_node_size)
+ return 1;
+ return (i / max_node_size + 1);
+ }
+
+ /* leaf level */
+ needed_nodes = 1;
+ total_node_size = 0;
+ cur_free = max_node_size;
+
+ // start from 'from'-th item
+ start_item = from;
+ // skip its first 'start_bytes' units
+ start_bytes = ((from_bytes != -1) ? from_bytes : 0);
+
+ // last included item is the 'end_item'-th one
+ end_item = vn->vn_nr_item - to - 1;
+ // do not count last 'end_bytes' units of 'end_item'-th item
+ end_bytes = (to_bytes != -1) ? to_bytes : 0;
+
+ /* go through all item beginning from the start_item-th item and ending by
+ the end_item-th item. Do not count first 'start_bytes' units of
+ 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
+
+ for (i = start_item; i <= end_item; i ++) {
+ struct virtual_item * vi = vn->vn_vi + i;
+ int skip_from_end = ((i == end_item) ? end_bytes : 0);
+
+ RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed");
+
+ /* get size of current item */
+ current_item_size = vi->vi_item_len;
+
+ /* do not take in calculation head part (from_bytes) of from-th item */
+ current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes);
+
+ /* do not take in calculation tail part of last item */
+ current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end);
+
+ /* if item fits into current node entirely */
+ if (total_node_size + current_item_size <= max_node_size) {
+ snum012[needed_nodes - 1] ++;
+ total_node_size += current_item_size;
+ start_bytes = 0;
+ continue;
+ }
+
+ if (current_item_size > max_node_size) {
+ /* virtual item length is longer, than max size of item in
+ a node. It is impossible for direct item */
+ RFALSE( is_direct_le_ih (vi->vi_ih),
+ "vs-8110: "
+ "direct item length is %d. It can not be longer than %d",
+ current_item_size, max_node_size);
+ /* we will try to split it */
+ flow = 1;
+ }
+
+ if (!flow) {
+ /* as we do not split items, take new node and continue */
+ needed_nodes ++; i --; total_node_size = 0;
+ continue;
+ }
+
+ // calculate number of item units which fit into node being
+ // filled
+ {
+ int free_space;
+
+ free_space = max_node_size - total_node_size - IH_SIZE;
+ units = op_check_left (vi, free_space, start_bytes, skip_from_end);
+ if (units == -1) {
+ /* nothing fits into current node, take new node and continue */
+ needed_nodes ++, i--, total_node_size = 0;
+ continue;
+ }
+ }
+
+ /* something fits into the current node */
+ //if (snum012[3] != -1 || needed_nodes != 1)
+ // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
+ //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
+ start_bytes += units;
+ /* NOTE(review): for needed_nodes > 2 this index is 5 or more, beyond
+ the [0-4] layout described above; whether the caller's buffer has
+ room there cannot be seen from this function - confirm */
+ snum012[needed_nodes - 1 + 3] = units;
+
+ if (needed_nodes > 2)
+ reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: "
+ "split_item_position is out of boundary");
+ snum012[needed_nodes - 1] ++;
+ /* split_item_positions has only two slots; in the case warned about
+ above the store would land outside the array, so it is skipped.
+ Only slots 0 and 1 are read below. */
+ if (needed_nodes <= 2)
+ split_item_positions[needed_nodes - 1] = i;
+ needed_nodes ++;
+ /* continue from the same item with start_bytes != -1 */
+ start_item = i;
+ i --;
+ total_node_size = 0;
+ }
+
+ // snum012[4] (if it is not -1) contains number of units of which
+ // are to be in S1new, snum012[3] - to be in S0. They are supposed
+ // to be S1bytes and S2bytes correspondingly, so recalculate
+ if (snum012[4] > 0) {
+ int split_item_num;
+ int bytes_to_r, bytes_to_l;
+ int bytes_to_S1new;
+
+ split_item_num = split_item_positions[1];
+ bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
+ bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
+ bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0);
+
+ // s2bytes
+ snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new;
+
+ if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
+ vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
+ reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not "
+ "directory or indirect item");
+ }
+
+ /* now we know S2bytes, calculate S1bytes */
+ if (snum012[3] > 0) {
+ int split_item_num;
+ int bytes_to_r, bytes_to_l;
+ int bytes_to_S2new;
+
+ split_item_num = split_item_positions[0];
+ bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
+ bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
+ bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0);
+
+ // s1bytes
+ snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new;
+ }
+
+ return needed_nodes;
+}
+
+
+#ifdef CONFIG_REISERFS_CHECK
+extern struct tree_balance * cur_tb;
+#endif
+
+
+/* Store the result of the balancing analysis for one level in tb, where the
+ * routines that perform the balancing read it later.
+ * Parameters:
+ * tb tree_balance structure;
+ * h current level of the node;
+ * lnum number of items from S[h] that must be shifted to L[h];
+ * rnum number of items from S[h] that must be shifted to R[h];
+ * blk_num number of blocks that S[h] will be split into;
+ * s012 five shorts copied to s0num, s1num, s2num, s1bytes and s2bytes
+ * (leaf level only; may be NULL, then those fields are left alone);
+ * lb number of bytes which flow to the left neighbor from the item that
+ * is not shifted entirely (stored for the leaf level only);
+ * rb number of bytes which flow to the right neighbor from the item that
+ * is not shifted entirely (stored for the leaf level only).
+ */
+
+static void set_parameters (struct tree_balance * tb, int h, int lnum,
+ int rnum, int blk_num, short * s012, int lb, int rb)
+{
+ const int is_leaf_level = (h == 0);
+
+ tb->lnum[h] = lnum;
+ tb->rnum[h] = rnum;
+ tb->blknum[h] = blk_num;
+
+ /* split description exists for the leaf level only */
+ if (is_leaf_level && s012 != NULL) {
+ tb->s0num = s012[0];
+ tb->s1num = s012[1];
+ tb->s2num = s012[2];
+ tb->s1bytes = s012[3];
+ tb->s2bytes = s012[4];
+ }
+
+ if (is_leaf_level) {
+ tb->lbytes = lb;
+ tb->rbytes = rb;
+ }
+
+ PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum );
+ PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum );
+
+ PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb );
+ PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb );
+}
+
+
+
+/* Decide whether S[0] becomes empty once tb->lnum[0] items go to the left
+ neighbor and tb->rnum[0] items go to the right one. If so, the balance
+ parameters are stored with set_parameters() and 1 is returned; otherwise
+ 0 is returned and tb is left untouched. */
+static int is_leaf_removable (struct tree_balance * tb)
+{
+ struct virtual_node * vn = tb->tb_vn;
+ int whole_to_left = tb->lnum[0];
+ int whole_to_right = tb->rnum[0];
+ int staying;
+ int size;
+
+ /* lnum/rnum also count a partially shifted item; leave only the
+ items that move entirely */
+ if (tb->lbytes != -1)
+ whole_to_left --;
+ if (tb->rbytes != -1)
+ whole_to_right --;
+
+ /* how many items remain in S[0] after shiftings to neighbors */
+ staying = vn->vn_nr_item - (whole_to_left + whole_to_right);
+
+ if (staying < 1) {
+ /* all content of node can be shifted to neighbors */
+ set_parameters (tb, 0, whole_to_left, vn->vn_nr_item - whole_to_left, 0, NULL, -1, -1);
+ return 1;
+ }
+
+ /* more than one item stays, or one of the neighbors takes no part
+ of an item: S[0] is not removable */
+ if (staying != 1 || tb->lbytes == -1 || tb->rbytes == -1)
+ return 0;
+
+ /* exactly one item stays: can it be divided between the neighbors?
+ compare in item units */
+ size = op_unit_num (&(vn->vn_vi[whole_to_left]));
+ if (tb->lbytes + tb->rbytes < size)
+ return 0;
+
+ set_parameters (tb, 0, whole_to_left + 1, whole_to_right + 1, 0, NULL, tb->lbytes, -1);
+ return 1;
+}
+
+
+/* Check whether L[0], S[0] and R[0] can be joined in one node, given the
+ free space lfree/rfree of the neighbors. Item headers saved by merging
+ boundary items are taken into account. On success all balance
+ parameters of level 0 are set to -1 and 1 is returned, otherwise 0. */
+static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree)
+{
+ struct virtual_node * vn = tb->tb_vn;
+ struct buffer_head * S0 = PATH_H_PBUFFER (tb->tb_path, 0);
+ int ih_size = 0;
+
+ if (!vn->vn_nr_item) {
+ /* there was only one item and it will be deleted */
+ struct item_head * ih;
+
+ RFALSE( B_NR_ITEMS (S0) != 1,
+ "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0));
+
+ ih = B_N_PITEM_HEAD (S0, 0);
+ if (tb->CFR[0] &&
+ !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])) &&
+ is_direntry_le_ih (ih)) {
+ /* The directory must be in a correct state here, i.e. its
+ first item exists somewhere to the left. The deleted item
+ can not be that first one, because its right neighbor
+ belongs to the same directory (and the first item is always
+ deleted last). Hence the neighbors of the deleted item can
+ be merged and one item header is saved */
+ ih_size = IH_SIZE;
+
+ /* we might check that left neighbor exists and is of the
+ same directory */
+ RFALSE(le_ih_k_offset (ih) == DOT_OFFSET,
+ "vs-8130: first directory item can not be removed until directory is not empty");
+ }
+ } else {
+ /* each boundary item that merges with a neighbor saves a header */
+ if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
+ ih_size += IH_SIZE;
+
+ if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE)
+ ih_size += IH_SIZE;
+ }
+
+ if (MAX_CHILD_SIZE (S0) + vn->vn_size > rfree + lfree + ih_size)
+ return 0;
+
+ set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1);
+ PROC_INFO_INC( tb -> tb_sb, leaves_removable );
+ return 1;
+
+}
+
+
+
+/* Statement macro: store the balance parameters for a shift to the left
+ neighbor via set_parameters(). It is not self-contained - it expands in
+ place and uses the names h, Sh, lpar, vn, tb, lnver, lset and snum012
+ from the scope where it is used.
+ Internal level (h != 0): the number of items to shift (to_l) is computed
+ from MAX_NR_KEY(Sh), lpar and vn->vn_nr_item.
+ Leaf level: with lset == LEFT_SHIFT_FLOW lpar and tb->lbytes are stored
+ as they are; otherwise items are not split - when we do not split item,
+ lnum and rnum are numbers of entire items, so a partially shifted item
+ is subtracted from lpar and lbytes is stored as -1. */
+#define SET_PAR_SHIFT_LEFT \
+if (h)\
+{\
+ int to_l;\
+ \
+ to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
+ (MAX_NR_KEY(Sh) + 1 - lpar);\
+ \
+ set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
+}\
+else \
+{\
+ if (lset==LEFT_SHIFT_FLOW)\
+ set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
+ tb->lbytes, -1);\
+ else\
+ set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
+ -1, -1);\
+}
+
+
+/* Statement macro: store the balance parameters for a shift to the right
+ neighbor via set_parameters(). It expands in place and uses the names h,
+ Sh, rpar, vn, tb, rnver, rset and snum012 from the scope where it is
+ used.
+ Internal level (h != 0): the number of items to shift (to_r) is computed
+ from MAX_NR_KEY(Sh), rpar and vn->vn_nr_item.
+ Leaf level: with rset == RIGHT_SHIFT_FLOW rpar and tb->rbytes are stored
+ as they are; otherwise a partially shifted item is subtracted from rpar
+ and rbytes is stored as -1. */
+#define SET_PAR_SHIFT_RIGHT \
+if (h)\
+{\
+ int to_r;\
+ \
+ to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
+ \
+ set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
+}\
+else \
+{\
+ if (rset==RIGHT_SHIFT_FLOW)\
+ set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
+ -1, tb->rbytes);\
+ else\
+ set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
+ -1, -1);\
+}
+
+
+/* Drop the buffer references held by a tree_balance: the buffers of
+ tb_path first, then for every level the L, R, FL, FR, CFL and CFR
+ buffers. Each of those slots is reset to NULL after its count has been
+ decremented. */
+static void free_buffers_in_tb (
+ struct tree_balance * p_s_tb
+ ) {
+ int n_level = 0;
+
+ decrement_counters_in_path(p_s_tb->tb_path);
+
+ while ( n_level < MAX_HEIGHT ) {
+ /* neighbors */
+ decrement_bcount(p_s_tb->L[n_level]);
+ p_s_tb->L[n_level] = NULL;
+ decrement_bcount(p_s_tb->R[n_level]);
+ p_s_tb->R[n_level] = NULL;
+
+ /* parents of the neighbors */
+ decrement_bcount(p_s_tb->FL[n_level]);
+ p_s_tb->FL[n_level] = NULL;
+ decrement_bcount(p_s_tb->FR[n_level]);
+ p_s_tb->FR[n_level] = NULL;
+
+ /* common parents */
+ decrement_bcount(p_s_tb->CFL[n_level]);
+ p_s_tb->CFL[n_level] = NULL;
+ decrement_bcount(p_s_tb->CFR[n_level]);
+ p_s_tb->CFR[n_level] = NULL;
+
+ n_level++;
+ }
+}
+
+
+/* Get new buffers for storing new nodes that are created while balancing.
+ * The buffers are appended to p_s_tb->FEB and p_s_tb->cur_blknum is
+ * advanced for each of them.
+ * Returns: CARRY_ON - enough blocks were already acquired, or the new
+ * ones were obtained and FILESYSTEM_CHANGED_TB()
+ * is false;
+ * REPEAT_SEARCH - new blocks were obtained but
+ * FILESYSTEM_CHANGED_TB() is true afterwards;
+ * NO_DISK_SPACE - reiserfs_new_form_blocknrs() reported no disk space.
+ */
+/* The function is NOT SCHEDULE-SAFE! */
+static int get_empty_nodes(
+ struct tree_balance * p_s_tb,
+ int n_h
+ ) {
+ struct buffer_head * p_s_new_bh,
+ * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h);
+ b_blocknr_t * p_n_blocknr,
+ a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, };
+ int n_counter,
+ n_number_of_freeblk,
+ n_amount_needed,/* number of needed empty blocks */
+ n_retval = CARRY_ON;
+ struct super_block * p_s_sb = p_s_tb->tb_sb;
+
+
+ /* number_of_freeblk is the number of empty blocks which have been
+ acquired for use by the balancing algorithm minus the number of
+ empty blocks used in the previous levels of the analysis,
+ number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
+ after empty blocks are acquired, and the balancing analysis is
+ then restarted, amount_needed is the number needed by this level
+ (n_h) of the balancing analysis.
+
+ Note that for systems with many processes writing, it would be
+ more layout optimal to calculate the total number needed by all
+ levels and then to run reiserfs_new_blocks to get all of them at once. */
+
+ /* Initiate number_of_freeblk to the amount acquired prior to the restart of
+ the analysis or 0 if not restarted, then subtract the amount needed
+ by all of the levels of the tree below n_h. */
+ /* blknum includes S[n_h], so we subtract 1 in this calculation */
+ for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ )
+ n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0;
+
+ /* Allocate missing empty blocks. */
+ /* if p_s_Sh == 0 then we are getting a new root */
+ n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1;
+ /* Amount_needed = the amount that we need more than the amount that we have. */
+ if ( n_amount_needed > n_number_of_freeblk )
+ n_amount_needed -= n_number_of_freeblk;
+ else /* If we have enough already then there is nothing to do. */
+ return CARRY_ON;
+
+ /* No need to check quota - is not allocated for blocks used for formatted nodes */
+ /* NOTE(review): n_amount_needed is not compared with MAX_AMOUNT_NEEDED,
+ the size of a_n_blocknrs, anywhere in this function - presumably the
+ analysis guarantees the bound; confirm */
+ if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
+ n_amount_needed) == NO_DISK_SPACE)
+ return NO_DISK_SPACE;
+
+ /* for each blocknumber we just got, get a buffer and stick it on FEB */
+ for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed;
+ p_n_blocknr++, n_counter++ ) {
+
+ RFALSE( ! *p_n_blocknr,
+ "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
+
+ p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
+ RFALSE (buffer_dirty (p_s_new_bh) ||
+ buffer_journaled (p_s_new_bh) ||
+ buffer_journal_dirty (p_s_new_bh),
+ "PAP-8140: journlaled or dirty buffer %b for the new block",
+ p_s_new_bh);
+
+ /* Put empty buffers into the array. */
+ RFALSE (p_s_tb->FEB[p_s_tb->cur_blknum],
+ "PAP-8141: busy slot for new buffer");
+
+ set_buffer_journal_new (p_s_new_bh);
+ p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
+ }
+
+ /* n_retval is still CARRY_ON here; ask for a repeated search if the
+ filesystem changed while the blocks were being obtained */
+ if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) )
+ n_retval = REPEAT_SEARCH ;
+
+ return n_retval;
+}
+
+
+/* Free space of the left neighbor of S[h], computed from the child entry
+ * stored in the parent node of that neighbor. Returns 0 when S[h] has no
+ * parent in the path or tb->FL[h] is not set. */
+static int get_lfree (struct tree_balance * tb, int h)
+{
+ struct buffer_head * parent, * left_father;
+ int pos;
+
+ parent = PATH_H_PPARENT (tb->tb_path, h);
+ if (parent == 0)
+ return 0;
+
+ left_father = tb->FL[h];
+ if (left_father == 0)
+ return 0;
+
+ if (parent == left_father)
+ /* same parent: the neighbor is the child just before S[h] */
+ pos = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1;
+ else
+ /* different parent: the neighbor is its last child */
+ pos = B_NR_ITEMS (left_father);
+
+ return (MAX_CHILD_SIZE(left_father) - dc_size(B_N_CHILD(left_father,pos)));
+}
+
+
+/* Free space of the right neighbor of S[h], computed from the child entry
+ * stored in the parent node of that neighbor. Returns 0 when S[h] has no
+ * parent in the path or tb->FR[h] is not set.
+ */
+static int get_rfree (struct tree_balance * tb, int h)
+{
+ struct buffer_head * parent, * right_father;
+ int pos;
+
+ parent = PATH_H_PPARENT (tb->tb_path, h);
+ if (parent == 0)
+ return 0;
+
+ right_father = tb->FR[h];
+ if (right_father == 0)
+ return 0;
+
+ if (parent == right_father)
+ /* same parent: the neighbor is the child just after S[h] */
+ pos = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1;
+ else
+ /* different parent: the neighbor is its first child */
+ pos = 0;
+
+ return (MAX_CHILD_SIZE(right_father) - dc_size( B_N_CHILD(right_father,pos)));
+
+}
+
+
+/* Check whether the left neighbor of S[n_h] is in memory: its block number
+ is read from FL[n_h] and looked up with sb_find_get_block(). Returns 1
+ if a buffer was found (the reference obtained by the lookup is dropped
+ again), 0 if FL[n_h] is not set or nothing was found. */
+static int is_left_neighbor_in_cache(
+ struct tree_balance * p_s_tb,
+ int n_h
+ ) {
+ struct buffer_head * p_s_father, * p_s_left_father, * p_s_left;
+ b_blocknr_t n_blocknr;
+ int n_pos;
+
+ p_s_left_father = p_s_tb->FL[n_h];
+ if ( ! p_s_left_father ) /* Father of the left neighbor does not exist. */
+ return 0;
+
+ /* Calculate father of the node to be balanced. */
+ p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
+
+ RFALSE( ! p_s_father ||
+ ! B_IS_IN_TREE (p_s_father) ||
+ ! B_IS_IN_TREE (p_s_tb->FL[n_h]) ||
+ ! buffer_uptodate (p_s_father) ||
+ ! buffer_uptodate (p_s_tb->FL[n_h]),
+ "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
+ p_s_father, p_s_tb->FL[n_h]);
+
+ /* Get position of the pointer to the left neighbor into the left father. */
+ if ( p_s_father == p_s_left_father )
+ n_pos = p_s_tb->lkey[n_h];
+ else
+ n_pos = B_NR_ITEMS (p_s_left_father);
+
+ /* Get left neighbor block number and look for it in the cache. */
+ n_blocknr = B_N_CHILD_NUM(p_s_left_father, n_pos);
+ p_s_left = sb_find_get_block(p_s_tb->tb_sb, n_blocknr);
+ if ( ! p_s_left )
+ return 0;
+
+ RFALSE( buffer_uptodate (p_s_left) && ! B_IS_IN_TREE(p_s_left),
+ "vs-8170: left neighbor (%b %z) is not in the tree", p_s_left, p_s_left);
+ put_bh(p_s_left) ;
+ return 1;
+}
+
+
+/* Values of the c_lr_par argument of get_far_parent(): which side's
+ parents (left or right neighbor) are to be found. */
+#define LEFT_PARENTS 'l'
+#define RIGHT_PARENTS 'r'
+
+
+/* Decrement a cpu key in place by dispatching to the decrement_key method
+ of the item_ops entry selected by the key's type. */
+static void decrement_key (struct cpu_key * p_s_key)
+{
+ // call item specific function for this key
+ item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key);
+}
+
+
+
+
+/* Calculate far left/right parent of the left/right neighbor of the current node, that
+ * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
+ * Calculate left/right common parent of the current node and L[h]/R[h].
+ * Calculate left/right delimiting key position.
+ *
+ * On CARRY_ON *pp_s_father and *pp_s_com_father are either both NULL (the
+ * path starts at the root and there is no common parent) or both hold a
+ * buffer reference that the caller owns. On every other return value no
+ * reference is left with the caller.
+ *
+ * Returns: REPEAT_SEARCH - the path is not correct any more or the
+ * filesystem changed while the function worked;
+ * IO_ERROR - search_by_key() failed;
+ * CARRY_ON - the parents were found.
+ */
+static int get_far_parent (struct tree_balance * p_s_tb,
+ int n_h,
+ struct buffer_head ** pp_s_father,
+ struct buffer_head ** pp_s_com_father,
+ char c_lr_par)
+{
+ struct buffer_head * p_s_parent;
+ INITIALIZE_PATH (s_path_to_neighbor_father);
+ struct path * p_s_path = p_s_tb->tb_path;
+ struct cpu_key s_lr_father_key;
+ int n_counter,
+ n_position = INT_MAX,
+ n_first_last_position = 0,
+ n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
+
+ /* Starting from F[n_h] go upwards in the tree, and look for the common
+ ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
+
+ n_counter = n_path_offset;
+
+ RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET,
+ "PAP-8180: invalid path length");
+
+
+ for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) {
+ /* Check whether parent of the current buffer in the path is really parent in the tree. */
+ if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) )
+ return REPEAT_SEARCH;
+ /* Check whether position in the parent is correct. */
+ if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) )
+ return REPEAT_SEARCH;
+ /* Check whether parent at the path really points to the child. */
+ if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
+ PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr )
+ return REPEAT_SEARCH;
+ /* Return delimiting key if position in the parent is not equal to first/last one. */
+ if ( c_lr_par == RIGHT_PARENTS )
+ n_first_last_position = B_NR_ITEMS (p_s_parent);
+ if ( n_position != n_first_last_position ) {
+ /* common parent found: take a reference on it */
+ *pp_s_com_father = p_s_parent;
+ get_bh(*pp_s_com_father) ;
+ /*(*pp_s_com_father = p_s_parent)->b_count++;*/
+ break;
+ }
+ }
+
+ /* if we are in the root of the tree, then there is no common father */
+ if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) {
+ /* Check whether first buffer in the path is the root of the tree. */
+ if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
+ SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
+ *pp_s_father = *pp_s_com_father = NULL;
+ return CARRY_ON;
+ }
+ return REPEAT_SEARCH;
+ }
+
+ RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
+ "PAP-8185: (%b %z) level too small",
+ *pp_s_com_father, *pp_s_com_father);
+
+ /* Check whether the common parent is locked. */
+
+ if ( buffer_locked (*pp_s_com_father) ) {
+ __wait_on_buffer(*pp_s_com_father);
+ if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
+ decrement_bcount(*pp_s_com_father);
+ return REPEAT_SEARCH;
+ }
+ }
+
+ /* So, we got common parent of the current node and its left/right neighbor.
+ Now we are getting the parent of the left/right neighbor. */
+
+ /* Form key to get parent of the left/right neighbor. */
+ le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ?
+ (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position)));
+
+
+ if ( c_lr_par == LEFT_PARENTS )
+ decrement_key(&s_lr_father_key);
+
+ if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR) {
+ // path is released, but the reference taken on the common parent
+ // above is still held: drop it like the REPEAT_SEARCH exits do
+ decrement_bcount(*pp_s_com_father);
+ return IO_ERROR;
+ }
+
+ if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
+ decrement_counters_in_path(&s_path_to_neighbor_father);
+ decrement_bcount(*pp_s_com_father);
+ return REPEAT_SEARCH;
+ }
+
+ *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
+
+ RFALSE( B_LEVEL (*pp_s_father) != n_h + 1,
+ "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
+ RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET,
+ "PAP-8192: path length is too small");
+
+ /* shorten the path by one so that releasing it keeps the reference on
+ its last buffer, which is handed out through *pp_s_father */
+ s_path_to_neighbor_father.path_length--;
+ decrement_counters_in_path(&s_path_to_neighbor_father);
+ return CARRY_ON;
+}
+
+
+/* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
+ * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
+ * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
+ * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
+ *
+ * The results are stored in p_s_tb->FL[n_h], CFL[n_h], FR[n_h] and CFR[n_h]; whatever was
+ * held in a slot before is released with decrement_bcount() just before the slot is overwritten.
+ * When the neighbor hangs off the same direct parent as the current node, that parent is used
+ * both as F and as CF and therefore receives two extra references (get_bh() is called twice);
+ * otherwise get_far_parent() is asked to find both buffers.
+ *
+ * Returns: CARRY_ON - the parents were obtained, or the node is (or will be) the root and
+ * all four slots were released and set to NULL;
+ * any other value - the non-CARRY_ON result of get_far_parent(), passed through unchanged
+ * (the visible part of that function returns REPEAT_SEARCH or IO_ERROR). On such an early
+ * return the slots not yet processed keep their previous contents.
+ */
+static int get_parents (struct tree_balance * p_s_tb, int n_h)
+{
+ struct path * p_s_path = p_s_tb->tb_path;
+ int n_position,
+ n_ret_value,
+ n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
+ struct buffer_head * p_s_curf,
+ * p_s_curcf;
+
+ /* Current node is the root of the tree or will be root of the tree */
+ if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
+ /* The root can not have parents.
+ Release nodes which previously were obtained as parents of the current node neighbors. */
+ decrement_bcount(p_s_tb->FL[n_h]);
+ decrement_bcount(p_s_tb->CFL[n_h]);
+ decrement_bcount(p_s_tb->FR[n_h]);
+ decrement_bcount(p_s_tb->CFR[n_h]);
+ p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL;
+ return CARRY_ON;
+ }
+
+ /* Get parent FL[n_path_offset] of L[n_path_offset].
+ n_position is the position recorded in the path element of the direct parent
+ (offset n_path_offset - 1); it is reused below for the right-hand side. */
+ if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) {
+ /* Current node is not the first child of its parent. */
+ /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
+ p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
+ get_bh(p_s_curf) ;
+ get_bh(p_s_curf) ;
+ p_s_tb->lkey[n_h] = n_position - 1;
+ }
+ else {
+ /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
+ Calculate current common parent of L[n_path_offset] and the current node. Note that
+ CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
+ Calculate lkey[n_path_offset]. */
+ if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
+ &p_s_curcf, LEFT_PARENTS)) != CARRY_ON )
+ return n_ret_value;
+ }
+
+ decrement_bcount(p_s_tb->FL[n_h]);
+ p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */
+ decrement_bcount(p_s_tb->CFL[n_h]);
+ p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */
+
+ RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
+ (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
+ "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
+
+/* Get parent FR[n_h] of R[n_h]. */
+
+/* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
+ if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) {
+/* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
+ Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
+ not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
+ if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON )
+ return n_ret_value;
+ }
+ else {
+/* Current node is not the last child of its parent F[n_h]. */
+ /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
+ p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
+ get_bh(p_s_curf) ;
+ get_bh(p_s_curf) ;
+ p_s_tb->rkey[n_h] = n_position;
+ }
+
+ decrement_bcount(p_s_tb->FR[n_h]);
+ p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_h]. */
+
+ decrement_bcount(p_s_tb->CFR[n_h]);
+ p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_h]. */
+
+ RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
+ (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
+ "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
+
+ return CARRY_ON;
+}
+
+
+/* Decide whether node S[h] could disappear as a result of shifting its
+   contents into the neighbors; this is possible even when an item is being
+   inserted or pasted.
+     mode                - balancing mode; only compared with M_INSERT here;
+     lfree, sfree, rfree - free space in the left neighbor, in S[h] and in
+                           the right neighbor;
+     tb, h               - tree_balance structure and the level examined.
+   Returns NO_BALANCING_NEEDED (after calling set_parameters) when the node
+   can not be removed and the new data fits into S[h] without any shifting;
+   otherwise bumps the can_node_be_removed[h] counter and returns
+   !NO_BALANCING_NEEDED. */
+static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h)
+{
+    struct buffer_head * node = PATH_H_PBUFFER (tb->tb_path, h);
+    struct item_head * first_ih = B_N_PITEM_HEAD (node, 0);
+    struct reiserfs_key * right_key = tb->CFR[h] ? B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]) : NULL;
+    int insert_bytes = tb->insert_size[h];
+
+    /* First operand: the three nodes do not have enough free space for S[h]
+       to be emptied (shifting may merge items, which saves an item header).
+       Second operand: the new data fits into S[h] as it is. */
+    if ( (lfree + rfree + sfree < MAX_CHILD_SIZE(node) + insert_bytes
+          - (( ! h && op_is_left_mergeable (&(first_ih->ih_key), node->b_size) ) ? IH_SIZE : 0)
+          - (( ! h && right_key && op_is_left_mergeable (right_key, node->b_size) ) ? IH_SIZE : 0)
+          + (( h ) ? KEY_SIZE : 0))
+         && sfree >= insert_bytes ) {
+        if ( ! h )
+            tb->s0num = B_NR_ITEMS(node) + ((mode == M_INSERT ) ? 1 : 0);
+        set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
+        return NO_BALANCING_NEEDED;
+    }
+
+    PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] );
+    return !NO_BALANCING_NEEDED;
+}
+
+
+
+/* Check whether current node S[h] is balanced when increasing its size by
+ * Inserting or Pasting.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ * tb tree_balance structure;
+ * h current level of the node;
+ * The item number and the mode (insert or paste) are not arguments of this
+ * function; the mode is read from tb->tb_vn->vn_mode.
+ * Returns: 1 - schedule occurred;
+ * 0 - balancing for higher levels needed;
+ * -1 - no balancing for higher levels needed;
+ * -2 - no disk space.
+ * NOTE(review): the numeric legend above is historical and has not been
+ * checked against the constants. The body itself returns NO_BALANCING_NEEDED,
+ * CARRY_ON, the NO_DISK_SPACE / REPEAT_SEARCH result of get_empty_nodes(), or
+ * any non-CARRY_ON result of get_parents().
+ */
+/* ip means Inserting or Pasting */
+static int ip_check_balance (struct tree_balance * tb, int h)
+{
+ struct virtual_node * vn = tb->tb_vn;
+ int levbytes, /* Number of bytes that must be inserted into (value
+ is negative if bytes are deleted) buffer which
+ contains node being balanced. The mnemonic is
+ that the attempted change in node space used level
+ is levbytes bytes. */
+ n_ret_value;
+
+ int lfree, sfree, rfree /* free space in L, S and R */;
+
+ /* nver is short for number of vertices, and lnver is the number if
+ we shift to the left, rnver is the number if we shift to the
+ right, and lrnver is the number if we shift in both directions.
+ The goal is to minimize first the number of vertices, and second,
+ the number of vertices whose contents are changed by shifting,
+ and third the number of uncached vertices whose contents are
+ changed by shifting and must be read from disk. */
+ int nver, lnver, rnver, lrnver;
+
+ /* used at leaf level only, S0 = S[0] is the node being balanced,
+ sInum [ I = 0,1,2 ] is the number of items that will
+ remain in node SI after balancing. S1 and S2 are new
+ nodes that might be created. */
+
+ /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters.
+ where 4th parameter is s1bytes and 5th - s2bytes
+ */
+ short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases
+ (8 cases x 5 shorts each; the *_SHIFT_* offsets defined below index it)
+ 0,1 - do not shift and do not shift but bottle
+ 2 - shift only whole item to left
+ 3 - shift to left and bottle as much as possible
+ 4,5 - shift to right (whole items and as much as possible)
+ 6,7 - shift to both directions (whole items and as much as possible)
+ */
+
+ /* Sh is the node whose balance is currently being checked */
+ struct buffer_head * Sh;
+
+ Sh = PATH_H_PBUFFER (tb->tb_path, h);
+ levbytes = tb->insert_size[h];
+
+ /* Calculate balance parameters for creating new root. */
+ if ( ! Sh ) {
+ if ( ! h )
+ reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0");
+ switch ( n_ret_value = get_empty_nodes (tb, h) ) {
+ case CARRY_ON:
+ set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
+ return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
+
+ case NO_DISK_SPACE:
+ case REPEAT_SEARCH:
+ return n_ret_value;
+ default:
+ reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
+ }
+ }
+
+ if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */
+ return n_ret_value;
+
+ sfree = B_FREE_SPACE (Sh);
+
+ /* get free space of neighbors */
+ rfree = get_rfree (tb, h);
+ lfree = get_lfree (tb, h);
+
+ if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED)
+ /* and new item fits into node S[h] without any shifting */
+ return NO_BALANCING_NEEDED;
+
+ create_virtual_node (tb, h);
+
+ /*
+ determine maximal number of items we can shift to the left neighbor (in tb structure)
+ and the maximal number of bytes that can flow to the left neighbor
+ from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
+ */
+ check_left (tb, h, lfree);
+
+ /*
+ determine maximal number of items we can shift to the right neighbor (in tb structure)
+ and the maximal number of bytes that can flow to the right neighbor
+ from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
+ */
+ check_right (tb, h, rfree);
+
+
+ /* all contents of internal node S[h] can be moved into its
+ neighbors, S[h] will be removed after balancing */
+ if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
+ int to_r;
+
+ /* Since we are working on internal nodes, and our internal
+ nodes have fixed size entries, then we can balance by the
+ number of items rather than the space they consume. In this
+ routine we set the left node equal to the right node,
+ allowing a difference of less than or equal to 1 child
+ pointer. */
+ to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
+ (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
+ set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
+ return CARRY_ON;
+ }
+
+ /* this checks balance condition, that any two neighboring nodes can not fit in one node */
+ RFALSE( h &&
+ ( tb->lnum[h] >= vn->vn_nr_item + 1 ||
+ tb->rnum[h] >= vn->vn_nr_item + 1),
+ "vs-8220: tree is not balanced on internal level");
+ RFALSE( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
+ (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ),
+ "vs-8225: tree is not balanced on leaf level");
+
+ /* all contents of S[0] can be moved into its neighbors
+ S[0] will be removed after balancing. */
+ if (!h && is_leaf_removable (tb))
+ return CARRY_ON;
+
+
+ /* why do we perform this check here rather than earlier??
+ Answer: we can win 1 node in some cases above. Moreover we
+ checked it above, when we checked, that S[0] is not removable
+ in principle */
+ if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
+ if ( ! h )
+ tb->s0num = vn->vn_nr_item;
+ set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
+ return NO_BALANCING_NEEDED;
+ }
+
+
+ {
+ int lpar, rpar, nset, lset, rset, lrset;
+ /*
+ * regular overflowing of the node
+ */
+
+ /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
+ lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
+ nset, lset, rset, lrset - shows, whether flowing items give better packing
+ (each holds the snum012 offset of the variant chosen for that direction)
+ */
+#define FLOW 1
+#define NO_FLOW 0 /* do not do any splitting */
+
+ /* we choose one of the following (offsets into snum012, 5 shorts per case) */
+#define NOTHING_SHIFT_NO_FLOW 0
+#define NOTHING_SHIFT_FLOW 5
+#define LEFT_SHIFT_NO_FLOW 10
+#define LEFT_SHIFT_FLOW 15
+#define RIGHT_SHIFT_NO_FLOW 20
+#define RIGHT_SHIFT_FLOW 25
+#define LR_SHIFT_NO_FLOW 30
+#define LR_SHIFT_FLOW 35
+
+
+ lpar = tb->lnum[h];
+ rpar = tb->rnum[h];
+
+
+ /* calculate number of blocks S[h] must be split into when
+ nothing is shifted to the neighbors,
+ as well as number of items in each part of the split node (s012 numbers),
+ and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
+ nset = NOTHING_SHIFT_NO_FLOW;
+ nver = get_num_ver (vn->vn_mode, tb, h,
+ 0, -1, h?vn->vn_nr_item:0, -1,
+ snum012, NO_FLOW);
+
+ if (!h)
+ {
+ int nver1;
+
+ /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
+ nver1 = get_num_ver (vn->vn_mode, tb, h,
+ 0, -1, 0, -1,
+ snum012 + NOTHING_SHIFT_FLOW, FLOW);
+ if (nver > nver1)
+ nset = NOTHING_SHIFT_FLOW, nver = nver1;
+ }
+
+
+ /* calculate number of blocks S[h] must be split into when
+ l_shift_num first items and l_shift_bytes of the right most
+ liquid item to be shifted are shifted to the left neighbor,
+ as well as number of items in each part of the split node (s012 numbers),
+ and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ */
+ lset = LEFT_SHIFT_NO_FLOW;
+ lnver = get_num_ver (vn->vn_mode, tb, h,
+ lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1,
+ snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
+ if (!h)
+ {
+ int lnver1;
+
+ lnver1 = get_num_ver (vn->vn_mode, tb, h,
+ lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1,
+ snum012 + LEFT_SHIFT_FLOW, FLOW);
+ if (lnver > lnver1)
+ lset = LEFT_SHIFT_FLOW, lnver = lnver1;
+ }
+
+
+ /* calculate number of blocks S[h] must be split into when
+ r_shift_num first items and r_shift_bytes of the left most
+ liquid item to be shifted are shifted to the right neighbor,
+ as well as number of items in each part of the split node (s012 numbers),
+ and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ */
+ rset = RIGHT_SHIFT_NO_FLOW;
+ rnver = get_num_ver (vn->vn_mode, tb, h,
+ 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1,
+ snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
+ if (!h)
+ {
+ int rnver1;
+
+ rnver1 = get_num_ver (vn->vn_mode, tb, h,
+ 0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
+ snum012 + RIGHT_SHIFT_FLOW, FLOW);
+
+ if (rnver > rnver1)
+ rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
+ }
+
+
+ /* calculate number of blocks S[h] must be split into when
+ items are shifted in both directions,
+ as well as number of items in each part of the split node (s012 numbers),
+ and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ */
+ lrset = LR_SHIFT_NO_FLOW;
+ lrnver = get_num_ver (vn->vn_mode, tb, h,
+ lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1,
+ snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
+ if (!h)
+ {
+ int lrnver1;
+
+ lrnver1 = get_num_ver (vn->vn_mode, tb, h,
+ lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
+ snum012 + LR_SHIFT_FLOW, FLOW);
+ if (lrnver > lrnver1)
+ lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
+ }
+
+
+
+ /* Our general shifting strategy is:
+ 1) to minimize number of new nodes;
+ 2) to minimize number of neighbors involved in shifting;
+ 3) to minimize number of disk reads; */
+
+ /* we can win TWO or ONE nodes by shifting in both directions */
+ if (lrnver < lnver && lrnver < rnver)
+ {
+ RFALSE( h &&
+ (tb->lnum[h] != 1 ||
+ tb->rnum[h] != 1 ||
+ lrnver != 1 || rnver != 2 || lnver != 2 || h != 1),
+ "vs-8230: bad h");
+ if (lrset == LR_SHIFT_FLOW)
+ set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset,
+ tb->lbytes, tb->rbytes);
+ else
+ set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1),
+ tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1);
+
+ return CARRY_ON;
+ }
+
+ /* if shifting doesn't lead to better packing then don't shift */
+ if (nver == lrnver)
+ {
+ set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1);
+ return CARRY_ON;
+ }
+
+
+ /* now we know that for better packing shifting in only one
+ direction either to the left or to the right is required.
+ NOTE(review): SET_PAR_SHIFT_LEFT / SET_PAR_SHIFT_RIGHT are macros defined
+ outside this function; presumably they use the locals computed above
+ (lpar, lset, lnver, ...) - confirm before renaming any of them. */
+
+ /* if shifting to the left is better than shifting to the right */
+ if (lnver < rnver)
+ {
+ SET_PAR_SHIFT_LEFT;
+ return CARRY_ON;
+ }
+
+ /* if shifting to the right is better than shifting to the left */
+ if (lnver > rnver)
+ {
+ SET_PAR_SHIFT_RIGHT;
+ return CARRY_ON;
+ }
+
+
+ /* now shifting in either direction gives the same number
+ of nodes and we can make use of the cached neighbors */
+ if (is_left_neighbor_in_cache (tb,h))
+ {
+ SET_PAR_SHIFT_LEFT;
+ return CARRY_ON;
+ }
+
+ /* shift to the right regardless of whether the right neighbor is in cache or not */
+ SET_PAR_SHIFT_RIGHT;
+ return CARRY_ON;
+ }
+}
+
+
+/* Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Cutting for INTERNAL node of S+tree.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *   tb   tree_balance structure;
+ *   h    current level of the node.
+ * (The item number and the mode are not arguments of this function.)
+ * Returns:
+ *   NO_BALANCING_NEEDED - S[h] stays as it is (it is a non-empty root, or it
+ *                         still satisfies the balance condition and merging
+ *                         would not improve packing);
+ *   CARRY_ON            - parameters for merging with / borrowing from the
+ *                         neighbors, or for removing an empty root, were set;
+ *   any other value     - the non-CARRY_ON result of get_parents(), passed
+ *                         through unchanged.
+ *
+ * Note: Items of internal nodes have fixed size, so the balance condition for
+ * the internal part of S+tree is as for the B-trees.
+ */
+static int dc_check_balance_internal (struct tree_balance * tb, int h)
+{
+    struct virtual_node * vn = tb->tb_vn;
+
+    /* Sh is the node whose balance is currently being checked,
+       and Fh is its father. */
+    struct buffer_head * Sh, * Fh;
+    int n_ret_value;
+    int lfree, rfree /* free space in L and R */;
+
+    Sh = PATH_H_PBUFFER (tb->tb_path, h);
+    Fh = PATH_H_PPARENT (tb->tb_path, h);
+
+    /* using tb->insert_size[h], which is negative in this case,
+       create_virtual_node calculates new_nr_item = number of items the node
+       would have if the operation is performed without balancing */
+    create_virtual_node (tb, h);
+
+    if ( ! Fh ) {
+        /* S[h] is the root. */
+        if ( vn->vn_nr_item > 0 ) {
+            set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
+            return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
+        }
+        /* new_nr_item == 0.
+         * Current root will be deleted resulting in
+         * decrementing the tree height. */
+        set_parameters (tb, h, 0, 0, 0, NULL, -1, -1);
+        return CARRY_ON;
+    }
+
+    if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
+        return n_ret_value;
+
+    /* get free space of neighbors */
+    rfree = get_rfree (tb, h);
+    lfree = get_lfree (tb, h);
+
+    /* determine maximal number of items we can fit into neighbors */
+    check_left (tb, h, lfree);
+    check_right (tb, h, rfree);
+
+    if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) ) {
+        /* Balance condition for the internal node is valid.
+         * In this case we balance only if it leads to better packing. */
+        if ( vn->vn_nr_item == MIN_NR_KEY(Sh) ) {
+            /* Here we join S[h] with one of its neighbors,
+             * which is impossible with greater values of new_nr_item. */
+            if ( tb->lnum[h] >= vn->vn_nr_item + 1 ) {
+                /* All contents of S[h] can be moved to L[h]. */
+                int n;
+                int order_L;
+
+                /* position of L[h] in FL[h]: the last child when S[h] is the
+                   first child of its own parent, the previous one otherwise */
+                order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
+                n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
+                set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
+                return CARRY_ON;
+            }
+
+            if ( tb->rnum[h] >= vn->vn_nr_item + 1 ) {
+                /* All contents of S[h] can be moved to R[h]. */
+                int n;
+                int order_R;
+
+                /* position of R[h] in FR[h]: the first child when S[h] is the
+                   last child of its own parent, the next one otherwise */
+                order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1;
+                n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
+                set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
+                return CARRY_ON;
+            }
+        }
+
+        if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
+            /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
+            int to_r;
+
+            to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
+                (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
+            set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
+            return CARRY_ON;
+        }
+
+        /* Balancing does not lead to better packing. */
+        set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
+        return NO_BALANCING_NEEDED;
+    }
+
+    /* Current node contain insufficient number of items. Balancing is required. */
+    /* Check whether we can merge S[h] with left neighbor. */
+    if (tb->lnum[h] >= vn->vn_nr_item + 1) {
+        if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
+            int n;
+            int order_L;
+
+            order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
+            n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
+            set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
+            return CARRY_ON;
+        }
+    }
+
+    /* Check whether we can merge S[h] with right neighbor. */
+    if (tb->rnum[h] >= vn->vn_nr_item + 1) {
+        int n;
+        int order_R;
+
+        order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1);
+        n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
+        set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
+        return CARRY_ON;
+    }
+
+    /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
+    if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
+        int to_r;
+
+        to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
+            (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
+        set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
+        return CARRY_ON;
+    }
+
+    /* For internal nodes try to borrow item from a neighbor */
+    RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
+
+    /* Borrow one or two items from caching neighbor */
+    if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h]) {
+        int from_l;
+
+        from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1);
+        set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1);
+        return CARRY_ON;
+    }
+
+    set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1,
+                    NULL, -1, -1);
+    return CARRY_ON;
+}
+
+
+/* Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Truncating for LEAF node of S+tree.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ * tb tree_balance structure;
+ * h current level of the node (the buffers are taken from path level 0,
+ * so callers are expected to pass 0 here);
+ * The item number and the mode are not arguments of this function.
+ * Returns: 1 - schedule occurred;
+ * 0 - balancing for higher levels needed;
+ * -1 - no balancing for higher levels needed;
+ * -2 - no disk space.
+ * NOTE(review): the numeric legend above is historical and has not been
+ * checked against the constants. The body itself returns NO_BALANCING_NEEDED,
+ * CARRY_ON, or any non-CARRY_ON result of get_parents().
+ */
+static int dc_check_balance_leaf (struct tree_balance * tb, int h)
+{
+ struct virtual_node * vn = tb->tb_vn;
+
+ /* Number of bytes that must be deleted from
+ (value is negative if bytes are deleted) buffer which
+ contains node being balanced. The mnemonic is that the
+ attempted change in node space used level is levbytes bytes. */
+ int levbytes;
+ /* the maximal item size */
+ int maxsize,
+ n_ret_value;
+ /* S0 is the node whose balance is currently being checked,
+ and F0 is its father. */
+ struct buffer_head * S0, * F0;
+ int lfree, rfree /* free space in L and R */;
+
+ S0 = PATH_H_PBUFFER (tb->tb_path, 0);
+ F0 = PATH_H_PPARENT (tb->tb_path, 0);
+
+ levbytes = tb->insert_size[h];
+
+ maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */
+
+ if ( ! F0 )
+ { /* S[0] is the root now. */
+
+ /* levbytes and maxsize are only consulted by this debugging check */
+ RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0),
+ "vs-8240: attempt to create empty buffer tree");
+
+ set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
+ return NO_BALANCING_NEEDED;
+ }
+
+ if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
+ return n_ret_value;
+
+ /* get free space of neighbors */
+ rfree = get_rfree (tb, h);
+ lfree = get_lfree (tb, h);
+
+ create_virtual_node (tb, h);
+
+ /* if 3 leaves can be merged into one, set parameters and return */
+ if (are_leaves_removable (tb, lfree, rfree))
+ return CARRY_ON;
+
+ /* determine maximal number of items we can shift to the left/right neighbor
+ and the maximal number of bytes that can flow to the left/right neighbor
+ from the left/right most liquid item that cannot be shifted from S[0] entirely
+ */
+ check_left (tb, h, lfree);
+ check_right (tb, h, rfree);
+
+ /* check whether we can merge S with left neighbor. */
+ if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
+ if (is_left_neighbor_in_cache (tb,h) ||
+ ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
+ !tb->FR[h]) {
+
+ RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist");
+
+ /* set parameter to merge S[0] with its left neighbor */
+ set_parameters (tb, h, -1, 0, 0, NULL, -1, -1);
+ return CARRY_ON;
+ }
+
+ /* check whether we can merge S[0] with right neighbor. */
+ if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
+ set_parameters (tb, h, 0, -1, 0, NULL, -1, -1);
+ return CARRY_ON;
+ }
+
+ /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
+ if (is_leaf_removable (tb))
+ return CARRY_ON;
+
+ /* Balancing is not required. */
+ tb->s0num = vn->vn_nr_item;
+ set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
+ return NO_BALANCING_NEEDED;
+}
+
+
+
+/* Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Cutting, and calculate parameters for balancing level h.
+ * This is only a dispatcher between the leaf and the internal variant.
+ * Parameters:
+ *   tb   tree_balance structure;
+ *   h    current level of the node; 0 selects dc_check_balance_leaf(),
+ *        anything else selects dc_check_balance_internal().
+ * Returns the value produced by the selected helper.
+ */
+static int dc_check_balance (struct tree_balance * tb, int h)
+{
+    RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized");
+
+    /* leaves and internal nodes obey different balance rules */
+    return h ? dc_check_balance_internal (tb, h)
+             : dc_check_balance_leaf (tb, h);
+}
+
+
+
+/* Check whether current node S[h] is balanced.
+ * Calculate parameters for balancing for current level h.
+ *
+ * tb is a large structure that must be read about in the header file
+ * at the same time as this procedure if the reader is to successfully
+ * understand this procedure.
+ *
+ * Parameters:
+ *   mode         balancing mode, stored in the virtual node (vn_mode);
+ *   tb           tree_balance structure;
+ *   h            current level of the node;
+ *   inum         item number in S[h], stored as vn_affected_item_num;
+ *   pos_in_item  stored as vn_pos_in_item;
+ *   ins_ih       stored as vn_ins_ih; must not be 0 in M_INSERT mode
+ *                (checked by RFALSE);
+ *   data         stored as vn_data.
+ *
+ * The virtual node is placed at the start of tb->vn_buf and vn_free_ptr is
+ * pointed just past it.
+ *
+ * Returns the result of ip_check_balance() when tb->insert_size[h] is
+ * positive, and the result of dc_check_balance() otherwise.
+ */
+static int check_balance (int mode,
+                          struct tree_balance * tb,
+                          int h,
+                          int inum,
+                          int pos_in_item,
+                          struct item_head * ins_ih,
+                          const void * data
+                          )
+{
+    struct virtual_node * vn = (struct virtual_node *)(tb->vn_buf);
+
+    /* publish the virtual node and describe the pending operation in it */
+    tb->tb_vn = vn;
+    vn->vn_free_ptr = (char *)(vn + 1);
+    vn->vn_mode = mode;
+    vn->vn_affected_item_num = inum;
+    vn->vn_pos_in_item = pos_in_item;
+    vn->vn_ins_ih = ins_ih;
+    vn->vn_data = data;
+
+    RFALSE( mode == M_INSERT && !vn->vn_ins_ih,
+            "vs-8255: ins_ih can not be 0 in insert mode");
+
+    /* Size of the node does not grow: use the delete/cut rules. */
+    if ( tb->insert_size[h] <= 0 )
+        return dc_check_balance (tb, h);
+
+    /* Size of the node is increasing: use the insert/paste rules. */
+    return ip_check_balance (tb, h);
+}
+
+
+
+/* Check whether the parent recorded in the path really is the parent of the
+ * current node S[n_h].
+ * Returns CARRY_ON when the recorded parent is valid and unlocked (or when
+ * the node is the root / the new root and the root block has not changed; in
+ * that case the parent slot of the path is cleared), and REPEAT_SEARCH when
+ * the path is stale and must be recalculated.
+ */
+static int get_direct_parent(
+                             struct tree_balance * p_s_tb,
+                             int n_h
+                             ) {
+    struct path * p_s_path = p_s_tb->tb_path;
+    int n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
+    struct buffer_head * parent;
+    int pos;
+
+    /* We are in the root or in the new root. */
+    if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
+
+        RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
+                "PAP-8260: invalid offset in the path");
+
+        /* Root is changed and we must recalculate the path. */
+        if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr !=
+             SB_ROOT_BLOCK (p_s_tb->tb_sb) )
+            return REPEAT_SEARCH;
+
+        /* Root is not changed: it has no parent. */
+        PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
+        PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
+        return CARRY_ON;
+    }
+
+    /* Parent in the path is not in the tree. */
+    parent = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
+    if ( ! B_IS_IN_TREE(parent) )
+        return REPEAT_SEARCH;
+
+    /* Recorded position lies beyond the parent's items. */
+    pos = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1);
+    if ( pos > B_NR_ITEMS(parent) )
+        return REPEAT_SEARCH;
+
+    /* Parent in the path is not parent of the current node in the tree. */
+    if ( B_N_CHILD_NUM(parent, pos) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr )
+        return REPEAT_SEARCH;
+
+    /* Waiting for the buffer may let the tree change under us. */
+    if ( buffer_locked(parent) ) {
+        __wait_on_buffer(parent);
+        if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
+            return REPEAT_SEARCH;
+    }
+
+    /* Parent in the path is unlocked and really parent of the current node. */
+    return CARRY_ON;
+}
+
+
+/* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
+ * of S[n_h] we
+ * need in order to balance S[n_h], and get them if necessary.
+ *
+ * A neighbor is read with sb_bread() from the child pointer found in FL[n_h]
+ * (left) or FR[n_h] (right) and stored in p_s_tb->L[n_h] / p_s_tb->R[n_h];
+ * the buffer previously held there is released with decrement_bcount().
+ *
+ * Returns: CARRY_ON - the needed neighbors (if any) were obtained;
+ * IO_ERROR - sb_bread() returned NULL;
+ * REPEAT_SEARCH - FILESYSTEM_CHANGED_TB() reported a change after the read;
+ * the freshly read buffer is released first.
+ */
+static int get_neighbors(
+ struct tree_balance * p_s_tb,
+ int n_h
+ ) {
+ int n_child_position,
+ n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
+ unsigned long n_son_number;
+ struct super_block * p_s_sb = p_s_tb->tb_sb;
+ struct buffer_head * p_s_bh;
+
+
+ PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] );
+
+ if ( p_s_tb->lnum[n_h] ) {
+ /* We need left neighbor to balance S[n_h]. */
+ PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] );
+ p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
+
+ RFALSE( p_s_bh == p_s_tb->FL[n_h] &&
+ ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
+ "PAP-8270: invalid position in the parent");
+
+ /* if FL[n_h] is the buffer at n_path_offset in the path, the neighbor is the
+ child at lkey[n_h]; otherwise it is the last child of FL[n_h] */
+ n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
+ n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
+ p_s_bh = sb_bread(p_s_sb, n_son_number);
+ if (!p_s_bh)
+ return IO_ERROR;
+ if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
+ decrement_bcount(p_s_bh);
+ PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
+ return REPEAT_SEARCH;
+ }
+
+ RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
+ n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
+ B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
+ p_s_bh->b_blocknr, "PAP-8275: invalid parent");
+ RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
+ RFALSE( ! n_h &&
+ B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)),
+ "PAP-8290: invalid child size of left neighbor");
+
+ decrement_bcount(p_s_tb->L[n_h]);
+ p_s_tb->L[n_h] = p_s_bh;
+ }
+
+
+ if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_h]. */
+ PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] );
+ p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
+
+ RFALSE( p_s_bh == p_s_tb->FR[n_h] &&
+ PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh),
+ "PAP-8295: invalid position in the parent");
+
+ /* if FR[n_h] is the buffer at n_path_offset in the path, the neighbor is the
+ child just after rkey[n_h]; otherwise it is the first child of FR[n_h] */
+ n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0;
+ n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
+ p_s_bh = sb_bread(p_s_sb, n_son_number);
+ if (!p_s_bh)
+ return IO_ERROR;
+ if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
+ decrement_bcount(p_s_bh);
+ PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
+ return REPEAT_SEARCH;
+ }
+ decrement_bcount(p_s_tb->R[n_h]);
+ p_s_tb->R[n_h] = p_s_bh;
+
+ RFALSE( ! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)),
+ "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
+ B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh),
+ dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)));
+
+ }
+ return CARRY_ON;
+}
+
+#ifdef CONFIG_REISERFS_CHECK
+/*
+ * Debug-build (CONFIG_REISERFS_CHECK) wrapper around kmalloc() that also
+ * keeps a running total of the bytes handed out on behalf of super block s.
+ *
+ * size  - number of bytes to allocate
+ * flags - allocation flags, passed to kmalloc() unchanged
+ * s     - super block whose s_kmallocs counter is charged
+ *
+ * Returns whatever kmalloc() returned; nothing is accounted when that is
+ * NULL.  A warning is printed each time the total has grown by more than
+ * 200000 bytes since the total recorded at the previous warning.
+ */
+void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s)
+{
+	/* total recorded when the last warning was printed (a single static) */
+	static size_t malloced;
+	void * mem = kmalloc (size, flags);
+
+	if (!mem)
+		return mem;
+
+	REISERFS_SB(s)->s_kmallocs += size;
+	if (REISERFS_SB(s)->s_kmallocs <= malloced + 200000)
+		return mem;
+
+	reiserfs_warning (s,
+			  "vs-8301: reiserfs_kmalloc: allocated memory %d",
+			  REISERFS_SB(s)->s_kmallocs);
+	malloced = REISERFS_SB(s)->s_kmallocs;
+	return mem;
+}
+
+/*
+ * Counterpart of reiserfs_kmalloc(): releases vp with kfree() and takes
+ * size off the s_kmallocs total of super block s.  A warning is printed
+ * when the total drops below zero.
+ */
+void reiserfs_kfree (const void * vp, size_t size, struct super_block * s)
+{
+	kfree (vp);
+	REISERFS_SB(s)->s_kmallocs -= size;
+
+	if (REISERFS_SB(s)->s_kmallocs >= 0)
+		return;
+
+	reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d",
+			  REISERFS_SB(s)->s_kmallocs);
+}
+#endif
+
+
+/*
+ * Upper bound, in bytes, on the buffer needed for a virtual node of a block
+ * of sb->s_blocksize bytes.  Two layouts are considered and the larger one
+ * is taken:
+ *   - as many minimal items as fit into the block, one struct virtual_item
+ *     each;
+ *   - one item header followed by as many directory entries with
+ *     MIN_NAME_LEN-byte names as fit, which needs one struct virtual_item,
+ *     one struct direntry_uarea and one __u16 per entry beyond the first.
+ * sizeof(struct virtual_node) is added on top.  bh is not used.
+ */
+static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh)
+{
+#define MIN_NAME_LEN 1
+	unsigned long blocksize = sb->s_blocksize;
+	int max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
+	int max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
+		(DEH_SIZE + MIN_NAME_LEN);
+
+	return sizeof(struct virtual_node) +
+		max(max_num_of_items * sizeof (struct virtual_item),
+		    sizeof (struct virtual_item) + sizeof(struct direntry_uarea) +
+		    (max_num_of_entries - 1) * sizeof (__u16));
+}
+
+
+
+/* Make sure tb->vn_buf is big enough for the virtual node of the last
+ buffer of tb->tb_path, (re)allocating it when it is too small.
+
+ Returns CARRY_ON when the buffer is ready and nothing has to be redone.
+ Returns REPEAT_SEARCH when
+ - the atomic allocation failed: the resources held by tb are dropped
+ with free_buffers_in_tb(), a GFP_NOFS allocation is attempted and
+ schedule() is called; if that allocation failed as well, tb->vn_buf
+ is NULL and tb->vn_buf_size is 0 on return;
+ - an old buffer was freed and FILESYSTEM_CHANGED_TB (tb) reports a
+ change afterwards.
+
+ Maybe we should fail the balancing we are going to perform when kmalloc
+ fails several times. But for now it will loop until kmalloc gets the
+ required memory */
+static int get_mem_for_virtual_node (struct tree_balance * tb)
+{
+ /* set once an old buffer has been freed; makes the final
+ FILESYSTEM_CHANGED_TB check necessary */
+ int check_fs = 0;
+ int size;
+ char * buf;
+
+ size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path));
+
+ if (size > tb->vn_buf_size) {
+ /* we have to allocate more memory for virtual node */
+ if (tb->vn_buf) {
+ /* free memory allocated before */
+ reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
+ /* this is not needed if kfree is atomic */
+ check_fs = 1;
+ }
+
+ /* virtual node requires now more memory */
+ tb->vn_buf_size = size;
+
+ /* get memory for virtual item; first try without sleeping */
+ buf = reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, tb->tb_sb);
+ if ( ! buf ) {
+ /* getting memory with GFP_KERNEL priority may involve
+ balancing now (due to indirect_to_direct conversion on
+ dcache shrinking). So, release path and collected
+ resources here.
+ NOTE(review): this comment speaks of GFP_KERNEL but the
+ call below passes GFP_NOFS */
+ free_buffers_in_tb (tb);
+ buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
+ if ( !buf ) {
+#ifdef CONFIG_REISERFS_CHECK
+ reiserfs_warning (tb->tb_sb,
+ "vs-8345: get_mem_for_virtual_node: "
+ "kmalloc failed. reiserfs kmalloced %d bytes",
+ REISERFS_SB(tb->tb_sb)->s_kmallocs);
+#endif
+ /* no buffer at all: record that so the next call allocates again */
+ tb->vn_buf_size = 0;
+ }
+ /* buf may be NULL here; either way the collected resources are
+ gone, so the caller has to start over */
+ tb->vn_buf = buf;
+ schedule() ;
+ return REPEAT_SEARCH;
+ }
+
+ tb->vn_buf = buf;
+ }
+
+ if ( check_fs && FILESYSTEM_CHANGED_TB (tb) )
+ return REPEAT_SEARCH;
+
+ return CARRY_ON;
+}
+
+
+#ifdef CONFIG_REISERFS_CHECK
+/*
+ * Sanity checks for a buffer that takes part in balancing.
+ *
+ * p_s_sb - super block the buffer has to belong to
+ * p_s_bh - buffer to check; NULL is accepted and skipped
+ * descr  - name of the tree_balance slot ("S", "L", "FL", ...), used in
+ *          the panic message only
+ * level  - index of that slot, used in the panic message only
+ *
+ * Calls reiserfs_panic() when the buffer has a non-positive reference
+ * count, is not up to date, is not in the tree, belongs to another device,
+ * has a size different from the filesystem block size, or carries a block
+ * number outside the filesystem.
+ */
+static void tb_buffer_sanity_check (struct super_block * p_s_sb,
+				    struct buffer_head * p_s_bh,
+				    const char *descr, int level)
+{
+	if (!p_s_bh)
+		return;
+
+	if (atomic_read (&(p_s_bh->b_count)) <= 0) {
+		reiserfs_panic (p_s_sb, "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
+	}
+
+	if ( ! buffer_uptodate (p_s_bh) ) {
+		reiserfs_panic (p_s_sb, "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh);
+	}
+
+	if ( ! B_IS_IN_TREE (p_s_bh) ) {
+		reiserfs_panic (p_s_sb, "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh);
+	}
+
+	if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
+		reiserfs_panic (p_s_sb, "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", descr, level, p_s_bh);
+	}
+
+	if (p_s_bh->b_size != p_s_sb->s_blocksize) {
+		reiserfs_panic (p_s_sb, "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", descr, level, p_s_bh);
+	}
+
+	/* block numbers count from 0, so a number equal to the block count is
+	   already past the end of the filesystem */
+	if (p_s_bh->b_blocknr >= SB_BLOCK_COUNT(p_s_sb)) {
+		reiserfs_panic (p_s_sb, "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", descr, level, p_s_bh);
+	}
+}
+#else
+/* without CONFIG_REISERFS_CHECK the sanity check is a no-op */
+static void tb_buffer_sanity_check (struct super_block * p_s_sb,
+				    struct buffer_head * p_s_bh,
+				    const char *descr, int level)
+{;}
+#endif
+
+/*
+ * Hand bh to reiserfs_prepare_for_journal() with 0 as the last argument
+ * and pass its result back.  wait_tb_buffers_until_unlocked() treats a
+ * zero result as "this buffer is still locked".
+ */
+static int clear_all_dirty_bits(struct super_block *s,
+				struct buffer_head *bh)
+{
+	int prepared;
+
+	prepared = reiserfs_prepare_for_journal(s, bh, 0);
+	return prepared;
+}
+
+/*
+ * Prepare every buffer collected in p_s_tb for journaling and wait until
+ * none of them is locked.
+ *
+ * Each pass walks, in this order, the path buffers, then for every level
+ * with a non-zero insert_size the left (L, FL, CFL) and right (R, FR, CFR)
+ * neighbors that are going to be used, and finally the FEB list, calling
+ * clear_all_dirty_bits() on each.  The first buffer for which that call
+ * returns 0 is waited on and the whole pass is started again.
+ *
+ * Returns CARRY_ON once a pass finds no locked buffer, or REPEAT_SEARCH if
+ * FILESYSTEM_CHANGED_TB() reports a change after waiting.  With
+ * CONFIG_REISERFS_CHECK, every 10000th wait prints a warning and gives up
+ * instead of waiting: REPEAT_SEARCH if the filesystem changed, CARRY_ON
+ * otherwise.
+ */
+static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
+{
+	struct buffer_head * locked;
+#ifdef CONFIG_REISERFS_CHECK
+	int repeat_counter = 0;
+#endif
+	int i;
+
+	do {
+
+		locked = NULL;
+
+		/* buffers of the path, from the last element towards the root */
+		for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) {
+			if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) {
+				/* if I understand correctly, we can only be sure the last buffer
+				** in the path is in the tree --clm
+				*/
+#ifdef CONFIG_REISERFS_CHECK
+				if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
+				    PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
+					tb_buffer_sanity_check (p_s_tb->tb_sb,
+								PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i),
+								"S",
+								p_s_tb->tb_path->path_length - i);
+				}
+#endif
+				if (!clear_all_dirty_bits(p_s_tb->tb_sb,
+							  PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
+				{
+					locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
+				}
+			}
+		}
+
+		/* neighbors and their parents on every level that gets balanced */
+		for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) {
+
+			if (p_s_tb->lnum[i] ) {
+
+				if ( p_s_tb->L[i] ) {
+					tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
+					if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
+						locked = p_s_tb->L[i];
+				}
+
+				if ( !locked && p_s_tb->FL[i] ) {
+					tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
+					if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]))
+						locked = p_s_tb->FL[i];
+				}
+
+				if ( !locked && p_s_tb->CFL[i] ) {
+					tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
+					if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]))
+						locked = p_s_tb->CFL[i];
+				}
+
+			}
+
+			if ( !locked && (p_s_tb->rnum[i]) ) {
+
+				if ( p_s_tb->R[i] ) {
+					tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
+					if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]))
+						locked = p_s_tb->R[i];
+				}
+
+
+				if ( !locked && p_s_tb->FR[i] ) {
+					tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
+					if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]))
+						locked = p_s_tb->FR[i];
+				}
+
+				if ( !locked && p_s_tb->CFR[i] ) {
+					tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
+					if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]))
+						locked = p_s_tb->CFR[i];
+				}
+			}
+		}
+		/* as far as I can tell, this is not required.  The FEB list seems
+		** to be full of newly allocated nodes, which will never be locked,
+		** dirty, or anything else.
+		** To be safe, I'm putting in the checks and waits in.  For the moment,
+		** they are needed to keep the code in journal.c from complaining
+		** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
+		** --clm
+		*/
+		for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) {
+			if ( p_s_tb->FEB[i] ) {
+				if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]))
+					locked = p_s_tb->FEB[i] ;
+			}
+		}
+
+		if (locked) {
+#ifdef CONFIG_REISERFS_CHECK
+			repeat_counter++;
+			if ( (repeat_counter % 10000) == 0) {
+				/* the message names this function; it used to print
+				   a function name that does not exist */
+				reiserfs_warning (p_s_tb->tb_sb,
+						  "wait_tb_buffers_until_unlocked(): too many "
+						  "iterations waiting for buffer to unlock "
+						  "(%b)", locked);
+
+				/* Don't loop forever.  Try to recover from possible error. */
+
+				return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON;
+			}
+#endif
+			__wait_on_buffer (locked);
+			if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
+				return REPEAT_SEARCH;
+			}
+		}
+
+	} while (locked);
+
+	return CARRY_ON;
+}
+
+
+/* Prepare for balancing, that is
+ * get all necessary parents, and neighbors;
+ * analyze what and where should be moved;
+ * get sufficient number of new nodes;
+ * Balancing will start only after all resources have been collected.
+ *
+ * When ported to SMP kernels, only at the last moment after all needed nodes
+ * are collected in cache, will the resources be locked using the usual
+ * textbook ordered lock acquisition algorithms. Note that ensuring that
+ * this code neither write locks what it does not need to write lock nor locks out of order
+ * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans
+ *
+ * "fix" is meant in the sense of "render unchanging"
+ *
+ * Latency might be improved by first gathering a list of what buffers are needed
+ * and then getting as many of them in parallel as possible? -Hans
+ *
+ * Parameters:
+ * n_op_mode M_INSERT, M_DELETE, M_CUT or M_PASTE (described in the original
+ * comment as i - insert, d - delete, c - cut (truncate),
+ * p - paste (append));
+ * p_s_tb tree_balance structure; the item number in S[0] and the
+ * position in the item are taken from p_s_tb->tb_path;
+ * p_s_ins_ih item head of the item being inserted;
+ * data inserted item or data to be pasted.
+ * Returns: CARRY_ON - all resources are collected and prepared;
+ * REPEAT_SEARCH - the filesystem changed, or memory for the virtual
+ * node was not available at once, while the function
+ * worked;
+ * any other value returned by one of the helpers called in the
+ * level loop (the comments below mention lack of disk space and
+ * i/o failure) is passed through unchanged.
+ * The early REPEAT_SEARCH returns before the level loop release nothing
+ * themselves; every failure after that point goes through the `repeat'
+ * cleanup, which releases the path and the collected buffers but keeps the
+ * newly allocated nodes (FEB).
+ */
+
+
+int fix_nodes (int n_op_mode,
+ struct tree_balance * p_s_tb,
+ struct item_head * p_s_ins_ih, // item head of item being inserted
+ const void * data // inserted item or data to be pasted
+ ) {
+ int n_ret_value,
+ n_h,
+ n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
+ int n_pos_in_item;
+
+ /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
+ ** during wait_tb_buffers_until_unlocked
+ */
+ int wait_tb_buffers_run = 0 ;
+ struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
+
+ /* count the calls of fix_nodes for this filesystem */
+ ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes;
+
+ n_pos_in_item = p_s_tb->tb_path->pos_in_item;
+
+
+ /* remember the generation; FILESYSTEM_CHANGED_TB is used below to notice changes */
+ p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb);
+
+ /* we prepare and log the super here so it will already be in the
+ ** transaction when do_balance needs to change it.
+ ** This way do_balance won't have to schedule when trying to prepare
+ ** the super for logging
+ */
+ reiserfs_prepare_for_journal(p_s_tb->tb_sb,
+ SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ;
+ journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
+ SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ;
+ if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
+ return REPEAT_SEARCH;
+
+ /* S[0] may still be locked (presumably possible during
+ indirect_to_direct conversion); wait for it */
+ if (buffer_locked (p_s_tbS0)) {
+ __wait_on_buffer (p_s_tbS0);
+ if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
+ return REPEAT_SEARCH;
+ }
+
+#ifdef CONFIG_REISERFS_CHECK
+ if ( cur_tb ) {
+ print_cur_tb ("fix_nodes");
+ reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance");
+ }
+
+ if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) {
+ reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
+ "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode);
+ }
+
+ /* Check parameters. */
+ switch (n_op_mode) {
+ case M_INSERT:
+ /* an insert position may be one past the last item, but not 0 */
+ if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) )
+ reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
+ n_item_num, B_NR_ITEMS(p_s_tbS0));
+ break;
+ case M_PASTE:
+ case M_DELETE:
+ case M_CUT:
+ /* these modes work on an existing item */
+ if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) {
+ print_block (p_s_tbS0, 0, -1, -1);
+ reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", n_item_num, n_op_mode, p_s_tb->insert_size[0]);
+ }
+ break;
+ default:
+ reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation");
+ }
+#endif
+
+ if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH)
+ // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
+ return REPEAT_SEARCH;
+
+
+ /* Starting from the leaf level; for all levels n_h of the tree. */
+ for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) {
+ if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) {
+ goto repeat;
+ }
+
+ if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num,
+ n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) {
+ if ( n_ret_value == NO_BALANCING_NEEDED ) {
+ /* No balancing for higher levels needed. */
+ if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
+ goto repeat;
+ }
+ if ( n_h != MAX_HEIGHT - 1 )
+ p_s_tb->insert_size[n_h + 1] = 0;
+ /* ok, analysis and resource gathering are complete */
+ break;
+ }
+ goto repeat;
+ }
+
+ if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
+ goto repeat;
+ }
+
+ if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) {
+ goto repeat; /* No disk space, or schedule occurred and
+ analysis may be invalid and needs to be redone. */
+ }
+
+ /* decide how much has to be inserted into the next level up */
+ if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) {
+ /* We have a positive insert size but no nodes exist on this
+ level, this means that we are creating a new root. */
+
+ RFALSE( p_s_tb->blknum[n_h] != 1,
+ "PAP-8350: creating new empty root");
+
+ if ( n_h < MAX_HEIGHT - 1 )
+ p_s_tb->insert_size[n_h + 1] = 0;
+ }
+ else
+ if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) {
+ if ( p_s_tb->blknum[n_h] > 1 ) {
+ /* The tree needs to be grown, so this node S[n_h]
+ which is the root node is split into two nodes,
+ and a new node (S[n_h+1]) will be created to
+ become the root node. */
+
+ RFALSE( n_h == MAX_HEIGHT - 1,
+ "PAP-8355: attempt to create too high of a tree");
+
+ p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE;
+ }
+ else
+ if ( n_h < MAX_HEIGHT - 1 )
+ p_s_tb->insert_size[n_h + 1] = 0;
+ }
+ else
+ /* NOTE(review): unlike the branches above there is no
+ n_h < MAX_HEIGHT - 1 guard on this write; presumably a
+ parent on level n_h + 1 implies n_h + 1 < MAX_HEIGHT -
+ confirm */
+ p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
+ }
+
+ /* from here on buffers may have been prepared for the journal, so the
+ cleanup below has to restore them: wait_tb_buffers_run is set on every
+ path to `repeat' */
+ if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
+ if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+ wait_tb_buffers_run = 1 ;
+ n_ret_value = REPEAT_SEARCH ;
+ goto repeat;
+ } else {
+ return CARRY_ON;
+ }
+ } else {
+ wait_tb_buffers_run = 1 ;
+ goto repeat;
+ }
+
+ repeat:
+ // fix_nodes was unable to perform its calculation due to
+ // filesystem got changed under us, lack of free disk space or i/o
+ // failure. If the first is the case - the search will be
+ // repeated. For now - free all resources acquired so far except
+ // for the new allocated nodes
+ {
+ int i;
+
+ /* Release path buffers. */
+ if (wait_tb_buffers_run) {
+ pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ;
+ } else {
+ pathrelse (p_s_tb->tb_path);
+ }
+ /* brelse all resources collected for balancing */
+ for ( i = 0; i < MAX_HEIGHT; i++ ) {
+ if (wait_tb_buffers_run) {
+ reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]);
+ reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]);
+ reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]);
+ reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]);
+ reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]);
+ reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]);
+ }
+
+ /* the slots are cleared so that nothing releases them a second time */
+ brelse (p_s_tb->L[i]);p_s_tb->L[i] = NULL;
+ brelse (p_s_tb->R[i]);p_s_tb->R[i] = NULL;
+ brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = NULL;
+ brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = NULL;
+ brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = NULL;
+ brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = NULL;
+ }
+
+ /* the new nodes stay allocated; only their prepared state is restored */
+ if (wait_tb_buffers_run) {
+ for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
+ if ( p_s_tb->FEB[i] ) {
+ reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
+ p_s_tb->FEB[i]) ;
+ }
+ }
+ }
+ return n_ret_value;
+ }
+
+}
+
+
+/*
+ * Give back everything that was collected in tb for balancing: the path,
+ * the neighbors and their parents on every level, the new nodes that were
+ * allocated but not used (their blocks are freed again), the references to
+ * the nodes that were used, and the virtual node buffer.
+ * (The parameter is called tb rather than p_s_tb just to keep lines short.)
+ */
+void unfix_nodes (struct tree_balance * tb)
+{
+	int h;	/* tree level */
+	int n;	/* slot in FEB[] / used[] */
+
+	/* Release path buffers. */
+	pathrelse_and_restore (tb->tb_sb, tb->tb_path);
+
+	/* restore, then release, all buffers collected for balancing */
+	for ( h = 0; h < MAX_HEIGHT; h++ ) {
+		reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[h]);
+		reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[h]);
+		reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[h]);
+		reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[h]);
+		reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[h]);
+		reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[h]);
+
+		brelse (tb->L[h]);
+		brelse (tb->R[h]);
+		brelse (tb->FL[h]);
+		brelse (tb->FR[h]);
+		brelse (tb->CFL[h]);
+		brelse (tb->CFR[h]);
+	}
+
+	/* deal with list of allocated (used and unused) nodes */
+	for ( n = 0; n < MAX_FEB_SIZE; n++ ) {
+		struct buffer_head * unused = tb->FEB[n];
+
+		if ( unused ) {
+			/* the block number is read before the buffer is released;
+			   the block was not used by balancing, so it is freed */
+			b_blocknr_t blocknr = unused->b_blocknr;
+
+			brelse (unused);
+			reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
+		}
+
+		/* release used as new nodes including a new root */
+		if ( tb->used[n] )
+			brelse (tb->used[n]);
+	}
+
+	if ( tb->vn_buf )
+		reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
+
+}
+
+
+
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
new file mode 100644
index 00000000000..08d0508c2d3
--- /dev/null
+++ b/fs/reiserfs/hashes.c
@@ -0,0 +1,209 @@
+
+/*
+ * Keyed 32-bit hash function using TEA in a Davies-Meyer function
+ * H0 = Key
+ * Hi = E Mi(Hi-1) + Hi-1
+ *
+ * (see Applied Cryptography, 2nd edition, p448).
+ *
+ * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
+ *
+ * Jeremy has agreed to the contents of reiserfs/README. -Hans
+ * Yura's function is added (04/07/2000)
+ */
+
+//
+// keyed_hash
+// yura_hash
+// r5_hash
+//
+
+#include <linux/kernel.h>
+#include <asm/types.h>
+#include <asm/bug.h>
+
+
+#define DELTA 0x9E3779B9
+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
+#define PARTROUNDS 6 /* 6 gets complete mixing */
+
+/* TEACORE(rounds): one mixing step of the hash.
+ * a, b, c, d - data; h0, h1 - accumulated hash. All six names are taken
+ * from the scope in which the macro is expanded.
+ * h0/h1 are copied into b0/b1, `rounds' iterations of the round function
+ * below are applied to b0/b1 using a, b, c, d, and the result is added
+ * back into h0/h1.
+ * `rounds' is expected to be positive: the loop body runs once before the
+ * counter is tested. */
+#define TEACORE(rounds) \
+ do { \
+ u32 sum = 0; \
+ int n = rounds; \
+ u32 b0, b1; \
+ \
+ b0 = h0; \
+ b1 = h1; \
+ \
+ do \
+ { \
+ sum += DELTA; \
+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
+ } while(--n); \
+ \
+ h0 += b0; \
+ h1 += b1; \
+ } while(0)
+
+
+/*
+ * keyed_hash: hash the first len bytes of msg with TEACORE.
+ *
+ * msg - bytes to hash; they are read as signed char, so a byte with the
+ * top bit set is sign-extended when it is widened to u32
+ * len - number of bytes; the commented-out assert below suggests the
+ * expected range is 0..255
+ *
+ * Every full 16-byte chunk is packed, lowest byte first, into the four
+ * words a, b, c, d and mixed with PARTROUNDS rounds. Of the remaining
+ * 0..15 bytes the complete 4-byte groups are packed the same way; the
+ * next word starts as `pad' and has the left-over bytes shifted in one at
+ * a time, and any words after it are set to `pad'. That last block is
+ * mixed with FULLROUNDS rounds.
+ *
+ * Returns h0 ^ h1.
+ */
+u32 keyed_hash(const signed char *msg, int len)
+{
+ /* only k[0] and k[1] are used, as the initial values of h0 and h1 */
+ u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3};
+
+ u32 h0 = k[0], h1 = k[1];
+ u32 a, b, c, d;
+ u32 pad;
+ int i;
+
+ // assert(len >= 0 && len < 256);
+
+ /* len OR-ed into each of the four byte positions of pad */
+ pad = (u32)len | ((u32)len << 8);
+ pad |= pad << 16;
+
+ /* full 16-byte chunks */
+ while(len >= 16)
+ {
+ a = (u32)msg[ 0] |
+ (u32)msg[ 1] << 8 |
+ (u32)msg[ 2] << 16|
+ (u32)msg[ 3] << 24;
+ b = (u32)msg[ 4] |
+ (u32)msg[ 5] << 8 |
+ (u32)msg[ 6] << 16|
+ (u32)msg[ 7] << 24;
+ c = (u32)msg[ 8] |
+ (u32)msg[ 9] << 8 |
+ (u32)msg[10] << 16|
+ (u32)msg[11] << 24;
+ d = (u32)msg[12] |
+ (u32)msg[13] << 8 |
+ (u32)msg[14] << 16|
+ (u32)msg[15] << 24;
+
+ TEACORE(PARTROUNDS);
+
+ len -= 16;
+ msg += 16;
+ }
+
+ /* 0..15 bytes are left; exactly one of the branches below packs them */
+ if (len >= 12)
+ {
+ a = (u32)msg[ 0] |
+ (u32)msg[ 1] << 8 |
+ (u32)msg[ 2] << 16|
+ (u32)msg[ 3] << 24;
+ b = (u32)msg[ 4] |
+ (u32)msg[ 5] << 8 |
+ (u32)msg[ 6] << 16|
+ (u32)msg[ 7] << 24;
+ c = (u32)msg[ 8] |
+ (u32)msg[ 9] << 8 |
+ (u32)msg[10] << 16|
+ (u32)msg[11] << 24;
+
+ d = pad;
+ for(i = 12; i < len; i++)
+ {
+ d <<= 8;
+ d |= msg[i];
+ }
+ }
+ else if (len >= 8)
+ {
+ a = (u32)msg[ 0] |
+ (u32)msg[ 1] << 8 |
+ (u32)msg[ 2] << 16|
+ (u32)msg[ 3] << 24;
+ b = (u32)msg[ 4] |
+ (u32)msg[ 5] << 8 |
+ (u32)msg[ 6] << 16|
+ (u32)msg[ 7] << 24;
+
+ c = d = pad;
+ for(i = 8; i < len; i++)
+ {
+ c <<= 8;
+ c |= msg[i];
+ }
+ }
+ else if (len >= 4)
+ {
+ a = (u32)msg[ 0] |
+ (u32)msg[ 1] << 8 |
+ (u32)msg[ 2] << 16|
+ (u32)msg[ 3] << 24;
+
+ b = c = d = pad;
+ for(i = 4; i < len; i++)
+ {
+ b <<= 8;
+ b |= msg[i];
+ }
+ }
+ else
+ {
+ a = b = c = d = pad;
+ for(i = 0; i < len; i++)
+ {
+ a <<= 8;
+ a |= msg[i];
+ }
+ }
+
+ /* final block gets the larger number of rounds */
+ TEACORE(FULLROUNDS);
+
+/* return 0;*/
+ return h0^h1;
+}
+
+/* What follows in this file is copyright 2000 by Hans Reiser, and the
+ * licensing of what follows is governed by reiserfs/README */
+
+/*
+ * yura_hash: hash of the first len bytes of msg.
+ *
+ * The first two steps read the bytes as decimal digits ('0' is 48): byte k
+ * is weighted by 10^(len-1-k) and the products are summed into a. The
+ * loop up to 40 adds c * pow with c == 0, the loop up to 256 adds c * pow
+ * with c == i. The sum is shifted left by 7 bits and returned.
+ *
+ * NOTE(review): pow is a signed int and exceeds INT_MAX for len > 10; the
+ * code is left as it is because the values are presumably stored on disk -
+ * confirm before touching the arithmetic.
+ */
+u32 yura_hash (const signed char *msg, int len)
+{
+ int j, pow;
+ u32 a, c;
+ int i;
+
+ /* pow = 10^(len-1), the weight of the first byte */
+ for (pow=1,i=1; i < len; i++) pow = pow * 10;
+
+ if (len == 1)
+ a = msg[0]-48;
+ else
+ a = (msg[0] - 48) * pow;
+
+ /* remaining bytes, each with weight 10^(len-1-i) */
+ for (i=1; i < len; i++) {
+ c = msg[i] - 48;
+ for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
+ a = a + c * pow;
+ }
+
+ /* c is 0 here, so this loop leaves a unchanged */
+ for (; i < 40; i++) {
+ c = '0' - 48;
+ for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
+ a = a + c * pow;
+ }
+
+ /* adds the index itself, weighted by pow, for every i up to 255 */
+ for (; i < 256; i++) {
+ c = i;
+ for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
+ a = a + c * pow;
+ }
+
+ a = a << 7;
+ return a;
+}
+
+/*
+ * r5_hash: hash of the name in msg.
+ *
+ * msg - name bytes (read as signed char, so bytes with the top bit set
+ *       take part as negative values)
+ * len - number of valid bytes in msg
+ *
+ * For every byte the accumulator gets the byte shifted left by 4 and the
+ * byte shifted right by 4 added, and is then multiplied by 11.  Hashing
+ * stops at the first NUL byte or after len bytes, whichever comes first.
+ * The len bound was added: the loop used to rely on a terminating NUL alone
+ * and ran past the end of names that are not NUL-terminated.  For a name
+ * whose NUL sits at or before msg[len] the result is unchanged.
+ */
+u32 r5_hash (const signed char *msg, int len)
+{
+	u32 a = 0;
+
+	while (len-- > 0 && *msg) {
+		a += *msg << 4;
+		a += *msg >> 4;
+		a *= 11;
+		msg++;
+	}
+	return a;
+}
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
new file mode 100644
index 00000000000..a362125da0d
--- /dev/null
+++ b/fs/reiserfs/ibalance.c
@@ -0,0 +1,1058 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/buffer_head.h>
+
+/* this is one and only function that is used outside (do_balance.c) */
+int balance_internal (
+ struct tree_balance * ,
+ int,
+ int,
+ struct item_head * ,
+ struct buffer_head **
+ );
+
+/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */
+#define INTERNAL_SHIFT_FROM_S_TO_L 0
+#define INTERNAL_SHIFT_FROM_R_TO_S 1
+#define INTERNAL_SHIFT_FROM_L_TO_S 2
+#define INTERNAL_SHIFT_FROM_S_TO_R 3
+#define INTERNAL_INSERT_TO_S 4
+#define INTERNAL_INSERT_TO_L 5
+#define INTERNAL_INSERT_TO_R 6
+
+/*
+ * Fill in the buffer_info structures that describe the source and the
+ * destination of an operation on level h of the tree.
+ *
+ * shift_mode - one of the INTERNAL_SHIFT_* / INTERNAL_INSERT_* modes
+ * tb, h - balance descriptor and tree level
+ * dest_bi - out: destination buffer, its parent and its position in the
+ * parent
+ * src_bi - out: the same for the source; stays zeroed for the
+ * INTERNAL_INSERT_* modes
+ * d_key - out: tb->lkey[h] or tb->rkey[h]; written for the
+ * INTERNAL_SHIFT_* modes only
+ * cf - out: tb->CFL[h] or tb->CFR[h]; written for the
+ * INTERNAL_SHIFT_* modes only
+ *
+ * Calls reiserfs_panic() for an unknown shift_mode.
+ */
+static void internal_define_dest_src_infos (
+ int shift_mode,
+ struct tree_balance * tb,
+ int h,
+ struct buffer_info * dest_bi,
+ struct buffer_info * src_bi,
+ int * d_key,
+ struct buffer_head ** cf
+ )
+{
+ /* start from all-zero descriptors; each case fills in what it needs */
+ memset (dest_bi, 0, sizeof (struct buffer_info));
+ memset (src_bi, 0, sizeof (struct buffer_info));
+ /* define dest, src, dest parent, dest position */
+ switch (shift_mode) {
+ case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */
+ src_bi->tb = tb;
+ src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
+ src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
+ src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
+ dest_bi->tb = tb;
+ dest_bi->bi_bh = tb->L[h];
+ dest_bi->bi_parent = tb->FL[h];
+ dest_bi->bi_position = get_left_neighbor_position (tb, h);
+ *d_key = tb->lkey[h];
+ *cf = tb->CFL[h];
+ break;
+ case INTERNAL_SHIFT_FROM_L_TO_S:
+ src_bi->tb = tb;
+ src_bi->bi_bh = tb->L[h];
+ src_bi->bi_parent = tb->FL[h];
+ src_bi->bi_position = get_left_neighbor_position (tb, h);
+ dest_bi->tb = tb;
+ dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
+ dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
+ dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */
+ *d_key = tb->lkey[h];
+ *cf = tb->CFL[h];
+ break;
+
+ case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */
+ src_bi->tb = tb;
+ src_bi->bi_bh = tb->R[h];
+ src_bi->bi_parent = tb->FR[h];
+ src_bi->bi_position = get_right_neighbor_position (tb, h);
+ dest_bi->tb = tb;
+ dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
+ dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
+ dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
+ *d_key = tb->rkey[h];
+ *cf = tb->CFR[h];
+ break;
+
+ case INTERNAL_SHIFT_FROM_S_TO_R:
+ src_bi->tb = tb;
+ src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
+ src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
+ src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
+ dest_bi->tb = tb;
+ dest_bi->bi_bh = tb->R[h];
+ dest_bi->bi_parent = tb->FR[h];
+ dest_bi->bi_position = get_right_neighbor_position (tb, h);
+ *d_key = tb->rkey[h];
+ *cf = tb->CFR[h];
+ break;
+
+ /* the insert modes have a destination only; src_bi, *d_key and *cf
+ are not written */
+ case INTERNAL_INSERT_TO_L:
+ dest_bi->tb = tb;
+ dest_bi->bi_bh = tb->L[h];
+ dest_bi->bi_parent = tb->FL[h];
+ dest_bi->bi_position = get_left_neighbor_position (tb, h);
+ break;
+
+ case INTERNAL_INSERT_TO_S:
+ dest_bi->tb = tb;
+ dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
+ dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
+ dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
+ break;
+
+ case INTERNAL_INSERT_TO_R:
+ dest_bi->tb = tb;
+ dest_bi->bi_bh = tb->R[h];
+ dest_bi->bi_parent = tb->FR[h];
+ dest_bi->bi_position = get_right_neighbor_position (tb, h);
+ break;
+
+ default:
+ reiserfs_panic (tb->tb_sb, "internal_define_dest_src_infos: shift type is unknown (%d)", shift_mode);
+ }
+}
+
+
+
+/* Insert count node pointers into buffer cur before position to + 1.
+ * Insert count items into buffer cur before position to.
+ * Items and node pointers are specified by inserted and bh respectively.
+ *
+ * cur_bi - buffer to insert into (cur is cur_bi->bi_bh), with its parent
+ * and its position in the parent
+ * to - position as described above; -1 is accepted
+ * count - number of pointers/keys to insert; nothing is done for
+ * count <= 0, and the local new_dc[] has room for 2
+ * inserted - array of item heads; only the first KEY_SIZE bytes of each
+ * element are copied
+ * bh - array of the count child buffers the new pointers refer to
+ *
+ * The item count and free space of cur are updated, cur is marked dirty
+ * and, when cur has a parent, the size recorded for cur in the parent's
+ * disk_child is increased as well.
+ */
+static void internal_insert_childs (struct buffer_info * cur_bi,
+ int to, int count,
+ struct item_head * inserted,
+ struct buffer_head ** bh
+ )
+{
+ struct buffer_head * cur = cur_bi->bi_bh;
+ struct block_head * blkh;
+ int nr;
+ struct reiserfs_key * ih;
+ struct disk_child new_dc[2];
+ struct disk_child * dc;
+ int i;
+
+ if (count <= 0)
+ return;
+
+ blkh = B_BLK_HEAD(cur);
+ nr = blkh_nr_item(blkh);
+
+ RFALSE( count > 2,
+ "too many children (%d) are to be inserted", count);
+ RFALSE( B_FREE_SPACE (cur) < count * (KEY_SIZE + DC_SIZE),
+ "no enough free space (%d), needed %d bytes",
+ B_FREE_SPACE (cur), count * (KEY_SIZE + DC_SIZE));
+
+ /* prepare space for count disk_child */
+ dc = B_N_CHILD(cur,to+1);
+
+ /* move the nr + 1 - (to + 1) pointers from position to + 1 on up by count slots */
+ memmove (dc + count, dc, (nr+1-(to+1)) * DC_SIZE);
+
+ /* copy to_be_insert disk children */
+ for (i = 0; i < count; i ++) {
+ /* size of a child is what is used in it: its maximum minus its free space */
+ put_dc_size( &(new_dc[i]), MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
+ put_dc_block_number( &(new_dc[i]), bh[i]->b_blocknr );
+ }
+ memcpy (dc, new_dc, DC_SIZE * count);
+
+
+ /* prepare space for count items */
+ ih = B_N_PDELIM_KEY (cur, ((to == -1) ? 0 : to));
+
+ /* shift the keys from ih on, and all pointers behind them, up by count keys.
+ NOTE(review): for to == -1 ih points at key 0 but the length is still
+ computed from nr - to, i.e. one KEY_SIZE more than the keys and
+ pointers that follow ih - confirm this is harmless */
+ memmove (ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
+
+ /* copy item headers (keys) */
+ memcpy (ih, inserted, KEY_SIZE);
+ if ( count > 1 )
+ memcpy (ih + 1, inserted + 1, KEY_SIZE);
+
+ /* sizes, item number */
+ set_blkh_nr_item( blkh, blkh_nr_item(blkh) + count );
+ set_blkh_free_space( blkh,
+ blkh_free_space(blkh) - count * (DC_SIZE + KEY_SIZE ) );
+
+ do_balance_mark_internal_dirty (cur_bi->tb, cur,0);
+
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ check_internal (cur);
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+
+ /* cur grew, so the size kept for it in its parent has to grow too */
+ if (cur_bi->bi_parent) {
+ struct disk_child *t_dc = B_N_CHILD (cur_bi->bi_parent,cur_bi->bi_position);
+ put_dc_size( t_dc, dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
+ do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, 0);
+
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ check_internal (cur_bi->bi_parent);
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ }
+
+}
+
+
+/* Delete del_num items and node pointers from buffer cur starting from
+ * the first_i'th item and first_p'th pointers respectively.
+ *
+ * cur_bi - buffer to delete from (cur is cur_bi->bi_bh), with its parent
+ * and its position in the parent
+ * first_p - index of the first pointer to delete
+ * first_i - index of the first key to delete
+ * del_num - number of pointers and keys to delete; 0 is a no-op
+ *
+ * If all nr + 1 pointers are deleted the node is emptied with
+ * make_empty_node(). Otherwise the item count and free space of cur are
+ * updated, cur is marked dirty and, when cur has a parent, the size
+ * recorded for cur in the parent's disk_child is reduced. */
+static void internal_delete_pointers_items (
+ struct buffer_info * cur_bi,
+ int first_p,
+ int first_i,
+ int del_num
+ )
+{
+ struct buffer_head * cur = cur_bi->bi_bh;
+ int nr;
+ struct block_head * blkh;
+ struct reiserfs_key * key;
+ struct disk_child * dc;
+
+ RFALSE( cur == NULL, "buffer is 0");
+ RFALSE( del_num < 0,
+ "negative number of items (%d) can not be deleted", del_num);
+ RFALSE( first_p < 0 || first_p + del_num > B_NR_ITEMS (cur) + 1 || first_i < 0,
+ "first pointer order (%d) < 0 or "
+ "no so many pointers (%d), only (%d) or "
+ "first key order %d < 0", first_p,
+ first_p + del_num, B_NR_ITEMS (cur) + 1, first_i);
+ if ( del_num == 0 )
+ return;
+
+ blkh = B_BLK_HEAD(cur);
+ nr = blkh_nr_item(blkh);
+
+ /* every pointer goes away: the node becomes empty */
+ if ( first_p == 0 && del_num == nr + 1 ) {
+ RFALSE( first_i != 0, "1st deleted key must have order 0, not %d", first_i);
+ make_empty_node (cur_bi);
+ return;
+ }
+
+ RFALSE( first_i + del_num > B_NR_ITEMS (cur),
+ "first_i = %d del_num = %d "
+ "no so many keys (%d) in the node (%b)(%z)",
+ first_i, del_num, first_i + del_num, cur, cur);
+
+
+ /* deleting */
+ dc = B_N_CHILD (cur, first_p);
+
+ /* close the gap in the pointer array first ... */
+ memmove (dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
+ key = B_N_PDELIM_KEY (cur, first_i);
+ /* ... then pull the keys behind the deleted ones, and the remaining
+ nr + 1 - del_num pointers that follow them, down by del_num keys */
+ memmove (key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE);
+
+
+ /* sizes, item number */
+ set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num );
+ set_blkh_free_space( blkh,
+ blkh_free_space(blkh) + (del_num * (KEY_SIZE + DC_SIZE) ) );
+
+ do_balance_mark_internal_dirty (cur_bi->tb, cur, 0);
+ /*&&&&&&&&&&&&&&&&&&&&&&&*/
+ check_internal (cur);
+ /*&&&&&&&&&&&&&&&&&&&&&&&*/
+
+ /* cur shrank, so the size kept for it in its parent shrinks too */
+ if (cur_bi->bi_parent) {
+ struct disk_child *t_dc;
+ t_dc = B_N_CHILD (cur_bi->bi_parent, cur_bi->bi_position);
+ put_dc_size( t_dc, dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE) ) );
+
+ do_balance_mark_internal_dirty (cur_bi->tb, cur_bi->bi_parent,0);
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ check_internal (cur_bi->bi_parent);
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ }
+}
+
+
+/*
+ * Delete n node pointers from the buffer described by cur_bi, starting at
+ * pointer `from', together with n keys.  The first key to go is the one in
+ * front of that pointer (from - 1), or key 0 when from is 0.
+ */
+static void internal_delete_childs (struct buffer_info * cur_bi,
+				    int from, int n)
+{
+	int first_key = from;
+
+	if (from != 0)
+		first_key = from - 1;
+
+	internal_delete_pointers_items (cur_bi, from, first_key, n);
+}
+
+
+/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest
+ * last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest
+ * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest
+ *
+ * dest_bi - destination (dest is dest_bi->bi_bh), with its parent and its
+ * position in the parent
+ * src - source buffer; it is only read
+ * cpy_num - number of pointers to copy; 0 is a no-op
+ *
+ * The item count and free space of dest are updated, dest is marked dirty
+ * and, when dest has a parent, the size recorded for dest in the parent's
+ * disk_child is increased.
+ */
+static void internal_copy_pointers_items (
+ struct buffer_info * dest_bi,
+ struct buffer_head * src,
+ int last_first, int cpy_num
+ )
+{
+ /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST *
+ * as delimiting key have already inserted to buffer dest.*/
+ struct buffer_head * dest = dest_bi->bi_bh;
+ int nr_dest, nr_src;
+ int dest_order, src_order;
+ struct block_head * blkh;
+ struct reiserfs_key * key;
+ struct disk_child * dc;
+
+ /* NOTE(review): src is read here, before the RFALSE below checks it for NULL */
+ nr_src = B_NR_ITEMS (src);
+
+ RFALSE( dest == NULL || src == NULL,
+ "src (%p) or dest (%p) buffer is 0", src, dest);
+ RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
+ "invalid last_first parameter (%d)", last_first);
+ RFALSE( nr_src < cpy_num - 1,
+ "no so many items (%d) in src (%d)", cpy_num, nr_src);
+ RFALSE( cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
+ RFALSE( cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
+ "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
+ cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
+
+ if ( cpy_num == 0 )
+ return;
+
+ /* copying */
+ blkh = B_BLK_HEAD(dest);
+ nr_dest = blkh_nr_item(blkh);
+
+ /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest;*/
+ /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0;*/
+ /* LAST_TO_FIRST: take the last cpy_num pointers of src and put them at
+ the head of dest; otherwise take the first ones and append them */
+ (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = nr_src - cpy_num + 1) :
+ (dest_order = nr_dest, src_order = 0);
+
+ /* prepare space for cpy_num pointers */
+ dc = B_N_CHILD (dest, dest_order);
+
+ /* dest holds nr_dest pointers here (see ATTENTION above) */
+ memmove (dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
+
+ /* insert pointers */
+ memcpy (dc, B_N_CHILD (src, src_order), DC_SIZE * cpy_num);
+
+
+ /* prepare space for cpy_num - 1 item headers */
+ key = B_N_PDELIM_KEY(dest, dest_order);
+ /* the keys from dest_order on and all nr_dest + cpy_num pointers behind
+ them move up by cpy_num - 1 keys */
+ memmove (key + cpy_num - 1, key,
+ KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num));
+
+
+ /* insert headers */
+ memcpy (key, B_N_PDELIM_KEY (src, src_order), KEY_SIZE * (cpy_num - 1));
+
+ /* sizes, item number */
+ set_blkh_nr_item( blkh, blkh_nr_item(blkh) + (cpy_num - 1 ) );
+ set_blkh_free_space( blkh,
+ blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num ) );
+
+ do_balance_mark_internal_dirty (dest_bi->tb, dest, 0);
+
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ check_internal (dest);
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+
+ /* dest grew, so the size kept for it in its parent has to grow too */
+ if (dest_bi->bi_parent) {
+ struct disk_child *t_dc;
+ t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position);
+ put_dc_size( t_dc, dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num) );
+
+ do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0);
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ check_internal (dest_bi->bi_parent);
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ }
+
+}
+
+
+/*
+ * Move node pointers and delimiting keys from src_bi->bi_bh to dest_bi->bi_bh.
+ *
+ * cpy_num node pointers and cpy_num - 1 keys are copied into dest; after
+ * that cpy_num - del_par pointers and keys are deleted from src.
+ * last_first == FIRST_TO_LAST: the first pointers/keys of src are handled
+ *                              (shift to the left).
+ * otherwise (LAST_TO_FIRST):   the last pointers/keys of src are handled
+ *                              (shift to the right).
+ */
+static void internal_move_pointers_items (struct buffer_info * dest_bi,
+                                          struct buffer_info * src_bi,
+                                          int last_first, int cpy_num, int del_par)
+{
+    int del_num = cpy_num - del_par;   /* number of pointers/keys leaving src */
+    int src_nr;
+    int from_pointer;
+    int from_key;
+
+    internal_copy_pointers_items (dest_bi, src_bi->bi_bh, last_first, cpy_num);
+
+    if (last_first == FIRST_TO_LAST) {
+        /* shift_left: deletion starts at pointer 0 and at key 0 */
+        internal_delete_pointers_items (src_bi, 0, 0, del_num);
+        return;
+    }
+
+    /* shift_right: deletion covers the tail of src */
+    src_nr = B_NR_ITEMS(src_bi->bi_bh);
+    from_pointer = src_nr + 1 - cpy_num + del_par;
+
+    /* if every pointer of src goes away, key deletion has to start at key 0 */
+    if (del_num == src_nr + 1)
+        from_key = 0;
+    else
+        from_key = src_nr - cpy_num + del_par;
+
+    internal_delete_pointers_items (src_bi, from_pointer, from_key, del_num);
+}
+
+/*
+ * Insert the src_position'th delimiting key of buffer src into the node
+ * dest_bi->bi_bh, in front of the key that is currently at
+ * dest_position_before.
+ *
+ * The item count and free space of dest are adjusted by one key, dest is
+ * marked dirty, and if dest has a parent the size recorded in the parent's
+ * child pointer grows by KEY_SIZE as well.
+ */
+static void internal_insert_key (struct buffer_info * dest_bi,
+                                 int dest_position_before, /* the new key goes in front of this key of dest */
+                                 struct buffer_head * src,
+                                 int src_position)
+{
+    struct buffer_head * dest = dest_bi->bi_bh;
+    struct block_head * blkh;
+    struct reiserfs_key * slot;
+    struct disk_child * parent_dc;
+    int nr;
+
+    RFALSE( dest == NULL || src == NULL,
+            "source(%p) or dest(%p) buffer is 0", src, dest);
+    RFALSE( dest_position_before < 0 || src_position < 0,
+            "source(%d) or dest(%d) key number less than 0",
+            src_position, dest_position_before);
+    RFALSE( dest_position_before > B_NR_ITEMS (dest) ||
+            src_position >= B_NR_ITEMS(src),
+            "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
+            dest_position_before, B_NR_ITEMS (dest),
+            src_position, B_NR_ITEMS(src));
+    RFALSE( B_FREE_SPACE (dest) < KEY_SIZE,
+            "no enough free space (%d) in dest buffer", B_FREE_SPACE (dest));
+
+    blkh = B_BLK_HEAD(dest);
+    nr = blkh_nr_item(blkh);
+
+    /* open a one-key gap: the keys from the insertion point on, plus the
+       nr + 1 child pointers that follow them, move up by one key */
+    slot = B_N_PDELIM_KEY (dest, dest_position_before);
+    memmove (slot + 1, slot, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
+
+    /* drop the key into the gap */
+    memcpy (slot, B_N_PDELIM_KEY(src, src_position), KEY_SIZE);
+
+    /* one more item, one key less of free space */
+    set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 );
+    set_blkh_free_space( blkh, blkh_free_space(blkh) - KEY_SIZE );
+
+    do_balance_mark_internal_dirty (dest_bi->tb, dest, 0);
+
+    if (!dest_bi->bi_parent)
+        return;
+
+    /* the parent keeps the used size of dest in its child pointer */
+    parent_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position);
+    put_dc_size( parent_dc, dc_size(parent_dc) + KEY_SIZE );
+
+    do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0);
+}
+
+
+
+/*
+ * Shift pointer_amount node pointers (and pointer_amount - 1 keys) from the
+ * head of the source node to the tail of the destination node.
+ *
+ * Source, destination, their common father and the position of the
+ * delimiting key in it are selected by internal_define_dest_src_infos()
+ * according to mode.  If pointer_amount is non-zero the delimiting key is
+ * first appended to dest and then replaced in the common father.  Finally
+ * the pointers/keys are moved, deleting pointer_amount of them from src.
+ *
+ * Used both to shift from S[h] to L[h] and from R[h] to S[h].
+ */
+static void internal_shift_left (
+    int mode, /* INTERNAL_SHIFT_FROM_S_TO_L | INTERNAL_SHIFT_FROM_R_TO_S */
+    struct tree_balance * tb,
+    int h,
+    int pointer_amount
+    )
+{
+    struct buffer_info dest_bi, src_bi;
+    struct buffer_head * cf;
+    int d_key_position;
+
+    internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
+
+    if (pointer_amount != 0) {
+        /* the delimiting key of the common father becomes the last key of dest */
+        internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position);
+
+        if (B_NR_ITEMS(src_bi.bi_bh) != pointer_amount - 1) {
+            /* key number pointer_amount - 1 of src becomes the new delimiting key */
+            replace_key (tb, cf, d_key_position, src_bi.bi_bh, pointer_amount - 1);
+        } else if (src_bi.bi_position/*src->b_item_order*/ == 0) {
+            /* all keys of src go away and src is the first child of its parent */
+            replace_key (tb, cf, d_key_position, src_bi.bi_parent/*src->b_parent*/, 0);
+        }
+    }
+
+    /* last parameter is del_parameter */
+    internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 0);
+}
+
+/*
+ * Shift from S[h] to L[h], leaving one pointer behind in S[h]:
+ *  - if pointer_amount > 0, append the delimiting key of the common father
+ *    to L[h];
+ *  - copy pointer_amount node pointers and pointer_amount - 1 keys from the
+ *    head of S[h] to the tail of L[h];
+ *  - delete only pointer_amount - 1 pointers/keys from S[h] (del_par == 1).
+ * The delimiting key in the common father is not replaced here.
+ */
+static void internal_shift1_left (
+    struct tree_balance * tb,
+    int h,
+    int pointer_amount
+    )
+{
+    struct buffer_info dest_bi, src_bi;
+    struct buffer_head * cf;
+    int d_key_position;
+
+    internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
+
+    if ( pointer_amount > 0 ) {
+        /* the new key lands behind the current last key of the left neighbor */
+        int tail_position = B_NR_ITEMS(dest_bi.bi_bh);
+
+        internal_insert_key (&dest_bi, tail_position, cf, d_key_position);
+    }
+
+    /* last parameter is del_parameter */
+    internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1);
+}
+
+
+/*
+ * Shift pointer_amount node pointers (and pointer_amount - 1 keys) from the
+ * tail of the source node to the head of the destination node.
+ *
+ * Source, destination, their common father and the position of the
+ * delimiting key in it are selected by internal_define_dest_src_infos()
+ * according to mode.  If pointer_amount > 0 the delimiting key is first
+ * inserted at position 0 of dest and then replaced in the common father.
+ * Finally the pointers/keys are moved, deleting pointer_amount of them
+ * from src.
+ */
+static void internal_shift_right (
+    int mode, /* INTERNAL_SHIFT_FROM_S_TO_R | INTERNAL_SHIFT_FROM_L_TO_S */
+    struct tree_balance * tb,
+    int h,
+    int pointer_amount
+    )
+{
+    struct buffer_info dest_bi, src_bi;
+    struct buffer_head * cf;
+    int d_key_position;
+    int src_nr;
+
+    internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
+
+    /* item count of the source, taken before anything is modified */
+    src_nr = B_NR_ITEMS (src_bi.bi_bh);
+
+    if (pointer_amount > 0) {
+        /* the delimiting key of the common father becomes key 0 of dest */
+        internal_insert_key (&dest_bi, 0, cf, d_key_position);
+
+        if (src_nr != pointer_amount - 1) {
+            /* key number src_nr - pointer_amount of src becomes the new delimiting key */
+            replace_key (tb, cf, d_key_position, src_bi.bi_bh, src_nr - pointer_amount);
+        } else {
+            RFALSE( src_bi.bi_bh != PATH_H_PBUFFER (tb->tb_path, h)/*tb->S[h]*/ ||
+                    dest_bi.bi_bh != tb->R[h],
+                    "src (%p) must be == tb->S[h](%p) when it disappears",
+                    src_bi.bi_bh, PATH_H_PBUFFER (tb->tb_path, h));
+            /* when S[h] disappears, the left delimiting key takes over */
+            if (tb->CFL[h])
+                replace_key (tb, cf, d_key_position, tb->CFL[h], tb->lkey[h]);
+        }
+    }
+
+    /* last parameter is del_parameter */
+    internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 0);
+}
+
+/*
+ * Shift from S[h] to R[h], leaving one pointer behind in S[h]:
+ *  - if pointer_amount > 0, insert the delimiting key of the common father
+ *    at position 0 of R[h];
+ *  - copy pointer_amount node pointers and pointer_amount - 1 keys from the
+ *    tail of S[h] to the head of R[h];
+ *  - delete only pointer_amount - 1 pointers/keys from S[h] (del_par == 1).
+ * The delimiting key in the common father is not replaced here.
+ */
+static void internal_shift1_right (
+    struct tree_balance * tb,
+    int h,
+    int pointer_amount
+    )
+{
+    struct buffer_info dest_bi, src_bi;
+    struct buffer_head * cf;
+    int d_key_position;
+
+    internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
+
+    if (pointer_amount > 0) {
+        /* the delimiting key goes in front of everything already in the right neighbor */
+        internal_insert_key (&dest_bi, 0, cf, d_key_position);
+    }
+
+    /* last parameter is del_parameter */
+    internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1);
+}
+
+
+/* Delete node pointers together with their left items from S[h]
+ * and balance current node.
+ *
+ * The number of children to delete is -insert_num, where insert_num is
+ * tb->insert_size[h] / (DC_SIZE + KEY_SIZE); deletion starts at child_pos.
+ * What happens next is selected by tb->lnum[h], tb->rnum[h] and tb->blknum[h]:
+ *  - lnum == 0 && rnum == 0: nothing is shifted; if in addition blknum == 0,
+ *    S[h] is an empty root, so the super block's root block and tree height
+ *    are switched to a node of level h-1 and S[h] is invalidated;
+ *  - lnum == -(items in L[h]) - 1: S[h] is joined with L[h];
+ *  - rnum == -(items in R[h]) - 1: S[h] is joined with R[h];
+ *  - lnum < 0: -lnum pointers are borrowed from L[h];
+ *  - rnum < 0: -rnum pointers are borrowed from R[h];
+ *  - lnum > 0: S[h] is split between L[h] and R[h];
+ *  - anything else ends in reiserfs_panic().
+ */
+static void balance_internal_when_delete (struct tree_balance * tb,
+ int h, int child_pos)
+{
+ int insert_num;
+ int n;
+ struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h);
+ struct buffer_info bi;
+
+ insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
+
+ /* delete child-node-pointer(s) together with their left item(s) */
+ bi.tb = tb;
+ bi.bi_bh = tbSh;
+ bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
+ bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
+
+ internal_delete_childs (&bi, child_pos, -insert_num);
+
+ RFALSE( tb->blknum[h] > 1,
+ "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
+
+ /* number of items left in S[h] after the deletion above */
+ n = B_NR_ITEMS(tbSh);
+
+ if ( tb->lnum[h] == 0 && tb->rnum[h] == 0 ) {
+ if ( tb->blknum[h] == 0 ) {
+ /* node S[h] (root of the tree) is empty now */
+ struct buffer_head *new_root;
+
+ RFALSE( n || B_FREE_SPACE (tbSh) != MAX_CHILD_SIZE(tbSh) - DC_SIZE,
+ "buffer must have only 0 keys (%d)", n);
+ RFALSE( bi.bi_parent, "root has parent (%p)", bi.bi_parent);
+
+ /* choose a new root */
+ if ( ! tb->L[h-1] || ! B_NR_ITEMS(tb->L[h-1]) )
+ new_root = tb->R[h-1];
+ else
+ new_root = tb->L[h-1];
+ /* switch super block's tree root block number to the new value */
+ PUT_SB_ROOT_BLOCK( tb->tb_sb, new_root->b_blocknr );
+ //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --;
+ PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) - 1 );
+
+ do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
+ /*&&&&&&&&&&&&&&&&&&&&&&*/
+ if (h > 1)
+ /* use check_internal if new root is an internal node */
+ check_internal (new_root);
+ /*&&&&&&&&&&&&&&&&&&&&&&*/
+
+ /* do what is needed for buffer thrown from tree */
+ reiserfs_invalidate_buffer(tb, tbSh);
+ return;
+ }
+ /* nothing to shift and S[h] stays in the tree */
+ return;
+ }
+
+ if ( tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1 ) { /* join S[h] with L[h] */
+
+ RFALSE( tb->rnum[h] != 0,
+ "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
+ h, tb->rnum[h]);
+
+ /* all n + 1 pointers of S[h] go to L[h]; S[h] is then thrown away */
+ internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
+ reiserfs_invalidate_buffer(tb, tbSh);
+
+ return;
+ }
+
+ if ( tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1 ) { /* join S[h] with R[h] */
+ RFALSE( tb->lnum[h] != 0,
+ "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
+ h, tb->lnum[h]);
+
+ /* all n + 1 pointers of S[h] go to R[h]; S[h] is then thrown away */
+ internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
+
+ reiserfs_invalidate_buffer(tb,tbSh);
+ return;
+ }
+
+ if ( tb->lnum[h] < 0 ) { /* borrow from left neighbor L[h] */
+ RFALSE( tb->rnum[h] != 0,
+ "wrong tb->rnum[%d]==%d when borrow from L[h]", h, tb->rnum[h]);
+ /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]);*/
+ internal_shift_right (INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]);
+ return;
+ }
+
+ if ( tb->rnum[h] < 0 ) { /* borrow from right neighbor R[h] */
+ RFALSE( tb->lnum[h] != 0,
+ "invalid tb->lnum[%d]==%d when borrow from R[h]",
+ h, tb->lnum[h]);
+ internal_shift_left (INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]);*/
+ return;
+ }
+
+ if ( tb->lnum[h] > 0 ) { /* split S[h] into two parts and put them into neighbors */
+ RFALSE( tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
+ "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
+ h, tb->lnum[h], h, tb->rnum[h], n);
+
+ internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]);*/
+ internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]);
+
+ reiserfs_invalidate_buffer (tb, tbSh);
+
+ return;
+ }
+ /* no combination of lnum/rnum matched any of the cases above */
+ reiserfs_panic (tb->tb_sb, "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
+ h, tb->lnum[h], h, tb->rnum[h]);
+}
+
+
+/*
+ * Overwrite the delimiting key of L[h] and S[h] - key number tb->lkey[h]
+ * of CFL[h] - with the given key and mark CFL[h] dirty.
+ * Nothing is done when S[h] holds no items.
+ */
+static void replace_lkey (
+    struct tree_balance * tb,
+    int h,
+    struct item_head * key
+    )
+{
+    RFALSE( tb->L[h] == NULL || tb->CFL[h] == NULL,
+            "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
+            tb->L[h], tb->CFL[h]);
+
+    if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) != 0) {
+        struct buffer_head * cfl = tb->CFL[h];
+
+        /* only the key part (KEY_SIZE bytes) of the item head is copied */
+        memcpy (B_N_PDELIM_KEY(cfl, tb->lkey[h]), key, KEY_SIZE);
+
+        do_balance_mark_internal_dirty (tb, cfl, 0);
+    }
+}
+
+
+/*
+ * Overwrite the delimiting key of S[h] and R[h] - key number tb->rkey[h]
+ * of CFR[h] - with the given key and mark CFR[h] dirty.
+ */
+static void replace_rkey (
+    struct tree_balance * tb,
+    int h,
+    struct item_head * key
+    )
+{
+    struct buffer_head * cfr;
+
+    RFALSE( tb->R[h] == NULL || tb->CFR[h] == NULL,
+            "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
+            tb->R[h], tb->CFR[h]);
+    RFALSE( B_NR_ITEMS(tb->R[h]) == 0,
+            "R[h] can not be empty if it exists (item number=%d)",
+            B_NR_ITEMS(tb->R[h]));
+
+    cfr = tb->CFR[h];
+
+    /* only the key part (KEY_SIZE bytes) of the item head is copied */
+    memcpy (B_N_PDELIM_KEY(cfr, tb->rkey[h]), key, KEY_SIZE);
+
+    do_balance_mark_internal_dirty (tb, cfr, 0);
+}
+
+
+/* Balance the internal node S[h] after level h-1 has been balanced.
+ *
+ * insert_num = tb->insert_size[h] / (KEY_SIZE + DC_SIZE) tells how many
+ * key/pointer pairs enter (positive) or leave (negative) S[h].  A negative
+ * value is handed over to balance_internal_when_delete().  Otherwise the
+ * function, in this order:
+ *  - shifts to L[h] when tb->lnum[h] > 0 and to R[h] when tb->rnum[h] > 0,
+ *    inserting those new pairs that fall into the neighbor there;
+ *  - invalidates S[h] when tb->blknum[h] == 0;
+ *  - creates a new root when there is no S[h];
+ *  - splits S[h] into S[h] and a fresh node S_new when tb->blknum[h] == 2;
+ *  - inserts whatever new pairs remain into S[h].
+ *
+ * On return the key insert_key pointed to on entry and insert_ptr[0] hold
+ * the key and node (S_new or NULL) to insert on the next level.
+ * Returns the position of S[h] in its parent, or 0 when there is no S[h].
+ */
+int balance_internal (struct tree_balance * tb, /* tree_balance structure */
+ int h, /* level of the tree */
+ int child_pos,
+ struct item_head * insert_key, /* key for insertion on higher level */
+ struct buffer_head ** insert_ptr /* node for insertion on higher level*/
+ )
+ /* if inserting/pasting
+ {
+ child_pos is the position of the node-pointer in S[h] that *
+ pointed to S[h-1] before balancing of the h-1 level; *
+ this means that new pointers and items must be inserted AFTER *
+ child_pos
+ }
+ else
+ {
+ it is the position of the leftmost pointer that must be deleted (together with
+ its corresponding key to the left of the pointer)
+ as a result of the previous level's balancing.
+ }
+*/
+{
+ struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h);
+ struct buffer_info bi;
+ int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */
+ int insert_num, n, k;
+ struct buffer_head * S_new;
+ struct item_head new_insert_key;
+ struct buffer_head * new_insert_ptr = NULL;
+ struct item_head * new_insert_key_addr = insert_key;
+
+ RFALSE( h < 1, "h (%d) can not be < 1 on internal level", h);
+
+ PROC_INFO_INC( tb -> tb_sb, balance_at[ h ] );
+
+ order = ( tbSh ) ? PATH_H_POSITION (tb->tb_path, h + 1)/*tb->S[h]->b_item_order*/ : 0;
+
+ /* Using insert_size[h] calculate the number insert_num of items
+ that must be inserted to or deleted from S[h]. */
+ insert_num = tb->insert_size[h]/((int)(KEY_SIZE + DC_SIZE));
+
+ /* Check whether insert_num is proper **/
+ RFALSE( insert_num < -2 || insert_num > 2,
+ "incorrect number of items inserted to the internal node (%d)",
+ insert_num);
+ RFALSE( h > 1 && (insert_num > 1 || insert_num < -1),
+ "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
+ insert_num, h);
+
+ /* Make balance in case insert_num < 0 */
+ if ( insert_num < 0 ) {
+ balance_internal_when_delete (tb, h, child_pos);
+ return order;
+ }
+
+ k = 0;
+ if ( tb->lnum[h] > 0 ) {
+ /* shift lnum[h] items from S[h] to the left neighbor L[h].
+ check how many of new items fall into L[h] or CFL[h] after
+ shifting */
+ n = B_NR_ITEMS (tb->L[h]); /* number of items in L[h] */
+ if ( tb->lnum[h] <= child_pos ) {
+ /* new items don't fall into L[h] or CFL[h] */
+ internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);
+ /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]);*/
+ child_pos -= tb->lnum[h];
+ } else if ( tb->lnum[h] > child_pos + insert_num ) {
+ /* all new items fall into L[h] */
+ internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num);
+ /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,
+ tb->lnum[h]-insert_num);
+ */
+ /* insert insert_num keys and node-pointers into L[h] */
+ bi.tb = tb;
+ bi.bi_bh = tb->L[h];
+ bi.bi_parent = tb->FL[h];
+ bi.bi_position = get_left_neighbor_position (tb, h);
+ internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next*/ n + child_pos + 1,
+ insert_num,insert_key,insert_ptr);
+
+ insert_num = 0;
+ } else {
+ struct disk_child * dc;
+
+ /* some items fall into L[h] or CFL[h], but some don't fall */
+ internal_shift1_left(tb,h,child_pos+1);
+ /* calculate number of new items that fall into L[h] */
+ k = tb->lnum[h] - child_pos - 1;
+ bi.tb = tb;
+ bi.bi_bh = tb->L[h];
+ bi.bi_parent = tb->FL[h];
+ bi.bi_position = get_left_neighbor_position (tb, h);
+ internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next,*/ n + child_pos + 1,k,
+ insert_key,insert_ptr);
+
+ replace_lkey(tb,h,insert_key + k);
+
+ /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */
+ dc = B_N_CHILD(tbSh, 0);
+ put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[k]) - B_FREE_SPACE (insert_ptr[k]));
+ put_dc_block_number( dc, insert_ptr[k]->b_blocknr );
+
+ do_balance_mark_internal_dirty (tb, tbSh, 0);
+
+ /* k + 1 of the new items are consumed: k went into L[h], one into CFL[h] and the first pointer of S[h] */
+ k++;
+ insert_key += k;
+ insert_ptr += k;
+ insert_num -= k;
+ child_pos = 0;
+ }
+ } /* tb->lnum[h] > 0 */
+
+ if ( tb->rnum[h] > 0 ) {
+ /*shift rnum[h] items from S[h] to the right neighbor R[h]*/
+ /* check how many of new items fall into R or CFR after shifting */
+ n = B_NR_ITEMS (tbSh); /* number of items in S[h] */
+ if ( n - tb->rnum[h] >= child_pos )
+ /* new items fall into S[h] */
+ /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]);*/
+ internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]);
+ else
+ if ( n + insert_num - tb->rnum[h] < child_pos )
+ {
+ /* all new items fall into R[h] */
+ /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],
+ tb->rnum[h] - insert_num);*/
+ internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num);
+
+ /* insert insert_num keys and node-pointers into R[h] */
+ bi.tb = tb;
+ bi.bi_bh = tb->R[h];
+ bi.bi_parent = tb->FR[h];
+ bi.bi_position = get_right_neighbor_position (tb, h);
+ internal_insert_childs (&bi, /*tb->R[h],tb->S[h-1]->b_next*/ child_pos - n - insert_num + tb->rnum[h] - 1,
+ insert_num,insert_key,insert_ptr);
+ insert_num = 0;
+ }
+ else
+ {
+ struct disk_child * dc;
+
+ /* one of the items falls into CFR[h] */
+ internal_shift1_right(tb,h,n - child_pos + 1);
+ /* calculate number of new items that fall into R[h] */
+ k = tb->rnum[h] - n + child_pos - 1;
+ bi.tb = tb;
+ bi.bi_bh = tb->R[h];
+ bi.bi_parent = tb->FR[h];
+ bi.bi_position = get_right_neighbor_position (tb, h);
+ internal_insert_childs (&bi, /*tb->R[h], tb->R[h]->b_child,*/ 0, k, insert_key + 1, insert_ptr + 1);
+
+ replace_rkey(tb,h,insert_key + insert_num - k - 1);
+
+ /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1]*/
+ dc = B_N_CHILD(tb->R[h], 0);
+ put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) -
+ B_FREE_SPACE (insert_ptr[insert_num-k-1]));
+ put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr );
+
+ do_balance_mark_internal_dirty (tb, tb->R[h],0);
+
+ insert_num -= (k + 1);
+ }
+ }
+
+ /** Fill new node that appears instead of S[h] **/
+ RFALSE( tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
+ RFALSE( tb->blknum[h] < 0, "blknum can not be < 0");
+
+ if ( ! tb->blknum[h] )
+ { /* node S[h] is empty now */
+ RFALSE( ! tbSh, "S[h] is equal NULL");
+
+ /* do what is needed for buffer thrown from tree */
+ reiserfs_invalidate_buffer(tb,tbSh);
+ return order;
+ }
+
+ if ( ! tbSh ) {
+ /* create new root */
+ struct disk_child * dc;
+ struct buffer_head * tbSh_1 = PATH_H_PBUFFER (tb->tb_path, h - 1);
+ struct block_head * blkh;
+
+
+ if ( tb->blknum[h] != 1 )
+ reiserfs_panic(NULL, "balance_internal: One new node required for creating the new root");
+ /* S[h] = empty buffer from the list FEB. */
+ tbSh = get_FEB (tb);
+ blkh = B_BLK_HEAD(tbSh);
+ set_blkh_level( blkh, h + 1 );
+
+ /* Put the unique node-pointer to S[h] that points to S[h-1]. */
+
+ dc = B_N_CHILD(tbSh, 0);
+ put_dc_block_number( dc, tbSh_1->b_blocknr );
+ put_dc_size( dc, (MAX_CHILD_SIZE (tbSh_1) - B_FREE_SPACE (tbSh_1)));
+
+ tb->insert_size[h] -= DC_SIZE;
+ set_blkh_free_space( blkh, blkh_free_space(blkh) - DC_SIZE );
+
+ do_balance_mark_internal_dirty (tb, tbSh, 0);
+
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+ check_internal (tbSh);
+ /*&&&&&&&&&&&&&&&&&&&&&&&&*/
+
+ /* put new root into path structure */
+ PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = tbSh;
+
+ /* Change root in structure super block. */
+ PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
+ PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
+ do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
+ }
+
+ if ( tb->blknum[h] == 2 ) {
+ int snum;
+ struct buffer_info dest_bi, src_bi;
+
+
+ /* S_new = free buffer from list FEB */
+ S_new = get_FEB(tb);
+
+ set_blkh_level( B_BLK_HEAD(S_new), h + 1 );
+
+ dest_bi.tb = tb;
+ dest_bi.bi_bh = S_new;
+ dest_bi.bi_parent = NULL;
+ dest_bi.bi_position = 0;
+ src_bi.tb = tb;
+ src_bi.bi_bh = tbSh;
+ src_bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
+ src_bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
+
+ n = B_NR_ITEMS (tbSh); /* number of items in S[h] */
+ /* S_new receives half (rounded down) of the n + 1 pointers of S[h] plus the insert_num new ones */
+ snum = (insert_num + n + 1)/2;
+ if ( n - snum >= child_pos ) {
+ /* new items don't fall into S_new */
+ /* store the delimiting key for the next level */
+ /* new_insert_key = (n - snum)'th key in S[h] */
+ memcpy (&new_insert_key,B_N_PDELIM_KEY(tbSh,n - snum),
+ KEY_SIZE);
+ /* last parameter is del_par */
+ internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0);
+ /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0);*/
+ } else if ( n + insert_num - snum < child_pos ) {
+ /* all new items fall into S_new */
+ /* store the delimiting key for the next level */
+ /* new_insert_key = (n + insert_item - snum)'th key in S[h] */
+ memcpy(&new_insert_key,B_N_PDELIM_KEY(tbSh,n + insert_num - snum),
+ KEY_SIZE);
+ /* last parameter is del_par */
+ internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0);
+ /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0);*/
+
+ /* insert insert_num keys and node-pointers into S_new */
+ internal_insert_childs (&dest_bi, /*S_new,tb->S[h-1]->b_next,*/child_pos - n - insert_num + snum - 1,
+ insert_num,insert_key,insert_ptr);
+
+ insert_num = 0;
+ } else {
+ struct disk_child * dc;
+
+ /* some items fall into S_new, but some don't fall */
+ /* last parameter is del_par */
+ internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1);
+ /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1);*/
+ /* calculate number of new items that fall into S_new */
+ k = snum - n + child_pos - 1;
+
+ internal_insert_childs (&dest_bi, /*S_new,*/ 0, k, insert_key + 1, insert_ptr+1);
+
+ /* new_insert_key = insert_key[insert_num - k - 1] */
+ memcpy(&new_insert_key,insert_key + insert_num - k - 1,
+ KEY_SIZE);
+ /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */
+
+ dc = B_N_CHILD(S_new,0);
+ put_dc_size( dc, (MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) -
+ B_FREE_SPACE(insert_ptr[insert_num-k-1])) );
+ put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr );
+
+ do_balance_mark_internal_dirty (tb, S_new,0);
+
+ insert_num -= (k + 1);
+ }
+ /* new_insert_ptr = node_pointer to S_new */
+ new_insert_ptr = S_new;
+
+ RFALSE (!buffer_journaled(S_new) || buffer_journal_dirty(S_new) ||
+ buffer_dirty (S_new),
+ "cm-00001: bad S_new (%b)", S_new);
+
+ // S_new is released in unfix_nodes
+ }
+
+ n = B_NR_ITEMS (tbSh); /*number of items in S[h] */
+
+ /* whatever new items are still pending go into S[h] itself */
+ if ( 0 <= child_pos && child_pos <= n && insert_num > 0 ) {
+ bi.tb = tb;
+ bi.bi_bh = tbSh;
+ bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
+ bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
+ internal_insert_childs (
+ &bi,/*tbSh,*/
+ /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next,*/
+ child_pos,insert_num,insert_key,insert_ptr
+ );
+ }
+
+
+ /* hand the key and node for the next level back through the caller's arrays.
+ NOTE(review): new_insert_key is written only in the blknum[h] == 2 branch above,
+ so in every other case this copies an unset stack value; insert_ptr[0] is
+ NULL then - presumably callers ignore the key in that case, confirm. */
+ memcpy (new_insert_key_addr,&new_insert_key,KEY_SIZE);
+ insert_ptr[0] = new_insert_ptr;
+
+ return order;
+ }
+
+
+
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
new file mode 100644
index 00000000000..7543031396f
--- /dev/null
+++ b/fs/reiserfs/inode.c
@@ -0,0 +1,2846 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/config.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/reiserfs_acl.h>
+#include <linux/reiserfs_xattr.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/quotaops.h>
+
+extern int reiserfs_default_io_size; /* default io size defined in super.c */
+
+static int reiserfs_commit_write(struct file *f, struct page *page,
+ unsigned from, unsigned to);
+static int reiserfs_prepare_write(struct file *f, struct page *page,
+ unsigned from, unsigned to);
+
+/*
+ * Remove the on-disk object that belongs to an inode which is being deleted.
+ *
+ * Under the reiserfs write lock: for an inode that is not I_NEW and has a
+ * non-zero objectid, the extended attributes are deleted, then the object
+ * itself is deleted inside one transaction, the quota is updated, and the
+ * "save" link is removed.  In every case the inode is cleared and i_blocks
+ * is reset before the lock is dropped.
+ *
+ * A transaction opened by journal_begin() is always closed by journal_end(),
+ * also when reiserfs_delete_object() fails; in that case the quota update
+ * and the removal of the "save" link are skipped.
+ */
+void reiserfs_delete_inode (struct inode * inode)
+{
+    /* We need blocks for transaction + (user+group) quota update (possibly delete) */
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS;
+    struct reiserfs_transaction_handle th ;
+    int err;
+
+    reiserfs_write_lock(inode->i_sb);
+
+    /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
+    if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
+        down (&inode->i_sem);
+
+        reiserfs_delete_xattrs (inode);
+
+        if (journal_begin(&th, inode->i_sb, jbegin_count)) {
+            up (&inode->i_sem);
+            goto out;
+        }
+        reiserfs_update_inode_transaction(inode) ;
+
+        /* remember the result; it is acted upon only after the transaction is closed */
+        err = reiserfs_delete_object (&th, inode);
+
+        /* Do quota update inside a transaction for journaled quotas. We must do that
+         * after delete_object so that quota updates go into the same transaction as
+         * stat data deletion */
+        if (!err) {
+            DQUOT_FREE_INODE(inode);
+        }
+
+        /* the transaction started above must be ended on the failure path too */
+        if (journal_end(&th, inode->i_sb, jbegin_count)) {
+            up (&inode->i_sem);
+            goto out;
+        }
+
+        up (&inode->i_sem);
+
+        /* the object could not be deleted: its "save" link has to stay */
+        if (err)
+            goto out;
+
+        /* all items of file are deleted, so we can remove "save" link */
+        remove_save_link (inode, 0/* not truncate */); /* we can't do anything
+                                                        * about an error here */
+    }
+    /* otherwise no object items are in the tree and there is nothing to delete */
+out:
+    clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */
+    inode->i_blocks = 0;
+    reiserfs_write_unlock(inode->i_sb);
+}
+
+/* Fill every field of the cpu key *key from the values passed in. */
+static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid,
+                           loff_t offset, int type, int length )
+{
+    /* version is stored before the offset/type setters run; presumably
+       they consult it - keep this order */
+    key->version = version;
+    key->key_length = length;
+
+    /* short key: directory id and object id */
+    key->on_disk_key.k_objectid = objectid;
+    key->on_disk_key.k_dir_id = dirid;
+
+    set_cpu_key_k_offset (key, offset);
+    set_cpu_key_k_type (key, type);
+}
+
+
+/* Build a cpu key for an object: the short key (dirid, objectid) and the key
+   version are taken from the inode, offset, type and length from the caller */
+void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset,
+                   int type, int length )
+{
+    int version = get_inode_item_key_version (inode);
+    /* the inode's key fields are converted with le32_to_cpu before use */
+    __u32 dirid = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
+    __u32 objectid = le32_to_cpu (INODE_PKEY (inode)->k_objectid);
+
+    _make_cpu_key (key, version, dirid, objectid, offset, type, length);
+}
+
+
+//
+// Fill an item head.  When key is 0 the short key (dir id, object id) of ih
+// is left untouched; version, offset, type, length and entry count are
+// stored in either case.
+//
+inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key,
+                               int version,
+                               loff_t offset, int type, int length,
+                               int entry_count/*or ih_free_space*/)
+{
+    if (key != NULL) {
+        ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid);
+        ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id);
+    }
+
+    // version goes in before offset and type are set
+    put_ih_version( ih, version );
+    set_le_ih_k_offset (ih, offset);
+    set_le_ih_k_type (ih, type);
+
+    // for directory items it is entry count, for directs and stat
+    // datas - 0xffff, for indirects - 0
+    put_ih_entry_count( ih, entry_count );
+    put_ih_item_len( ih, length );
+}
+
+//
+// FIXME: we might cache recently accessed indirect item
+
+// Ugh. Not too eager for that....
+// I cut the code until such time as I see a convincing argument (benchmark).
+// I don't want a bloated inode struct..., and I don't like code complexity....
+
+/* cutting the code is fine, since it really isn't in use yet and is easy
+** to add back in. But, Vladimir has a really good idea here. Think
+** about what happens for reading a file. For each page,
+** The VFS layer calls reiserfs_readpage, who searches the tree to find
+** an indirect item. This indirect item has X number of pointers, where
+** X is a big number if we've done the block allocation right. But,
+** we only use one or two of these pointers during each call to readpage,
+** needlessly researching again later on.
+**
+** The size of the cache could be dynamic based on the size of the file.
+**
+** I'd also like to see us cache the location the stat data item, since
+** we are needlessly researching for that frequently.
+**
+** --chris
+*/
+
+/* A page that holds a file tail and was read in by get_block_create_0 has
+** valid data, but the tail still lives in a direct item, so the page can
+** not be written to.  Walk all buffers of the page and unmap every mapped
+** buffer whose block number is 0, so that block_prepare_write will call
+** reiserfs_get_block and have the tail converted into an unformatted node.
+*/
+static inline void fix_tail_page_for_writing(struct page *page) {
+    struct buffer_head *first, *cur ;
+
+    /* nothing to do without a page or without buffers on it */
+    if (!page || !page_has_buffers(page))
+        return ;
+
+    first = page_buffers(page) ;
+    cur = first ;
+    do {
+        /* fetch the successor before the buffer is touched */
+        struct buffer_head *following = cur->b_this_page ;
+
+        if (buffer_mapped(cur) && cur->b_blocknr == 0)
+            reiserfs_unmap_buffer(cur) ;
+        cur = following ;
+    } while (cur != first) ;
+}
+
+/* Tell reiserfs_get_block whether a block has to be allocated: returns 0 if
+   one was allocated already, or if the search found a position inside an
+   indirect item whose pointer at pos_in_item is not a hole; 1 otherwise */
+static inline int allocation_needed (int retval, b_blocknr_t allocated,
+                                     struct item_head * ih,
+                                     __u32 * item, int pos_in_item)
+{
+    if (allocated)
+        return 0;
+
+    /* the item is looked at only for a found position in an indirect item */
+    if (retval != POSITION_FOUND)
+        return 1;
+    if (!is_indirect_le_ih (ih))
+        return 1;
+
+    return get_block_num(item, pos_in_item) ? 0 : 1;
+}
+
+/* Returns 1 if the search found a position and ih is an indirect item, else 0 */
+static inline int indirect_item_found (int retval, struct item_head * ih)
+{
+    if (retval != POSITION_FOUND)
+        return 0;
+    return is_indirect_le_ih (ih) ? 1 : 0;
+}
+
+
+/* Thin wrapper around map_bh(): maps bh to 'block' on the super block of inode */
+static inline void set_block_dev_mapped (struct buffer_head * bh,
+ b_blocknr_t block, struct inode * inode)
+{
+ map_bh(bh, inode->i_sb, block);
+}
+
+
+//
+// Files created in the earlier format (KEY_FORMAT_3_5) can not be longer
+// than 2 gb.  Returns 1 if 'block' may belong to the file, 0 if not.
+//
+static int file_capable (struct inode * inode, long block)
+{
+    // it is new file: no limit applies
+    if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5)
+        return 1;
+
+    // old file, but 'block' is inside of 2gb
+    if (block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
+        return 1;
+
+    return 0;
+}
+
+/* End the running transaction and begin a new one of
+** JOURNAL_PER_BALANCE_CNT * 6 blocks.  The path is released and the stat
+** data is updated before the old transaction ends.  Nothing is done, and 0
+** is returned, while the handle is nested (t_refcount > 1).  Returns 0 on
+** success, or the error of journal_end/journal_begin.
+*/
+/*static*/ int restart_transaction(struct reiserfs_transaction_handle *th,
+                                   struct inode *inode, struct path *path) {
+    struct super_block *s = th->t_super ;
+    int len = th->t_blocks_allocated ;
+    int err;
+
+    BUG_ON (!th->t_trans_id);
+    BUG_ON (!th->t_refcount);
+
+    /* we cannot restart while nested */
+    if (th->t_refcount > 1)
+        return 0 ;
+
+    pathrelse(path) ;
+    reiserfs_update_sd(th, inode) ;
+
+    err = journal_end(th, s, len) ;
+    if (err)
+        return err ;
+
+    err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ;
+    if (err)
+        return err ;
+
+    reiserfs_update_inode_transaction(inode) ;
+    return 0 ;
+}
+
+/*
+ * Called by get_block when create == 0: look up the 'block'-th logical
+ * block of the file without allocating anything.
+ *
+ * inode     - file being looked up
+ * block     - logical block number inside the file
+ * bh_result - buffer head to map; when a tail is read, its page receives
+ *             the data
+ * args      - GET_BLOCK_* flags; GET_BLOCK_NO_HOLE and
+ *             GET_BLOCK_READ_DIRECT are the ones examined here
+ *
+ * Outcomes:
+ *  - the search does not report POSITION_FOUND, or the position is in an
+ *    indirect item whose pointer is 0 (a hole): bh_result is not mapped
+ *    and 0 is returned, except that -ENOENT is returned when
+ *    GET_BLOCK_NO_HOLE is set and the page is not up to date;
+ *  - the position is in an indirect item with a non-zero pointer:
+ *    bh_result is mapped to that block (and flagged as a boundary buffer
+ *    when it is the last pointer of the item) and 0 is returned;
+ *  - the position is in a direct item: without GET_BLOCK_READ_DIRECT
+ *    (the bmap case) -ENOENT is returned; otherwise the tail is copied
+ *    into bh_result's page, bh_result is mapped to block 0 and marked
+ *    up to date, and 0 is returned.
+ */
+static int _get_block_create_0 (struct inode * inode, long block,
+ struct buffer_head * bh_result,
+ int args)
+{
+ INITIALIZE_PATH (path);
+ struct cpu_key key;
+ struct buffer_head * bh;
+ struct item_head * ih, tmp_ih;
+ int fs_gen ;
+ /* kept in b_blocknr_t, the block number type used elsewhere in this
+ ** file, and not in int: the pointers are read from a __u32 array and
+ ** one with the top bit set must not be stored as a negative value
+ */
+ b_blocknr_t blocknr;
+ char * p = NULL;
+ int chars;
+ int ret ;
+ int done = 0 ;
+ unsigned long offset ;
+
+ // prepare the key to look for the 'block'-th block of file
+ make_cpu_key (&key, inode,
+ (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3);
+
+research:
+ if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) {
+ pathrelse (&path);
+ if (p)
+ kunmap(bh_result->b_page) ;
+ // We do not return -ENOENT if there is a hole but the page is uptodate, because it means
+ // that there is some mmapped data associated with it that is yet to be written to disk.
+ if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
+ return -ENOENT ;
+ }
+ return 0 ;
+ }
+
+ bh = get_last_bh (&path);
+ ih = get_ih (&path);
+ if (is_indirect_le_ih (ih)) {
+ __u32 * ind_item = (__u32 *)B_I_PITEM (bh, ih);
+
+ /* FIXME: here we could cache indirect item or part of it in
+ the inode to avoid search_by_key in case of subsequent
+ access to file */
+ blocknr = get_block_num(ind_item, path.pos_in_item) ;
+ ret = 0 ;
+ if (blocknr) {
+ map_bh(bh_result, inode->i_sb, blocknr);
+ /* last pointer of this indirect item */
+ if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
+ set_buffer_boundary(bh_result);
+ }
+ } else
+ // We do not return -ENOENT if there is a hole but the page is uptodate, because it means
+ // that there is some mmapped data associated with it that is yet to be written to disk.
+ if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
+ ret = -ENOENT ;
+ }
+
+ pathrelse (&path);
+ if (p)
+ kunmap(bh_result->b_page) ;
+ return ret ;
+ }
+
+ // requested data are in direct item(s)
+ if (!(args & GET_BLOCK_READ_DIRECT)) {
+ // we are called by bmap. FIXME: we can not map block of file
+ // when it is stored in direct item(s)
+ pathrelse (&path);
+ if (p)
+ kunmap(bh_result->b_page) ;
+ return -ENOENT;
+ }
+
+ /* if we've got a direct item, and the buffer or page was uptodate,
+ ** we don't want to pull data off disk again. skip to the
+ ** end, where we map the buffer and return
+ */
+ if (buffer_uptodate(bh_result)) {
+ goto finished ;
+ } else
+ /*
+ ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
+ ** pages without any buffers. If the page is up to date, we don't want
+ ** read old data off disk. Set the up to date bit on the buffer instead
+ ** and jump to the end
+ */
+ if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
+ set_buffer_uptodate(bh_result);
+ goto finished ;
+ }
+
+ // read file tail into part of page
+ offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ;
+ fs_gen = get_generation(inode->i_sb) ;
+ copy_item_head (&tmp_ih, ih);
+
+ /* we only want to kmap if we are reading the tail into the page.
+ ** this is not the common case, so we don't kmap until we are
+ ** sure we need to. But, this means the item might move if
+ ** kmap schedules
+ */
+ if (!p) {
+ p = (char *)kmap(bh_result->b_page) ;
+ if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
+ goto research;
+ }
+ }
+ p += offset ;
+ memset (p, 0, inode->i_sb->s_blocksize);
+ do {
+ if (!is_direct_le_ih (ih)) {
+ BUG ();
+ }
+ /* make sure we don't read more bytes than actually exist in
+ ** the file. This can happen in odd cases where i_size isn't
+ ** correct, and when direct item padding results in a few
+ ** extra bytes at the end of the direct item
+ */
+ if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
+ break ;
+ if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
+ chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item;
+ done = 1 ;
+ } else {
+ chars = ih_item_len(ih) - path.pos_in_item;
+ }
+ memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars);
+
+ if (done)
+ break ;
+
+ p += chars;
+
+ if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1))
+ // we are done if the direct item just read is not the last
+ // item of the node. FIXME: we could try to check right
+ // delimiting key to see whether direct item continues in
+ // the right neighbor or rely on i_size
+ break;
+
+ // update key to look for the next piece
+ set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars);
+ if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND)
+ // we read something from tail, even if now we got IO_ERROR
+ break;
+ bh = get_last_bh (&path);
+ ih = get_ih (&path);
+ } while (1);
+
+ flush_dcache_page(bh_result->b_page) ;
+ kunmap(bh_result->b_page) ;
+
+finished:
+ pathrelse (&path);
+ /* this buffer has valid data, but isn't valid for io. mapping it to
+ * block #0 tells the rest of reiserfs it just has a tail in it
+ */
+ map_bh(bh_result, inode->i_sb, 0);
+ set_buffer_uptodate (bh_result);
+ return 0;
+}
+
+
+/*
+ * Used to build the file map, so _get_block_create_0 is told not to read
+ * direct items (args == 0).  Returns -EFBIG when the block is not
+ * addressable in this file; otherwise 0, whatever the lookup itself
+ * returned.
+ */
+static int reiserfs_bmap (struct inode * inode, sector_t block,
+			  struct buffer_head * bh_result, int create)
+{
+	int capable = file_capable (inode, block);
+
+	if (capable) {
+		reiserfs_write_lock(inode->i_sb);
+		/* do not read the direct item */
+		_get_block_create_0 (inode, block, bh_result, 0) ;
+		reiserfs_write_unlock(inode->i_sb);
+		return 0;
+	}
+	return -EFBIG;
+}
+
+/* Variant of get_block used only by grab_tail_page at present.  It is
+** handed to block_prepare_write; when asked for a block beyond the end
+** of the file, or for a block inside a hole, it yields -ENOENT rather
+** than a valid buffer.  block_prepare_write expects to be able to do
+** i/o on every buffer it gets back unless an error is returned too.
+**
+** This lets block_prepare_write read a single block of a page without
+** producing a valid page for holes or for offsets past the end of the
+** file, which is exactly what reading tails for conversion needs.
+**
+** The wrapper exists to force the flags: the VFS calls it with
+** create == 1, but reiserfs_get_block is always given
+** GET_BLOCK_NO_HOLE instead.  Do not use this function unless that is
+** the value you want passed down.
+*/
+static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block,
+					struct buffer_head * bh_result, int create)
+{
+	/* the caller's 'create' is deliberately not forwarded */
+	int forced_flags = GET_BLOCK_NO_HOLE;
+
+	return reiserfs_get_block(inode, block, bh_result, forced_flags) ;
+}
+
+/* Helper around reiserfs_get_block for the case where a direct_IO
+   request is being executed.  Returns 0 or a negative error. */
+static int reiserfs_get_blocks_direct_io(struct inode *inode,
+					 sector_t iblock,
+					 unsigned long max_blocks,
+					 struct buffer_head *bh_result,
+					 int create)
+{
+	int status;
+	int commit_err;
+
+	bh_result->b_page = NULL;
+
+	/* b_size has to be set before reiserfs_get_block runs, because
+	   convert_tail_for_hole(), which it may call, reads it */
+	bh_result->b_size = (1 << inode->i_blkbits);
+
+	status = reiserfs_get_block(inode, iblock, bh_result,
+				    create | GET_BLOCK_NO_DANGLE) ;
+	if (status)
+		return status;
+
+	/* don't allow direct io onto tail pages: a buffer mapped to block 0
+	** is unmapped again so that future calls to the direct io funcs for
+	** this offset in the file fail
+	*/
+	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
+		clear_buffer_mapped(bh_result);
+		status = -EINVAL ;
+	}
+
+	if (!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask))
+		return status;
+
+	/* Possible unpacked tail. Flush the data before pages have
+	   disappeared */
+	lock_kernel();
+	commit_err = reiserfs_commit_for_inode(inode);
+	REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+	unlock_kernel();
+
+	return (commit_err < 0) ? commit_err : status;
+}
+
+
+/*
+** helper function for when reiserfs_get_block is called for a hole
+** but the file tail is still in a direct item
+** bh_result is the buffer head for the hole
+** tail_offset is the offset of the start of the tail in the file
+**
+** This calls prepare_write, which will start a new transaction
+** you should not be in a transaction, or have any paths held when you
+** call this.
+**
+** Returns -EIO when tail_offset is not the first byte of a block (in the
+** 1-based offsets used here), -ENOMEM when the tail page cannot be
+** grabbed, and otherwise the result of reiserfs_prepare_write or, if
+** that succeeded, of reiserfs_commit_write.
+*/
+static int convert_tail_for_hole(struct inode *inode,
+ struct buffer_head *bh_result,
+ loff_t tail_offset) {
+ unsigned long index ;
+ unsigned long tail_end ;
+ unsigned long tail_start ;
+ struct page * tail_page ;
+ struct page * hole_page = bh_result->b_page ;
+ int retval = 0 ;
+
+ if ((tail_offset & (bh_result->b_size - 1)) != 1)
+ return -EIO ;
+
+ /* always try to read until the end of the block: tail_start is the
+ offset of the tail inside its page, tail_end is tail_start rounded
+ up to the next multiple of b_size */
+ tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
+ tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
+
+ index = tail_offset >> PAGE_CACHE_SHIFT ; /* page cache index of the page holding the tail */
+ /* hole_page can be zero in case of direct_io, we are sure
+ that we cannot get here if we write with O_DIRECT into
+ tail page */
+ if (!hole_page || index != hole_page->index) {
+ tail_page = grab_cache_page(inode->i_mapping, index) ;
+ retval = -ENOMEM; /* only returned if the grab just above failed */
+ if (!tail_page) {
+ goto out ;
+ }
+ } else {
+ tail_page = hole_page ;
+ }
+
+ /* we don't have to make sure the conversion did not happen while
+ ** we were locking the page because anyone that could convert
+ ** must first take i_sem.
+ **
+ ** We must fix the tail page for writing because it might have buffers
+ ** that are mapped, but have a block number of 0. This indicates tail
+ ** data that has been read directly into the page, and block_prepare_write
+ ** won't trigger a get_block in this case.
+ */
+ fix_tail_page_for_writing(tail_page) ;
+ retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+ if (retval)
+ goto unlock ;
+
+ /* tail conversion might change the data in the page */
+ flush_dcache_page(tail_page) ;
+
+ retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
+
+unlock:
+ if (tail_page != hole_page) { /* a page grabbed here is unlocked and released; hole_page is left alone */
+ unlock_page(tail_page) ;
+ page_cache_release(tail_page) ;
+ }
+out:
+ return retval ;
+}
+
+/*
+ * Allocate one block for an unformatted node of 'inode' and store its
+ * number in *allocated_block_nr.  When REISERFS_PREALLOCATE is defined
+ * and GET_BLOCK_NO_ISEM is not set in 'flags', reiserfs_new_unf_blocknrs2
+ * is used; in every other case reiserfs_new_unf_blocknrs is.  The return
+ * value is that of the allocator called.
+ */
+static inline int _allocate_block(struct reiserfs_transaction_handle *th,
+				  long block,
+				  struct inode *inode,
+				  b_blocknr_t *allocated_block_nr,
+				  struct path * path,
+				  int flags)
+{
+	BUG_ON (!th->t_trans_id);
+
+#ifdef REISERFS_PREALLOCATE
+	if ((flags & GET_BLOCK_NO_ISEM) == 0)
+		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block);
+#endif
+
+	return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block);
+}
+
+/*
+ * Map logical block 'block' of 'inode' to a disk block in bh_result.
+ *
+ * Without GET_BLOCK_CREATE in 'create' this is a pure lookup, handed to
+ * _get_block_create_0() with GET_BLOCK_READ_DIRECT added.  With it, the
+ * block is allocated if needed: a hole in an indirect item is plugged,
+ * a direct item (tail) is converted to an unformatted node, and the
+ * file is extended with hole pointers up to 'block' when the position
+ * lies past the last item.
+ *
+ * Returns 0 on success or a negative errno.  The values set in this
+ * function are -EIO, -EFBIG, -ENOMEM, -ENOSPC, -EDQUOT and -EEXIST;
+ * errors returned by the helpers it calls are passed through.
+ *
+ * The reiserfs write lock of the super block is held for the whole call.
+ */
+int reiserfs_get_block (struct inode * inode, sector_t block,
+ struct buffer_head * bh_result, int create)
+{
+ int repeat, retval = 0;
+ b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int
+ INITIALIZE_PATH(path);
+ int pos_in_item;
+ struct cpu_key key;
+ struct buffer_head * bh, * unbh = NULL;
+ struct item_head * ih, tmp_ih;
+ __u32 * item;
+ int done;
+ int fs_gen;
+ struct reiserfs_transaction_handle *th = NULL;
+ /* space reserved in transaction batch:
+ . 3 balancings in direct->indirect conversion
+ . 1 block involved into reiserfs_update_sd()
+ XXX in practically impossible worst case direct2indirect()
+ can incur (much) more than 3 balancings.
+ quota update for user, group */
+ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS;
+ int version;
+ int dangle = 1;
+ loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
+
+ /* bad.... */
+ reiserfs_write_lock(inode->i_sb);
+ version = get_inode_item_key_version (inode);
+
+ /* NOTE(review): if sector_t is an unsigned type this test can never
+ be true - confirm against the sector_t definition */
+ if (block < 0) {
+ reiserfs_write_unlock(inode->i_sb);
+ return -EIO;
+ }
+
+ if (!file_capable (inode, block)) {
+ reiserfs_write_unlock(inode->i_sb);
+ return -EFBIG;
+ }
+
+ /* if !create, we aren't changing the FS, so we don't need to
+ ** log anything, so we don't need to start a transaction
+ */
+ if (!(create & GET_BLOCK_CREATE)) {
+ int ret ;
+ /* find number of block-th logical block of the file */
+ ret = _get_block_create_0 (inode, block, bh_result,
+ create | GET_BLOCK_READ_DIRECT) ;
+ reiserfs_write_unlock(inode->i_sb);
+ return ret;
+ }
+ /*
+ * if we're already in a transaction, make sure to close
+ * any new transactions we start in this func
+ */
+ if ((create & GET_BLOCK_NO_DANGLE) ||
+ reiserfs_transaction_running(inode->i_sb))
+ dangle = 0;
+
+ /* If file is of such a size, that it might have a tail and tails are enabled
+ ** we should mark it as possibly needing tail packing on close
+ */
+ if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) ||
+ (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
+ REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
+
+ /* set the key of the first byte in the 'block'-th block of file */
+ make_cpu_key (&key, inode, new_offset,
+ TYPE_ANY, 3/*key length*/);
+ /* the block reaches past i_size: a transaction is started right away.
+ The start_trans label is also the target of the jumps taken later
+ when a transaction turns out to be needed */
+ if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
+start_trans:
+ th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
+ if (!th) {
+ retval = -ENOMEM;
+ goto failure;
+ }
+ reiserfs_update_inode_transaction(inode) ;
+ }
+ research:
+
+ retval = search_for_position_by_key (inode->i_sb, &key, &path);
+ if (retval == IO_ERROR) {
+ retval = -EIO;
+ goto failure;
+ }
+
+ bh = get_last_bh (&path);
+ ih = get_ih (&path);
+ item = get_item (&path);
+ pos_in_item = path.pos_in_item;
+
+ fs_gen = get_generation (inode->i_sb);
+ copy_item_head (&tmp_ih, ih);
+
+ if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
+ /* we have to allocate block for the unformatted node */
+ if (!th) {
+ pathrelse(&path) ;
+ goto start_trans;
+ }
+
+ repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
+
+ if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
+ /* restart the transaction to give the journal a chance to free
+ ** some blocks. releases the path, so we have to go back to
+ ** research if we succeed on the second try
+ */
+ SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
+ retval = restart_transaction(th, inode, &path) ;
+ if (retval)
+ goto failure;
+ repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
+
+ if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
+ goto research ;
+ }
+ if (repeat == QUOTA_EXCEEDED)
+ retval = -EDQUOT;
+ else
+ retval = -ENOSPC;
+ goto failure;
+ }
+
+ if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
+ goto research;
+ }
+ }
+
+ if (indirect_item_found (retval, ih)) {
+ b_blocknr_t unfm_ptr;
+ /* 'block'-th block is in the file already (there is
+ corresponding cell in some indirect item). But it may be
+ zero unformatted node pointer (hole) */
+ unfm_ptr = get_block_num (item, pos_in_item);
+ if (unfm_ptr == 0) {
+ /* use allocated block to plug the hole */
+ reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
+ if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
+ reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
+ goto research;
+ }
+ set_buffer_new(bh_result);
+ if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
+ reiserfs_add_ordered_list(inode, bh_result);
+ put_block_num(item, pos_in_item, allocated_block_nr) ;
+ unfm_ptr = allocated_block_nr;
+ journal_mark_dirty (th, inode->i_sb, bh);
+ reiserfs_update_sd(th, inode) ;
+ }
+ set_block_dev_mapped(bh_result, unfm_ptr, inode);
+ pathrelse (&path);
+ retval = 0;
+ if (!dangle && th)
+ retval = reiserfs_end_persistent_transaction(th);
+
+ reiserfs_write_unlock(inode->i_sb);
+
+ /* the item was found, so new blocks were not added to the file
+ ** there is no need to make sure the inode is updated with this
+ ** transaction
+ */
+ return retval;
+ }
+
+ if (!th) {
+ pathrelse(&path) ;
+ goto start_trans;
+ }
+
+ /* desired position is not found or is in the direct item. We have
+ to append file with holes up to 'block'-th block converting
+ direct items to indirect one if necessary */
+ done = 0;
+ do {
+ if (is_statdata_le_ih (ih)) {
+ __u32 unp = 0;
+ struct cpu_key tmp_key;
+
+ /* indirect item has to be inserted */
+ make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT,
+ UNFM_P_SIZE, 0/* free_space */);
+
+ if (cpu_key_k_offset (&key) == 1) {
+ /* we are going to add 'block'-th block to the file. Use
+ allocated block for that */
+ unp = cpu_to_le32 (allocated_block_nr);
+ set_block_dev_mapped (bh_result, allocated_block_nr, inode);
+ set_buffer_new(bh_result);
+ done = 1;
+ }
+ tmp_key = key; // ;)
+ set_cpu_key_k_offset (&tmp_key, 1);
+ PATH_LAST_POSITION(&path) ++;
+
+ retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
+ if (retval) {
+ reiserfs_free_block (th, inode, allocated_block_nr, 1);
+ goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
+ }
+ //mark_tail_converted (inode);
+ } else if (is_direct_le_ih (ih)) {
+ /* direct item has to be converted */
+ loff_t tail_offset;
+
+ tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
+ if (tail_offset == cpu_key_k_offset (&key)) {
+ /* direct item we just found fits into block we have
+ to map. Convert it into unformatted node: use
+ bh_result for the conversion */
+ set_block_dev_mapped (bh_result, allocated_block_nr, inode);
+ unbh = bh_result;
+ done = 1;
+ } else {
+ /* we have to pad the file tail stored in direct item(s)
+ up to block size and convert it to unformatted
+ node. FIXME: this should also get into page cache */
+
+ pathrelse(&path) ;
+ /*
+ * ugly, but we can only end the transaction if
+ * we aren't nested
+ */
+ BUG_ON (!th->t_refcount);
+ if (th->t_refcount == 1) {
+ retval = reiserfs_end_persistent_transaction(th);
+ th = NULL;
+ if (retval)
+ goto failure;
+ }
+
+ retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
+ if (retval) {
+ if ( retval != -ENOSPC )
+ reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ;
+ if (allocated_block_nr) {
+ /* the bitmap, the super, and the stat data == 3 */
+ if (!th)
+ th = reiserfs_persistent_transaction(inode->i_sb,3);
+ if (th)
+ reiserfs_free_block (th,inode,allocated_block_nr,1);
+ }
+ goto failure ;
+ }
+ goto research ;
+ }
+ retval = direct2indirect (th, inode, &path, unbh, tail_offset);
+ if (retval) {
+ reiserfs_unmap_buffer(unbh);
+ reiserfs_free_block (th, inode, allocated_block_nr, 1);
+ goto failure;
+ }
+ /* it is important the set_buffer_uptodate is done after
+ ** the direct2indirect. The buffer might contain valid
+ ** data newer than the data on disk (read by readpage, changed,
+ ** and then sent here by writepage). direct2indirect needs
+ ** to know if unbh was already up to date, so it can decide
+ ** if the data in unbh needs to be replaced with data from
+ ** the disk
+ */
+ set_buffer_uptodate (unbh);
+
+ /* unbh->b_page == NULL in case of DIRECT_IO request, this means
+ buffer will disappear shortly, so it is neither added to the
+ tail list nor marked dirty
+ */
+ if ( unbh->b_page ) {
+ /* we've converted the tail, so we must
+ ** flush unbh before the transaction commits
+ */
+ reiserfs_add_tail_list(inode, unbh) ;
+
+ /* mark it dirty now to prevent commit_write from adding
+ ** this buffer to the inode's dirty buffer list
+ */
+ /*
+ * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
+ * It's still atomic, but it sets the page dirty too,
+ * which makes it eligible for writeback at any time by the
+ * VM (which was also the case with __mark_buffer_dirty())
+ */
+ mark_buffer_dirty(unbh) ;
+ }
+ } else {
+ /* append indirect item with holes if needed, when appending
+ pointer to 'block'-th block use block, which is already
+ allocated */
+ struct cpu_key tmp_key;
+ unp_t unf_single=0; // We use this in case we need to allocate only
+ // one block which is a fastpath
+ unp_t *un;
+ __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE;
+ __u64 blocks_needed;
+
+ RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
+ "vs-804: invalid position for append");
+ /* indirect item has to be appended, set up key of that position */
+ make_cpu_key (&tmp_key, inode,
+ le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize),
+ //pos_in_item * inode->i_sb->s_blocksize,
+ TYPE_INDIRECT, 3);// key type is unimportant
+
+ /* the append position must not lie beyond the block being mapped.
+ The two offsets are compared directly: blocks_needed is a __u64,
+ so testing it for "< 0" after the subtraction could never fire */
+ RFALSE( cpu_key_k_offset (&tmp_key) > cpu_key_k_offset (&key),
+ "green-805: invalid offset");
+ blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits);
+
+ if ( blocks_needed == 1 ) {
+ un = &unf_single;
+ } else {
+ un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE,
+ GFP_ATOMIC); // We need to avoid scheduling.
+ if ( !un) {
+ /* no memory for the array: fall back to pasting a single
+ pointer from the stack per iteration */
+ un = &unf_single;
+ blocks_needed = 1;
+ max_to_insert = 0;
+ } else
+ memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert));
+ }
+ if ( blocks_needed <= max_to_insert) {
+ /* we are going to add target block to the file. Use allocated
+ block for that */
+ un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr);
+ set_block_dev_mapped (bh_result, allocated_block_nr, inode);
+ set_buffer_new(bh_result);
+ done = 1;
+ } else {
+ /* paste hole to the indirect item */
+ /* If kmalloc failed, max_to_insert becomes zero and it means we
+ only have space for one block */
+ blocks_needed=max_to_insert?max_to_insert:1;
+ }
+ retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
+
+ /* blocks_needed == 1 means 'un' points at unf_single on the stack */
+ if (blocks_needed != 1)
+ kfree(un);
+
+ if (retval) {
+ reiserfs_free_block (th, inode, allocated_block_nr, 1);
+ goto failure;
+ }
+ if (!done) {
+ /* We need to mark new file size in case this function will be
+ interrupted/aborted later on. And we may do this only for
+ holes. */
+ inode->i_size += inode->i_sb->s_blocksize * blocks_needed;
+ }
+ }
+
+ if (done == 1)
+ break;
+
+ /* this loop could log more blocks than we had originally asked
+ ** for. So, we have to allow the transaction to end if it is
+ ** too big or too full. Update the inode so things are
+ ** consistent if we crash before the function returns
+ **
+ ** release the path so that anybody waiting on the path before
+ ** ending their transaction will be able to continue.
+ */
+ if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+ retval = restart_transaction(th, inode, &path) ;
+ if (retval)
+ goto failure;
+ }
+ /* inserting indirect pointers for a hole can take a
+ ** long time. reschedule if needed
+ */
+ cond_resched();
+
+ retval = search_for_position_by_key (inode->i_sb, &key, &path);
+ if (retval == IO_ERROR) {
+ retval = -EIO;
+ goto failure;
+ }
+ if (retval == POSITION_FOUND) {
+ reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: "
+ "%K should not be found", &key);
+ retval = -EEXIST;
+ if (allocated_block_nr)
+ reiserfs_free_block (th, inode, allocated_block_nr, 1);
+ pathrelse(&path) ;
+ goto failure;
+ }
+ bh = get_last_bh (&path);
+ ih = get_ih (&path);
+ item = get_item (&path);
+ pos_in_item = path.pos_in_item;
+ } while (1);
+
+
+ retval = 0;
+
+ failure:
+ /* reached on success as well as on error: the transaction is ended
+ here when it must not be left dangling, or when an error occurred
+ and the handle no longer has a transaction id */
+ if (th && (!dangle || (retval && !th->t_trans_id))) {
+ int err;
+ if (th->t_trans_id)
+ reiserfs_update_sd(th, inode);
+ err = reiserfs_end_persistent_transaction(th);
+ if (err)
+ retval = err;
+ }
+
+ reiserfs_write_unlock(inode->i_sb);
+ reiserfs_check_path(&path) ;
+ return retval;
+}
+
+/* readpages entry point: hands the page list to mpage_readpages with
+ * reiserfs_get_block as the block mapping callback and returns its
+ * result.  'file' is not used. */
+static int
+reiserfs_readpages(struct file *file, struct address_space *mapping,
+		   struct list_head *pages, unsigned nr_pages)
+{
+	int ret;
+
+	ret = mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
+	return ret;
+}
+
+/* Compute the real number of bytes used by a file.
+ * This function and the two after it can go away once there is enough
+ * space in the stat item.
+ *
+ * For symlinks and directories the result is just sd_size.  For other
+ * files it is sd_size plus UNFM_P_SIZE for every block needed to hold
+ * i_size bytes.
+ */
+static int real_space_diff(struct inode *inode, int sd_size)
+{
+	loff_t blocksize = inode->i_sb->s_blocksize ;
+	loff_t nr_blocks;
+
+	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
+		return sd_size ;
+
+	/* End of file is also in full block with indirect reference, so
+	** round up to the next block.
+	**
+	** there is just no way to know if the tail is actually packed
+	** on the file, so we have to assume it isn't. When we pack the
+	** tail, we add 4 bytes to pretend there really is an unformatted
+	** node pointer
+	*/
+	nr_blocks = (inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits;
+	return nr_blocks * UNFM_P_SIZE + sd_size;
+}
+
+/* Turn the block count stored in the stat data ('blocks', in units of
+ * 512 bytes) into a byte count: real_space_diff() is added on top of
+ * i_size for symlinks and directories, and on top of blocks << 9 for
+ * everything else. */
+static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
+					int sd_size)
+{
+	loff_t overhead = (loff_t)real_space_diff(inode, sd_size);
+
+	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
+		return inode->i_size + overhead;
+
+	return overhead + (((loff_t)blocks) << 9);
+}
+
+/* Compute the number of blocks used by a file in ReiserFS counting:
+ * the inode's byte count minus real_space_diff(), in units of 512
+ * bytes. */
+static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
+{
+	loff_t used = inode_get_bytes(inode) ;
+	loff_t overhead = real_space_diff(inode, sd_size) ;
+	int is_link_or_dir = S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode);
+
+	/* keeps fsck and non-quota versions of reiserfs happy */
+	if (is_link_or_dir)
+		used += (loff_t)511 ;
+
+	/* files from before the quota patch might have i_blocks such that
+	** used < overhead. Deal with that here to prevent the result from
+	** going negative.
+	*/
+	return (used < overhead) ? 0 : (used - overhead) >> 9;
+}
+
+//
+// BAD: new directories have stat data of the new type while all their
+// other items are of the old type. The version stored in the inode
+// describes the body items, so update_stat_data cannot rely on the
+// inode and has to check the item version directly
+//
+
+// called by read_locked_inode: fills 'inode' from the stat data item that 'path' points to, then releases 'path'
+static void init_inode (struct inode * inode, struct path * path)
+{
+ struct buffer_head * bh;
+ struct item_head * ih;
+ __u32 rdev;
+ //int version = ITEM_VERSION_1;
+
+ bh = PATH_PLAST_BUFFER (path);
+ ih = PATH_PITEM_HEAD (path);
+
+
+ copy_key (INODE_PKEY (inode), &(ih->ih_key));
+ inode->i_blksize = reiserfs_default_io_size;
+
+ INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
+ REISERFS_I(inode)->i_flags = 0;
+ REISERFS_I(inode)->i_prealloc_block = 0;
+ REISERFS_I(inode)->i_prealloc_count = 0;
+ REISERFS_I(inode)->i_trans_id = 0;
+ REISERFS_I(inode)->i_jl = NULL;
+ REISERFS_I(inode)->i_acl_access = NULL;
+ REISERFS_I(inode)->i_acl_default = NULL;
+ init_rwsem (&REISERFS_I(inode)->xattr_sem);
+
+ if (stat_data_v1 (ih)) { /* old (v1) stat data */
+ struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
+ unsigned long blocks;
+
+ set_inode_item_key_version (inode, KEY_FORMAT_3_5);
+ set_inode_sd_version (inode, STAT_DATA_V1);
+ inode->i_mode = sd_v1_mode(sd);
+ inode->i_nlink = sd_v1_nlink(sd);
+ inode->i_uid = sd_v1_uid(sd);
+ inode->i_gid = sd_v1_gid(sd);
+ inode->i_size = sd_v1_size(sd);
+ inode->i_atime.tv_sec = sd_v1_atime(sd);
+ inode->i_mtime.tv_sec = sd_v1_mtime(sd);
+ inode->i_ctime.tv_sec = sd_v1_ctime(sd);
+ inode->i_atime.tv_nsec = 0;
+ inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime.tv_nsec = 0;
+
+ inode->i_blocks = sd_v1_blocks(sd);
+ inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
+ blocks = (inode->i_size + 511) >> 9; /* i_size in 512-byte units, rounded up */
+ blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9);
+ if (inode->i_blocks > blocks) {
+ // there was a bug in <=3.5.23 when i_blocks could take negative
+ // values. Starting from 3.5.17 this value could even be stored in
+ // stat data. For such files we set i_blocks based on file
+ // size. Just 2 notes: this can be wrong for sparse files. On-disk value will be
+ // only updated if file's inode will ever change
+ inode->i_blocks = blocks;
+ }
+
+ rdev = sd_v1_rdev(sd);
+ REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd);
+ /* an early bug in the quota code can give us an odd number for the
+ ** block count. This is incorrect, fix it here.
+ */
+ if (inode->i_blocks & 1) {
+ inode->i_blocks++ ;
+ }
+ inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
+ SD_V1_SIZE));
+ /* nopack is initially zero for v1 objects. For v2 objects,
+ nopack is initialised from sd_attrs */
+ REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
+ } else {
+ // new stat data found, but object may have old items
+ // (directories and symlinks)
+ struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih);
+
+ inode->i_mode = sd_v2_mode(sd);
+ inode->i_nlink = sd_v2_nlink(sd);
+ inode->i_uid = sd_v2_uid(sd);
+ inode->i_size = sd_v2_size(sd);
+ inode->i_gid = sd_v2_gid(sd);
+ inode->i_mtime.tv_sec = sd_v2_mtime(sd);
+ inode->i_atime.tv_sec = sd_v2_atime(sd);
+ inode->i_ctime.tv_sec = sd_v2_ctime(sd);
+ inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime.tv_nsec = 0;
+ inode->i_atime.tv_nsec = 0;
+ inode->i_blocks = sd_v2_blocks(sd);
+ rdev = sd_v2_rdev(sd);
+ if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) ) /* device nodes take the generation from the key's k_dir_id */
+ inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
+ else
+ inode->i_generation = sd_v2_generation(sd);
+
+ if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode)) /* directories and symlinks keep old-format items */
+ set_inode_item_key_version (inode, KEY_FORMAT_3_5);
+ else
+ set_inode_item_key_version (inode, KEY_FORMAT_3_6);
+ REISERFS_I(inode)->i_first_direct_byte = 0;
+ set_inode_sd_version (inode, STAT_DATA_V2);
+ inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
+ SD_V2_SIZE));
+ /* read persistent inode attributes from sd and initialise
+ generic inode flags from them */
+ REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd );
+ sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode );
+ }
+
+ pathrelse (path); /* bh, ih and sd are not used past this point */
+ if (S_ISREG (inode->i_mode)) {
+ inode->i_op = &reiserfs_file_inode_operations;
+ inode->i_fop = &reiserfs_file_operations;
+ inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
+ } else if (S_ISDIR (inode->i_mode)) {
+ inode->i_op = &reiserfs_dir_inode_operations;
+ inode->i_fop = &reiserfs_dir_operations;
+ } else if (S_ISLNK (inode->i_mode)) {
+ inode->i_op = &reiserfs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+ } else { /* anything else is set up through init_special_inode with the decoded rdev */
+ inode->i_blocks = 0;
+ inode->i_op = &reiserfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+ }
+}
+
+
+/* Fill a new-format (v2) stat data item at 'sd' from the fields of
+ * 'inode'.  The file size stored is 'size', not inode->i_size. */
+static void inode2sd (void * sd, struct inode * inode, loff_t size)
+{
+	struct stat_data * dst = (struct stat_data *)sd;
+	__u16 attrs;
+
+	set_sd_v2_mode(dst, inode->i_mode );
+	set_sd_v2_nlink(dst, inode->i_nlink );
+	set_sd_v2_uid(dst, inode->i_uid );
+	set_sd_v2_size(dst, size );
+	set_sd_v2_gid(dst, inode->i_gid );
+	set_sd_v2_mtime(dst, inode->i_mtime.tv_sec );
+	set_sd_v2_atime(dst, inode->i_atime.tv_sec );
+	set_sd_v2_ctime(dst, inode->i_ctime.tv_sec );
+	set_sd_v2_blocks(dst, to_fake_used_blocks(inode, SD_V2_SIZE));
+
+	/* device nodes store the encoded device number, every other
+	 * object stores the generation */
+	if (!S_ISCHR(inode->i_mode) && !S_ISBLK(inode->i_mode))
+		set_sd_v2_generation(dst, inode->i_generation);
+	else
+		set_sd_v2_rdev(dst, new_encode_dev(inode->i_rdev));
+
+	attrs = REISERFS_I(inode)->i_attrs;
+	i_attrs_to_sd_attrs( inode, &attrs );
+	set_sd_v2_attrs( dst, attrs );
+}
+
+
+/* Fill an old-format (v1) stat data item at 'sd' from the fields of
+ * 'inode'.  The file size stored is 'size', not inode->i_size. */
+static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size)
+{
+	struct stat_data_v1 * dst = (struct stat_data_v1 *)sd;
+
+	set_sd_v1_mode(dst, inode->i_mode );
+	set_sd_v1_uid(dst, inode->i_uid );
+	set_sd_v1_gid(dst, inode->i_gid );
+	set_sd_v1_nlink(dst, inode->i_nlink );
+	set_sd_v1_size(dst, size );
+	set_sd_v1_atime(dst, inode->i_atime.tv_sec );
+	set_sd_v1_ctime(dst, inode->i_ctime.tv_sec );
+	set_sd_v1_mtime(dst, inode->i_mtime.tv_sec );
+
+	/* device nodes store the encoded device number, every other
+	 * object stores the block count */
+	if (!S_ISCHR(inode->i_mode) && !S_ISBLK(inode->i_mode))
+		set_sd_v1_blocks(dst, to_fake_used_blocks(inode, SD_V1_SIZE));
+	else
+		set_sd_v1_rdev(dst, new_encode_dev(inode->i_rdev));
+
+	// Sigh. i_first_direct_byte is back
+	set_sd_v1_first_direct_byte(dst, REISERFS_I(inode)->i_first_direct_byte);
+}
+
+
+/* Copy the fields of 'inode', with 'size' as the file size, into the
+** stat data item that 'path' points to.  The layout written (v1 or v2)
+** is chosen from the item head found on the path, not from the inode.
+**
+** NOTE, you must prepare the buffer head before sending it here,
+** and then log it after the call
+*/
+static void update_stat_data (struct path * path, struct inode * inode,
+			      loff_t size)
+{
+	struct buffer_head * leaf = PATH_PLAST_BUFFER (path);
+	struct item_head * head = PATH_PITEM_HEAD (path);
+
+	if (!is_statdata_le_ih (head))
+		reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h",
+				INODE_PKEY (inode), head);
+
+	if (!stat_data_v1 (head)) {
+		inode2sd (B_I_PITEM (leaf, head), inode, size);
+		return;
+	}
+
+	// path points to old stat data
+	inode2sd_v1 (B_I_PITEM (leaf, head), inode, size);
+}
+
+
+void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
+ struct inode * inode, loff_t size)
+{ /* Find the stat data item of 'inode' in the tree, copy the inode
+ ** fields into it with 'size' as the file size, and mark the buffer
+ ** that holds it dirty in the journal for transaction 'th'. On an i/o
+ ** error, or when the stat data item is not found, the function
+ ** returns without updating anything; it has no return value to
+ ** report this.
+ */
+ struct cpu_key key;
+ INITIALIZE_PATH(path);
+ struct buffer_head *bh ;
+ int fs_gen ;
+ struct item_head *ih, tmp_ih ;
+ int retval;
+
+ BUG_ON (!th->t_trans_id);
+
+ make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant
+
+ for(;;) {
+ int pos;
+ /* look for the object's stat data */
+ retval = search_item (inode->i_sb, &key, &path);
+ if (retval == IO_ERROR) {
+ reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: "
+ "i/o failure occurred trying to update %K stat data",
+ &key);
+ return;
+ }
+ if (retval == ITEM_NOT_FOUND) {
+ pos = PATH_LAST_POSITION (&path);
+ pathrelse(&path) ;
+ if (inode->i_nlink == 0) { /* no warning is printed for an object with no links left */
+ /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/
+ return;
+ }
+ reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: "
+ "stat data of object %k (nlink == %d) not found (pos %d)",
+ INODE_PKEY (inode), inode->i_nlink, pos);
+ reiserfs_check_path(&path) ;
+ return;
+ }
+
+ /* sigh, prepare_for_journal might schedule. When it schedules the
+ ** FS might change. We have to detect that, and loop back to the
+ ** search if the stat data item has moved. The item head is copied
+ ** and the generation sampled before the call so that the move can
+ ** be detected afterwards.
+ */
+ bh = get_last_bh(&path) ;
+ ih = get_ih(&path) ;
+ copy_item_head (&tmp_ih, ih);
+ fs_gen = get_generation (inode->i_sb);
+ reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
+ if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
+ reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
+ continue ; /* Stat_data item has been moved after scheduling. */
+ }
+ break;
+ }
+ update_stat_data (&path, inode, size);
+ journal_mark_dirty(th, th->t_super, bh) ;
+ pathrelse (&path);
+ return;
+}
+
+/*
+ * Mark an inode bad after a failed read from disk.
+ *
+ * reiserfs_read_locked_inode() uses this when things go wrong.  On top
+ * of make_bad_inode() the key kept in the private part of the inode is
+ * zeroed, otherwise a corresponding iput might try to delete whatever
+ * object the inode last represented.
+ */
+static void reiserfs_make_bad_inode(struct inode *inode)
+{
+    void *stale_key = INODE_PKEY(inode);
+
+    memset(stale_key, 0, KEY_SIZE);
+    make_bad_inode(inode);
+}
+
+/*
+ * Initializer that reiserfs_iget() hands to iget5_locked(): records the
+ * objectid (as i_ino) and the directory id (little-endian, in the inode's
+ * private key) taken from the struct reiserfs_iget_args passed in 'p'.
+ * Always returns 0.
+ *
+ * Historical note: initially this function was derived from minix or
+ * ext2's analog and evolved as the prototype did.
+ */
+int reiserfs_init_locked_inode (struct inode * inode, void *p)
+{
+    const struct reiserfs_iget_args *iget_args = p;
+
+    INODE_PKEY(inode)->k_dir_id = cpu_to_le32(iget_args->dirid);
+    inode->i_ino = iget_args->objectid;
+
+    return 0;
+}
+
+/* looks for stat data in the tree, and fills up the fields of in-core
+ inode stat data fields.
+
+ The search key is built from args->dirid and inode->i_ino. On an i/o
+ error, or when no stat data item is found, the inode is turned into a
+ bad inode through reiserfs_make_bad_inode() (in the not-found case
+ i_nlink is also set to 0). An inode that was read successfully but has
+ nlink == 0 is made bad as well, unless s_is_unlinked_ok is set in the
+ reiserfs superblock info. Nothing is returned; callers detect failure
+ through the bad-inode state. */
+void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args)
+{
+ INITIALIZE_PATH (path_to_sd);
+ struct cpu_key key;
+ unsigned long dirino;
+ int retval;
+
+ dirino = args->dirid ;
+
+ /* set version 1, version 2 could be used too, because stat data
+ key is the same in both versions */
+ key.version = KEY_FORMAT_3_5;
+ key.on_disk_key.k_dir_id = dirino;
+ key.on_disk_key.k_objectid = inode->i_ino;
+ key.on_disk_key.u.k_offset_v1.k_offset = SD_OFFSET;
+ key.on_disk_key.u.k_offset_v1.k_uniqueness = SD_UNIQUENESS;
+
+ /* look for the object's stat data */
+ retval = search_item (inode->i_sb, &key, &path_to_sd);
+ if (retval == IO_ERROR) {
+ reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: "
+ "i/o failure occurred trying to find stat data of %K",
+ &key);
+ reiserfs_make_bad_inode(inode) ;
+ return;
+ }
+ if (retval != ITEM_FOUND) {
+ /* a stale NFS handle can trigger this without it being an error */
+ pathrelse (&path_to_sd);
+ reiserfs_make_bad_inode(inode) ;
+ inode->i_nlink = 0;
+ return;
+ }
+
+ init_inode (inode, &path_to_sd);
+
+ /* It is possible that knfsd is trying to access inode of a file
+ that is being removed from the disk by some other thread. As we
+ update sd on unlink all that is required is to check for nlink
+ here. This bug was first found by Sizif when debugging
+ SquidNG/Butterfly, forgotten, and found again after Philippe
+ Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
+
+ More logical fix would require changes in fs/inode.c:iput() to
+ remove inode from hash-table _after_ fs cleaned disk stuff up and
+ in iget() to return NULL if I_FREEING inode is found in
+ hash-table. */
+ /* Currently there is one place where it's ok to meet inode with
+ nlink==0: processing of open-unlinked and half-truncated files
+ during mount (fs/reiserfs/super.c:finish_unfinished()). */
+ if( ( inode -> i_nlink == 0 ) &&
+ ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) {
+ reiserfs_warning (inode->i_sb,
+ "vs-13075: reiserfs_read_locked_inode: "
+ "dead inode read from disk %K. "
+ "This is likely to be race with knfsd. Ignore",
+ &key );
+ reiserfs_make_bad_inode( inode );
+ }
+
+ reiserfs_check_path(&path_to_sd) ; /* init_inode should have released the path */
+
+}
+
+/**
+ * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
+ *
+ * @inode: inode from hash table to check
+ * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
+ *
+ * iget5_locked() calls this to tell apart reiserfs inodes that share an
+ * inode number.  Such inodes can only exist due to some error condition,
+ * and one of them should be bad.  Inodes with identical inode numbers
+ * (objectids) are distinguished by their parent directory ids.
+ *
+ * Returns non-zero when both the objectid and the directory id match.
+ */
+int reiserfs_find_actor( struct inode *inode, void *opaque )
+{
+    /* the values in the cookie are already in CPU order */
+    const struct reiserfs_iget_args *wanted = opaque;
+
+    if (inode->i_ino != wanted->objectid)
+        return 0;
+
+    return le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == wanted->dirid;
+}
+
+/*
+ * Look up (or read in) the inode identified by the dir id / objectid
+ * pair of 'key'.
+ *
+ * Returns the inode on success, ERR_PTR(-ENOMEM) when iget5_locked()
+ * returns no inode, and NULL when the inode turned out bad or its short
+ * key does not match 'key' (i/o error or a stale NFS handle).
+ */
+struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key)
+{
+    struct reiserfs_iget_args args ;
+    struct inode * found;
+
+    args.objectid = key->on_disk_key.k_objectid ;
+    args.dirid = key->on_disk_key.k_dir_id ;
+
+    found = iget5_locked (s, key->on_disk_key.k_objectid,
+                          reiserfs_find_actor, reiserfs_init_locked_inode,
+                          (void *)(&args));
+    if (!found)
+        return ERR_PTR(-ENOMEM) ;
+
+    /* a new inode still has to be filled in from disk */
+    if (found->i_state & I_NEW) {
+        reiserfs_read_locked_inode(found, &args);
+        unlock_new_inode(found);
+    }
+
+    if (!comp_short_keys (INODE_PKEY (found), key) && !is_bad_inode (found))
+        return found;
+
+    /* either due to i/o error or a stale NFS handle */
+    iput (found);
+    return NULL;
+}
+
+/*
+ * Turn an (objectid, dir_id, generation) triple into a dentry.
+ *
+ * 'vobjp' points at three __u32 words: objectid, dir_id and generation.
+ * A generation of 0 means "do not check"; otherwise it has to match the
+ * inode's i_generation.
+ *
+ * Returns an anonymous dentry for the inode, ERR_PTR(-ESTALE) when the
+ * inode cannot be found or the generation differs, the error carried by
+ * reiserfs_iget(), or ERR_PTR(-ENOMEM) when no dentry can be allocated.
+ */
+struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
+{
+    __u32 *fh = vobjp;
+    struct cpu_key key ;
+    struct inode *inode;
+    struct dentry *alias;
+
+    key.on_disk_key.k_objectid = fh[0] ;
+    key.on_disk_key.k_dir_id = fh[1] ;
+
+    reiserfs_write_lock(sb);
+    inode = reiserfs_iget(sb, &key) ;
+    if (inode && !IS_ERR(inode)) {
+        __u32 wanted_gen = fh[2];
+
+        if (wanted_gen != 0 && wanted_gen != inode->i_generation) {
+            iput(inode) ;
+            inode = NULL ;
+        }
+    }
+    reiserfs_write_unlock(sb);
+
+    if (!inode)
+        return ERR_PTR(-ESTALE);
+    if (IS_ERR(inode))
+        return ERR_PTR(PTR_ERR(inode));
+
+    alias = d_alloc_anon(inode);
+    if (alias)
+        return alias;
+
+    iput(inode);
+    return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Decode a file handle into a dentry.
+ *
+ * data       - the handle, 'len' __u32 words long
+ * fhtype     - handle type; it happens to reflect the number of u32s
+ *              encoded (see the table below)
+ * acceptable, context - passed through to find_exported_dentry()
+ *
+ * Returns whatever find_exported_dentry() returns, or ERR_PTR(-ESTALE)
+ * for a handle too short to hold even objectid + dir_id.
+ */
+struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data,
+                                  int len, int fhtype,
+                                  int (*acceptable)(void *contect, struct dentry *de),
+                                  void *context) {
+    __u32 obj[3], parent[3];
+
+    /* every valid layout starts with objectid + dir_id; anything shorter
+     * cannot be decoded without reading past the handle */
+    if (len < 2)
+        return ERR_PTR(-ESTALE);
+
+    /* fhtype happens to reflect the number of u32s encoded.
+     * due to a bug in earlier code, fhtype might indicate there
+     * are more u32s than actually fitted.
+     * so if fhtype seems to be more than len, reduce fhtype.
+     * Valid types are:
+     *   2 - objectid + dir_id - legacy support
+     *   3 - objectid + dir_id + generation
+     *   4 - objectid + dir_id + objectid and dirid of parent - legacy
+     *   5 - objectid + dir_id + generation + objectid and dirid of parent
+     *   6 - as above plus generation of directory
+     * 6 does not fit in NFSv2 handles
+     */
+    if (fhtype > len) {
+        if (fhtype != 6 || len != 5)
+            reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd",
+                              fhtype, len);
+        /* clamp to the words actually present; forcing 5 here would
+         * read data[3] and data[4] even when len < 5 */
+        fhtype = len;
+    }
+
+    obj[0] = data[0];
+    obj[1] = data[1];
+    if (fhtype == 3 || fhtype >= 5)
+        obj[2] = data[2];
+    else
+        obj[2] = 0; /* generation number */
+
+    if (fhtype >= 4) {
+        /* the parent follows the generation word when there is one */
+        parent[0] = data[fhtype>=5?3:2] ;
+        parent[1] = data[fhtype>=5?4:3] ;
+        if (fhtype == 6)
+            parent[2] = data[5];
+        else
+            parent[2] = 0;
+    }
+    return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent,
+                                                 acceptable, context);
+}
+
+/*
+ * Encode a file handle for 'dentry' into 'data'.
+ *
+ * On entry *lenp is the number of __u32 words available; on return it is
+ * the number of words stored.  The object part (objectid, dir_id,
+ * generation) is always written.  When 'need_parent' is set and at least
+ * 5 words fit, the parent's objectid and dir_id follow, plus the parent's
+ * generation if a sixth word fits.
+ *
+ * Returns the number of words stored (3, 5 or 6), or 255 if fewer than
+ * 3 words are available.
+ */
+int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) {
+    struct inode *inode = dentry->d_inode ;
+    struct inode *parent_inode;
+    int room = *lenp;
+
+    if (room < 3)
+        return 255 ;
+
+    data[0] = inode->i_ino ;
+    data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
+    data[2] = inode->i_generation ;
+    *lenp = 3 ;
+
+    /* parent not wanted, or no room for it: return what we've stored so far */
+    if (!need_parent || room < 5)
+        return 3 ;
+
+    spin_lock(&dentry->d_lock);
+    parent_inode = dentry->d_parent->d_inode ;
+    data[3] = parent_inode->i_ino ;
+    data[4] = le32_to_cpu(INODE_PKEY (parent_inode)->k_dir_id) ;
+    if (room >= 6) {
+        data[5] = parent_inode->i_generation ;
+        *lenp = 6 ;
+    } else {
+        *lenp = 5 ;
+    }
+    spin_unlock(&dentry->d_lock);
+
+    return *lenp ;
+}
+
+
+/*
+ * Looks for stat data, then copies fields to it and marks the buffer
+ * containing stat data as dirty.
+ *
+ * reiserfs inodes are never really dirty, since the dirty inode call
+ * always logs them.  This call allows the VFS inode marking routines
+ * to properly mark inodes for datasync and such, but only actually
+ * does something when called for a synchronous update.
+ *
+ * Returns -EROFS on a read-only mount, 0 otherwise.
+ */
+int reiserfs_write_inode (struct inode * inode, int do_sync) {
+    struct reiserfs_transaction_handle th ;
+    struct super_block *sb = inode->i_sb ;
+    int nblocks = 1 ;
+
+    if (sb->s_flags & MS_RDONLY)
+        return -EROFS;
+
+    /* memory pressure can sometimes initiate write_inode calls with
+     * sync == 1; these cases are just when the system needs ram, not
+     * when the inode needs to reach disk for safety, and they can safely
+     * be ignored because the altered inode has already been logged.
+     */
+    if (!do_sync || (current->flags & PF_MEMALLOC))
+        return 0;
+
+    reiserfs_write_lock(sb);
+    if (journal_begin(&th, sb, nblocks) == 0) {
+        reiserfs_update_sd (&th, inode);
+        journal_end_sync(&th, sb, nblocks) ;
+    }
+    reiserfs_write_unlock(sb);
+
+    return 0;
+}
+
+/* stat data of new object is inserted already, this inserts the item
+ containing "." and ".." entries.
+
+ th    - transaction handle (must have a non-zero t_trans_id)
+ inode - the new directory, passed through to reiserfs_insert_item
+ ih    - item head whose key already identifies the new object; it is
+         re-made here as the head of the directory entry item
+ path  - path used for the tree search and for the insertion
+ dir   - parent directory; its dir id and objectid are handed to
+         make_empty_dir_item*() (presumably for the ".." entry)
+
+ Returns -EIO on a search i/o failure, -EEXIST if an item with this key
+ is already in the tree, otherwise the result of reiserfs_insert_item. */
+static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
+ struct inode *inode,
+ struct item_head * ih, struct path * path,
+ struct inode * dir)
+{
+ struct super_block * sb = th->t_super;
+ char empty_dir [EMPTY_DIR_SIZE];
+ char * body = empty_dir;
+ struct cpu_key key;
+ int retval;
+
+ BUG_ON (!th->t_trans_id);
+
+ _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id),
+ le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/);
+
+ /* compose item head for new item. Directories consist of items of
+ old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
+ is done by reiserfs_new_inode */
+ if (old_format_only (sb)) {
+ make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
+
+ make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
+ INODE_PKEY (dir)->k_dir_id,
+ INODE_PKEY (dir)->k_objectid );
+ } else {
+ make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
+
+ make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
+ INODE_PKEY (dir)->k_dir_id,
+ INODE_PKEY (dir)->k_objectid );
+ }
+
+ /* look for place in the tree for new item */
+ retval = search_item (sb, &key, path);
+ if (retval == IO_ERROR) {
+ reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: "
+ "i/o failure occurred creating new directory");
+ return -EIO;
+ }
+ if (retval == ITEM_FOUND) {
+ pathrelse (path);
+ reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: "
+ "object with this key exists (%k)", &(ih->ih_key));
+ return -EEXIST;
+ }
+
+ /* insert item, that is empty directory item */
+ return reiserfs_insert_item (th, path, &key, ih, inode, body);
+}
+
+
+/*
+ * Stat data of the object has been inserted already; this inserts the
+ * direct item containing the body of the symlink.
+ *
+ * th       - transaction handle (must have a non-zero t_trans_id)
+ * inode    - inode of the symlink, passed through to reiserfs_insert_item
+ * ih       - item head whose key already identifies the new object; it
+ *            is re-made here as the head of the direct item
+ * path     - path used for the tree search and for the insertion
+ * symname  - symlink body to store
+ * item_len - length of the item to insert
+ *
+ * Returns -EIO on a search i/o failure, -EEXIST if an item with this key
+ * is already in the tree, otherwise the result of reiserfs_insert_item.
+ */
+static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
+                                 struct inode *inode, /* Inode of symlink */
+                                 struct item_head * ih,
+                                 struct path * path, const char * symname, int item_len)
+{
+    struct super_block * sb = th->t_super;
+    struct cpu_key key;
+    int retval;
+
+    BUG_ON (!th->t_trans_id);
+
+    _make_cpu_key (&key, KEY_FORMAT_3_5,
+                   le32_to_cpu (ih->ih_key.k_dir_id),
+                   le32_to_cpu (ih->ih_key.k_objectid),
+                   1, TYPE_DIRECT, 3/*key length*/);
+
+    make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/);
+
+    /* look for place in the tree for new item */
+    retval = search_item (sb, &key, path);
+    if (retval == IO_ERROR) {
+        /* function name spelled correctly so the message can be grepped */
+        reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
+                          "i/o failure occurred creating new symlink");
+        return -EIO;
+    }
+    if (retval == ITEM_FOUND) {
+        pathrelse (path);
+        reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
+                          "object with this key exists (%k)", &(ih->ih_key));
+        return -EEXIST;
+    }
+
+    /* insert item, that is body of symlink */
+    return reiserfs_insert_item (th, path, &key, ih, inode, symname);
+}
+
+
+/* inserts the stat data into the tree, and then calls
+ reiserfs_new_directory (to insert ".", ".." item if new object is
+ directory) or reiserfs_new_symlink (to insert symlink body if new
+ object is symlink) or nothing (if new object is regular file)
+
+ NOTE! uid and gid must already be set in the inode. If we return
+ non-zero due to an error, we have to drop the quota previously allocated
+ for the fresh inode. This can only be done outside a transaction, so
+ if we return non-zero, we also end the transaction.
+
+ Returns 0 on success. On failure a negative errno is returned
+ (-EDQUOT, -EPERM, -ENOMEM, -EIO, -EEXIST, -EINVAL or whatever the
+ insertion / ACL helpers returned), the transaction has been ended,
+ th->t_trans_id is zeroed and the inode has been dropped with iput(). */
+int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
+ struct inode * dir, int mode,
+ const char * symname,
+ /* 0 for regular, EMPTY_DIR_SIZE for dirs,
+ strlen (symname) for symlinks)*/
+ loff_t i_size, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct super_block * sb;
+ INITIALIZE_PATH (path_to_key);
+ struct cpu_key key;
+ struct item_head ih;
+ struct stat_data sd;
+ int retval;
+ int err;
+
+ BUG_ON (!th->t_trans_id);
+
+ if (DQUOT_ALLOC_INODE(inode)) {
+ err = -EDQUOT;
+ goto out_end_trans;
+ }
+ if (!dir || !dir->i_nlink) {
+ err = -EPERM;
+ goto out_bad_inode;
+ }
+
+ sb = dir->i_sb;
+
+ /* item head of new item */
+ ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
+ ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
+ if (!ih.ih_key.k_objectid) {
+ err = -ENOMEM;
+ goto out_bad_inode ;
+ }
+ if (old_format_only (sb))
+ /* not a perfect generation count, as object ids can be reused, but
+ ** this is as good as reiserfs can do right now.
+ ** note that the private part of inode isn't filled in yet, we have
+ ** to use the directory.
+ */
+ inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid);
+ else
+#if defined( USE_INODE_GENERATION_COUNTER )
+ inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
+#else
+ inode->i_generation = ++event;
+#endif
+
+ /* fill stat data */
+ inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);
+
+ /* uid and gid must already be set by the caller for quota init */
+
+ /* symlink cannot be immutable or append only, right? */
+ if( S_ISLNK( inode -> i_mode ) )
+ inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
+
+ inode->i_mtime = inode->i_atime = inode->i_ctime =
+ CURRENT_TIME_SEC;
+ inode->i_size = i_size;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
+ U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
+
+ INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
+ REISERFS_I(inode)->i_flags = 0;
+ REISERFS_I(inode)->i_prealloc_block = 0;
+ REISERFS_I(inode)->i_prealloc_count = 0;
+ REISERFS_I(inode)->i_trans_id = 0;
+ REISERFS_I(inode)->i_jl = NULL;
+ REISERFS_I(inode)->i_attrs =
+ REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; /* only the attrs in the inherit mask come from the parent */
+ sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
+ REISERFS_I(inode)->i_acl_access = NULL;
+ REISERFS_I(inode)->i_acl_default = NULL;
+ init_rwsem (&REISERFS_I(inode)->xattr_sem);
+
+ if (old_format_only (sb))
+ make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+ else
+ make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
+
+ /* key to search for correct place for new stat data */
+ _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
+ le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
+
+ /* find proper place for inserting of stat data */
+ retval = search_item (sb, &key, &path_to_key);
+ if (retval == IO_ERROR) {
+ err = -EIO;
+ goto out_bad_inode;
+ }
+ if (retval == ITEM_FOUND) {
+ pathrelse (&path_to_key);
+ err = -EEXIST;
+ goto out_bad_inode;
+ }
+ if (old_format_only (sb)) {
+ if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
+ pathrelse (&path_to_key);
+ /* i_uid or i_gid is too big to be stored in stat data v3.5 */
+ err = -EINVAL;
+ goto out_bad_inode;
+ }
+ inode2sd_v1 (&sd, inode, inode->i_size);
+ } else {
+ inode2sd (&sd, inode, inode->i_size);
+ }
+ // these do not go to on-disk stat data
+ inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
+ inode->i_blksize = reiserfs_default_io_size;
+
+ // store in in-core inode the key of stat data and version all
+ // object items will have (directory items will have old offset
+ // format, other new objects will consist of new items)
+ memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE);
+ if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode))
+ set_inode_item_key_version (inode, KEY_FORMAT_3_5);
+ else
+ set_inode_item_key_version (inode, KEY_FORMAT_3_6);
+ if (old_format_only (sb))
+ set_inode_sd_version (inode, STAT_DATA_V1);
+ else
+ set_inode_sd_version (inode, STAT_DATA_V2);
+
+ /* insert the stat data into the tree */
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+ if (REISERFS_I(dir)->new_packing_locality)
+ th->displace_new_blocks = 1;
+#endif
+ retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
+ if (retval) {
+ err = retval;
+ reiserfs_check_path(&path_to_key) ;
+ goto out_bad_inode;
+ }
+
+#ifdef DISPLACE_NEW_PACKING_LOCALITIES
+ if (!th->displace_new_blocks)
+ REISERFS_I(dir)->new_packing_locality = 0;
+#endif
+ if (S_ISDIR(mode)) {
+ /* insert item with "." and ".." */
+ retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
+ }
+
+ if (S_ISLNK(mode)) {
+ /* insert body of symlink */
+ if (!old_format_only (sb))
+ i_size = ROUND_UP(i_size);
+ retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
+ }
+ if (retval) { /* still 0 from the stat data insertion for regular files */
+ err = retval;
+ reiserfs_check_path(&path_to_key) ;
+ journal_end(th, th->t_super, th->t_blocks_allocated);
+ goto out_inserted_sd;
+ }
+
+ /* XXX CHECK THIS */
+ if (reiserfs_posixacl (inode->i_sb)) {
+ retval = reiserfs_inherit_default_acl (dir, dentry, inode);
+ if (retval) {
+ err = retval;
+ reiserfs_check_path(&path_to_key) ;
+ journal_end(th, th->t_super, th->t_blocks_allocated);
+ goto out_inserted_sd;
+ }
+ } else if (inode->i_sb->s_flags & MS_POSIXACL) {
+ reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, "
+ "but vfs thinks they are!");
+ } else if (is_reiserfs_priv_object (dir)) {
+ reiserfs_mark_inode_private (inode);
+ }
+
+ insert_inode_hash (inode);
+ reiserfs_update_sd(th, inode);
+ reiserfs_check_path(&path_to_key) ;
+
+ return 0;
+
+/* it looks like you can easily compress these two goto targets into
+ * one. Keeping it like this doesn't actually hurt anything, and they
+ * are place holders for what the quota code actually needs.
+ */
+out_bad_inode:
+ /* Invalidate the object, nothing was inserted yet */
+ INODE_PKEY(inode)->k_objectid = 0;
+
+ /* Quota change must be inside a transaction for journaling */
+ DQUOT_FREE_INODE(inode);
+
+out_end_trans:
+ journal_end(th, th->t_super, th->t_blocks_allocated) ;
+ /* Drop can be outside and it needs more credits so it's better to have it outside */
+ DQUOT_DROP(inode);
+ inode->i_flags |= S_NOQUOTA;
+ make_bad_inode(inode);
+
+out_inserted_sd:
+ inode->i_nlink = 0;
+ th->t_trans_id = 0; /* so the caller can't use this handle later */
+ iput(inode);
+ return err;
+}
+
+/*
+** finds the tail page in the page cache,
+** reads the last block in.
+**
+** On success, page_result is set to a locked, pinned page, and bh_result
+** is set to an up to date buffer for the last block in the file. returns 0.
+**
+** tail conversion is not done, so bh_result might not be valid for writing
+** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
+** trying to write the block.
+**
+** on failure, nonzero is returned, page_result and bh_result are untouched.
+** The failure codes are: -ENOENT when i_size is block aligned (nothing to
+** do), -ENOMEM when the page cannot be grabbed, the error returned by
+** block_prepare_write, or -EIO when the buffer found is not up to date.
+*/
+static int grab_tail_page(struct inode *p_s_inode,
+ struct page **page_result,
+ struct buffer_head **bh_result) {
+
+ /* we want the page with the last byte in the file,
+ ** not the page that will hold the next byte for appending
+ */
+ unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ;
+ unsigned long pos = 0 ;
+ unsigned long start = 0 ;
+ unsigned long blocksize = p_s_inode->i_sb->s_blocksize ;
+ unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ;
+ struct buffer_head *bh ;
+ struct buffer_head *head ;
+ struct page * page ;
+ int error ;
+
+ /* we know that we are only called with inode->i_size > 0.
+ ** we also know that a file tail can never be as big as a block
+ ** If i_size % blocksize == 0, our file is currently block aligned
+ ** and it won't need converting or zeroing after a truncate.
+ */
+ if ((offset & (blocksize - 1)) == 0) {
+ return -ENOENT ;
+ }
+ page = grab_cache_page(p_s_inode->i_mapping, index) ;
+ error = -ENOMEM ;
+ if (!page) {
+ goto out ;
+ }
+ /* start within the page of the last block in the file */
+ start = (offset / blocksize) * blocksize ;
+
+ error = block_prepare_write(page, start, offset,
+ reiserfs_get_block_create_0) ;
+ if (error)
+ goto unlock ;
+
+ head = page_buffers(page) ;
+ bh = head; /* walk the page's buffers until the one covering 'start' */
+ do {
+ if (pos >= start) {
+ break ;
+ }
+ bh = bh->b_this_page ;
+ pos += blocksize ;
+ } while(bh != head) ;
+
+ if (!buffer_uptodate(bh)) {
+ /* note, this should never happen, prepare_write should
+ ** be taking care of this for us. If the buffer isn't up to date,
+ ** I've screwed up the code to find the buffer, or the code to
+ ** call prepare_write
+ */
+ reiserfs_warning (p_s_inode->i_sb,
+ "clm-6000: error reading block %lu on dev %s",
+ bh->b_blocknr,
+ reiserfs_bdevname (p_s_inode->i_sb)) ;
+ error = -EIO ;
+ goto unlock ;
+ }
+ *bh_result = bh ;
+ *page_result = page ;
+
+out: /* reached on success (error == 0) and when no page could be grabbed */
+ return error ;
+
+unlock: /* failure after the page was grabbed: drop the lock and the reference */
+ unlock_page(page) ;
+ page_cache_release(page) ;
+ return error ;
+}
+
+/*
+** vfs version of truncate file. Must NOT be called with
+** a transaction already started.
+**
+** some code taken from block_truncate_page
+**
+** Truncates the object to p_s_inode->i_size inside its own transaction.
+** When update_timestamps is set a "save link" is added before the
+** truncate and removed after the transaction has ended. If a tail page
+** was grabbed, the part of the last block past i_size is zeroed in the
+** page afterwards, and the buffer is marked dirty when it is mapped to
+** a real block.
+**
+** Returns 0 on success or the first error hit by journal_begin,
+** reiserfs_do_truncate, journal_end or remove_save_link.
+*/
+int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
+ struct reiserfs_transaction_handle th ;
+ /* we want the offset for the first byte after the end of the file */
+ unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
+ unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
+ unsigned length ;
+ struct page *page = NULL ;
+ int error ;
+ struct buffer_head *bh = NULL ;
+
+ reiserfs_write_lock(p_s_inode->i_sb);
+
+ if (p_s_inode->i_size > 0) {
+ if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
+ // -ENOENT means we truncated past the end of the file,
+ // and get_block_create_0 could not find a block to read in,
+ // which is ok.
+ if (error != -ENOENT)
+ reiserfs_warning (p_s_inode->i_sb,
+ "clm-6001: grab_tail_page failed %d",
+ error);
+ page = NULL ;
+ bh = NULL ;
+ }
+ }
+
+ /* so, if page != NULL, we have a buffer head for the offset at
+ ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
+ ** then we have an unformatted node. Otherwise, we have a direct item,
+ ** and no zeroing is required on disk. We zero after the truncate,
+ ** because the truncate might pack the item anyway
+ ** (it will unmap bh if it packs).
+ */
+ /* it is enough to reserve space in transaction for 2 balancings:
+ one for "save" link adding and another for the first
+ cut_from_item. 1 is for update_sd */
+ error = journal_begin (&th, p_s_inode->i_sb,
+ JOURNAL_PER_BALANCE_CNT * 2 + 1);
+ if (error)
+ goto out;
+ reiserfs_update_inode_transaction(p_s_inode) ;
+ if (update_timestamps)
+ /* we are doing real truncate: if the system crashes before the last
+ transaction of truncating gets committed - on reboot the file
+ either appears truncated properly or not truncated at all */
+ add_save_link (&th, p_s_inode, 1);
+ error = reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
+ if (error)
+ goto out; /* NOTE(review): this path skips the journal_end below -
+ confirm the transaction is ended elsewhere when
+ reiserfs_do_truncate fails */
+ error = journal_end (&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
+ if (error)
+ goto out;
+
+ if (update_timestamps) {
+ error = remove_save_link (p_s_inode, 1/* truncate */);
+ if (error)
+ goto out;
+ }
+
+ if (page) {
+ length = offset & (blocksize - 1) ;
+ /* if we are not on a block boundary */
+ if (length) {
+ char *kaddr;
+
+ length = blocksize - length ; /* bytes from i_size to the end of its block */
+ kaddr = kmap_atomic(page, KM_USER0) ;
+ memset(kaddr + offset, 0, length) ;
+ flush_dcache_page(page) ;
+ kunmap_atomic(kaddr, KM_USER0) ;
+ if (buffer_mapped(bh) && bh->b_blocknr != 0) {
+ mark_buffer_dirty(bh) ;
+ }
+ }
+ unlock_page(page) ;
+ page_cache_release(page) ;
+ }
+
+ reiserfs_write_unlock(p_s_inode->i_sb);
+ return 0;
+out:
+ if (page) {
+ unlock_page (page);
+ page_cache_release (page);
+ }
+ reiserfs_write_unlock(p_s_inode->i_sb);
+ return error;
+}
+/* Map one buffer of a page being written back to its place in the tree.
+**
+** inode     - file the page belongs to
+** bh_result - buffer to map; it must be up to date, otherwise -EIO
+** block     - file block number of bh_result
+**
+** Three outcomes follow the search for the block's first byte:
+**  - an indirect item with a non-zero block number: bh_result is mapped
+**    to that block;
+**  - nothing found, or a hole in an indirect item: reiserfs_get_block is
+**    called with GET_BLOCK_CREATE to fill it in;
+**  - a direct item: the page data is copied into the item inside a
+**    transaction, bh_result is mapped to block 0 and its dirty bit is
+**    cleared before returning.
+**
+** Returns 0 on success, otherwise -EIO or the error from journal_begin,
+** journal_end or reiserfs_get_block.
+*/
+static int map_block_for_writepage(struct inode *inode,
+ struct buffer_head *bh_result,
+ unsigned long block) {
+ struct reiserfs_transaction_handle th ;
+ int fs_gen ;
+ struct item_head tmp_ih ;
+ struct item_head *ih ;
+ struct buffer_head *bh ;
+ __u32 *item ;
+ struct cpu_key key ;
+ INITIALIZE_PATH(path) ;
+ int pos_in_item ;
+ int jbegin_count = JOURNAL_PER_BALANCE_CNT ;
+ loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ;
+ int retval ;
+ int use_get_block = 0 ;
+ int bytes_copied = 0 ;
+ int copy_size ;
+ int trans_running = 0;
+
+ /* catch places below that try to log something without starting a trans */
+ th.t_trans_id = 0;
+
+ if (!buffer_uptodate(bh_result)) {
+ return -EIO;
+ }
+
+ kmap(bh_result->b_page) ;
+start_over:
+ reiserfs_write_lock(inode->i_sb);
+ make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
+
+research:
+ retval = search_for_position_by_key(inode->i_sb, &key, &path) ;
+ if (retval != POSITION_FOUND) {
+ use_get_block = 1;
+ goto out ;
+ }
+
+ bh = get_last_bh(&path) ;
+ ih = get_ih(&path) ;
+ item = get_item(&path) ;
+ pos_in_item = path.pos_in_item ;
+
+ /* we've found an unformatted node */
+ if (indirect_item_found(retval, ih)) {
+ if (bytes_copied > 0) {
+ reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d",
+ bytes_copied) ;
+ }
+ if (!get_block_num(item, pos_in_item)) {
+ /* crap, we are writing to a hole */
+ use_get_block = 1;
+ goto out ;
+ }
+ set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode);
+ } else if (is_direct_le_ih(ih)) {
+ char *p ;
+ p = page_address(bh_result->b_page) ;
+ p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ; /* byte_offset is the block's first byte + 1 */
+ copy_size = ih_item_len(ih) - pos_in_item;
+
+ fs_gen = get_generation(inode->i_sb) ;
+ copy_item_head(&tmp_ih, ih) ;
+
+ if (!trans_running) {
+ /* vs-3050 is gone, no need to drop the path */
+ retval = journal_begin(&th, inode->i_sb, jbegin_count) ;
+ if (retval)
+ goto out;
+ reiserfs_update_inode_transaction(inode) ;
+ trans_running = 1;
+ if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
+ reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
+ goto research;
+ }
+ }
+
+ reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
+
+ if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
+ reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
+ goto research;
+ }
+
+ memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
+
+ journal_mark_dirty(&th, inode->i_sb, bh) ;
+ bytes_copied += copy_size ;
+ set_block_dev_mapped(bh_result, 0, inode); /* block 0 marks "data lives in a direct item" */
+
+ /* are there still bytes left? */
+ if (bytes_copied < bh_result->b_size &&
+ (byte_offset + bytes_copied) < inode->i_size) {
+ set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ;
+ goto research ;
+ }
+ } else {
+ reiserfs_warning (inode->i_sb,
+ "clm-6003: bad item inode %lu, device %s",
+ inode->i_ino, reiserfs_bdevname (inode->i_sb)) ;
+ retval = -EIO ;
+ goto out ;
+ }
+ retval = 0 ;
+
+out:
+ pathrelse(&path) ;
+ if (trans_running) {
+ int err = journal_end(&th, inode->i_sb, jbegin_count) ;
+ if (err)
+ retval = err;
+ trans_running = 0;
+ }
+ reiserfs_write_unlock(inode->i_sb);
+
+ /* this is where we fill in holes in the file. */
+ if (use_get_block) {
+ retval = reiserfs_get_block(inode, block, bh_result,
+ GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
+ GET_BLOCK_NO_DANGLE);
+ if (!retval) {
+ if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
+ /* get_block failed to find a mapped unformatted node. */
+ use_get_block = 0 ;
+ goto start_over ;
+ }
+ }
+ }
+ kunmap(bh_result->b_page) ;
+
+ if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
+ /* we've copied data from the page into the direct item, so the
+ * buffer in the page is now clean, mark it to reflect that.
+ */
+ lock_buffer(bh_result);
+ clear_buffer_dirty(bh_result);
+ unlock_buffer(bh_result);
+ }
+ return retval ;
+}
+
+/*
+ * mason@suse.com: updated in 2.5.54 to follow the same general io
+ * start/recovery path as __block_write_full_page, along with special
+ * code to handle reiserfs tails.
+ */
+static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) {
+ struct inode *inode = page->mapping->host ;
+ unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
+ int error = 0;
+ unsigned long block ;
+ struct buffer_head *head, *bh;
+ int partial = 0 ;
+ int nr = 0;
+ int checked = PageChecked(page);
+ struct reiserfs_transaction_handle th;
+ struct super_block *s = inode->i_sb;
+ int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+ th.t_trans_id = 0;
+
+ /* The page dirty bit is cleared before writepage is called, which
+ * means we have to tell create_empty_buffers to make dirty buffers
+ * The page really should be up to date at this point, so tossing
+ * in the BH_Uptodate is just a sanity check.
+ */
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, s->s_blocksize,
+ (1 << BH_Dirty) | (1 << BH_Uptodate));
+ }
+ head = page_buffers(page) ;
+
+ /* last page in the file, zero out any contents past the
+ ** last byte in the file
+ */
+ if (page->index >= end_index) {
+ char *kaddr;
+ unsigned last_offset;
+
+ last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
+ /* no file contents in this page */
+ if (page->index >= end_index + 1 || !last_offset) {
+ unlock_page(page);
+ return 0;
+ }
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
+ flush_dcache_page(page) ;
+ kunmap_atomic(kaddr, KM_USER0) ;
+ }
+ bh = head ;
+ block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ;
+ /* first map all the buffers, logging any direct items we find */
+ do {
+ if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
+ (buffer_mapped(bh) && bh->b_blocknr == 0))) {
+ /* not mapped yet, or it points to a direct item, search
+ * the btree for the mapping info, and log any direct
+ * items found
+ */
+ if ((error = map_block_for_writepage(inode, bh, block))) {
+ goto fail ;
+ }
+ }
+ bh = bh->b_this_page;
+ block++;
+ } while(bh != head) ;
+
+ /*
+ * we start the transaction after map_block_for_writepage,
+ * because it can create holes in the file (an unbounded operation).
+ * starting it here, we can make a reliable estimate for how many
+ * blocks we're going to log
+ */
+ if (checked) {
+ ClearPageChecked(page);
+ reiserfs_write_lock(s);
+ error = journal_begin(&th, s, bh_per_page + 1);
+ if (error) {
+ reiserfs_write_unlock(s);
+ goto fail;
+ }
+ reiserfs_update_inode_transaction(inode);
+ }
+ /* now go through and lock any dirty buffers on the page */
+ do {
+ get_bh(bh);
+ if (!buffer_mapped(bh))
+ continue;
+ if (buffer_mapped(bh) && bh->b_blocknr == 0)
+ continue;
+
+ if (checked) {
+ reiserfs_prepare_for_journal(s, bh, 1);
+ journal_mark_dirty(&th, s, bh);
+ continue;
+ }
+ /* from this point on, we know the buffer is mapped to a
+ * real block and not a direct item
+ */
+ if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+ lock_buffer(bh);
+ } else {
+ if (test_set_buffer_locked(bh)) {
+ redirty_page_for_writepage(wbc, page);
+ continue;
+ }
+ }
+ if (test_clear_buffer_dirty(bh)) {
+ mark_buffer_async_write(bh);
+ } else {
+ unlock_buffer(bh);
+ }
+ } while((bh = bh->b_this_page) != head);
+
+ if (checked) {
+ error = journal_end(&th, s, bh_per_page + 1);
+ reiserfs_write_unlock(s);
+ if (error)
+ goto fail;
+ }
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+
+ /*
+ * since any buffer might be the only dirty buffer on the page,
+ * the first submit_bh can bring the page out of writeback.
+ * be careful with the buffers.
+ */
+ do {
+ struct buffer_head *next = bh->b_this_page;
+ if (buffer_async_write(bh)) {
+ submit_bh(WRITE, bh);
+ nr++;
+ }
+ put_bh(bh);
+ bh = next;
+ } while(bh != head);
+
+ error = 0;
+done:
+ if (nr == 0) {
+ /*
+ * if this page only had a direct item, it is very possible for
+ * no io to be required without there being an error. Or,
+ * someone else could have locked them and sent them down the
+ * pipe without locking the page
+ */
+ bh = head ;
+ do {
+ if (!buffer_uptodate(bh)) {
+ partial = 1;
+ break;
+ }
+ bh = bh->b_this_page;
+ } while(bh != head);
+ if (!partial)
+ SetPageUptodate(page);
+ end_page_writeback(page);
+ }
+ return error;
+
+fail:
+ /* catches various errors, we need to make sure any valid dirty blocks
+ * get to the media. The page is currently locked and not marked for
+ * writeback
+ */
+ ClearPageUptodate(page);
+ bh = head;
+ do {
+ get_bh(bh);
+ if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
+ lock_buffer(bh);
+ mark_buffer_async_write(bh);
+ } else {
+ /*
+ * clear any dirty bits that might have come from getting
+ * attached to a dirty page
+ */
+ clear_buffer_dirty(bh);
+ }
+ bh = bh->b_this_page;
+ } while(bh != head);
+ SetPageError(page);
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+ do {
+ struct buffer_head *next = bh->b_this_page;
+ if (buffer_async_write(bh)) {
+ clear_buffer_dirty(bh);
+ submit_bh(WRITE, bh);
+ nr++;
+ }
+ put_bh(bh);
+ bh = next;
+ } while(bh != head);
+ goto done;
+}
+
+
+/*
+ * ->readpage: hand the page to block_read_full_page(), with
+ * reiserfs_get_block as the block-mapping callback.  @f is not used.
+ * The helper's return value is passed back unchanged.
+ */
+static int reiserfs_readpage (struct file *f, struct page * page)
+{
+	int status = block_read_full_page (page, reiserfs_get_block);
+
+	return status;
+}
+
+
+/*
+ * ->writepage: call reiserfs_wait_on_write_block() for the page's super
+ * block, then let reiserfs_write_full_page() do the work and return its
+ * result.
+ */
+static int reiserfs_writepage (struct page * page, struct writeback_control *wbc)
+{
+	struct super_block *sb = page->mapping->host->i_sb ;
+
+	reiserfs_wait_on_write_block(sb) ;
+	return reiserfs_write_full_page(page, wbc) ;
+}
+
+/*
+ * ->prepare_write: wrap block_prepare_write() with the transaction
+ * bookkeeping reiserfs needs.
+ *
+ * If a transaction is already running for this super block when we are
+ * called, its handle's t_refcount is bumped before block_prepare_write().
+ * On success the (possibly still running) transaction is left alone.
+ * On failure any transaction that is still running is unwound below.
+ *
+ * Returns 0 on success, otherwise the error from block_prepare_write()
+ * or from reiserfs_end_persistent_transaction().
+ */
+static int reiserfs_prepare_write(struct file *f, struct page *page,
+				  unsigned from, unsigned to) {
+	struct inode *inode = page->mapping->host ;
+	struct reiserfs_transaction_handle *th;
+	int refs_before = 0;
+	int ret;
+
+	reiserfs_wait_on_write_block(inode->i_sb) ;
+	fix_tail_page_for_writing(page) ;
+	if (reiserfs_transaction_running(inode->i_sb)) {
+		th = (struct reiserfs_transaction_handle *)current->journal_info;
+		BUG_ON (!th->t_refcount);
+		BUG_ON (!th->t_trans_id);
+		refs_before = th->t_refcount;
+		th->t_refcount++;
+	}
+
+	ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
+	if (!ret || !reiserfs_transaction_running(inode->i_sb))
+		return ret;
+
+	/* this gets a little ugly.  If reiserfs_get_block returned an
+	 * error and left a transaction running, we've got to close it,
+	 * and we've got to free the handle if it was a persistent
+	 * transaction.
+	 *
+	 * But, if we had nested into an existing transaction, we need
+	 * to just drop the ref count on the handle.
+	 *
+	 * If refs_before == 0, the transaction is from reiserfs_get_block,
+	 * and it was a persistent trans.  Otherwise, it was nested above.
+	 */
+	th = current->journal_info;
+	if (th->t_refcount <= refs_before)
+		return ret;
+
+	if (refs_before) {
+		th->t_refcount--;
+	} else {
+		int err;
+
+		reiserfs_write_lock(inode->i_sb);
+		err = reiserfs_end_persistent_transaction(th);
+		reiserfs_write_unlock(inode->i_sb);
+		if (err)
+			ret = err;
+	}
+	return ret;
+}
+
+
+/*
+ * ->bmap: forward to generic_block_bmap() with reiserfs_bmap as the
+ * mapping callback and return whatever it reports.
+ */
+static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) {
+	sector_t mapped = generic_block_bmap(as, block, reiserfs_bmap) ;
+
+	return mapped ;
+}
+
+/*
+ * ->commit_write: finish a write into @page covering bytes [@from, @to).
+ *
+ * Steps, in order:
+ *  - commit the page range through reiserfs_commit_page();
+ *  - if the write extends the file (page offset + @to > i_size), update
+ *    i_size and the stat data inside a one-block transaction;
+ *  - if a transaction was already running on entry (th != NULL), update
+ *    the stat data through it (unless done above) and end it;
+ *  - for O_SYNC files, commit the transaction covering this inode.
+ *
+ * Returns 0 on success or a negative error.  When starting or ending the
+ * i_size transaction fails, that error is what the caller sees; ending
+ * the running transaction afterwards is cleanup and must not mask it.
+ */
+static int reiserfs_commit_write(struct file *f, struct page *page,
+				 unsigned from, unsigned to) {
+	struct inode *inode = page->mapping->host ;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	int ret = 0;
+	int update_sd = 0;
+	struct reiserfs_transaction_handle *th = NULL;
+
+	reiserfs_wait_on_write_block(inode->i_sb) ;
+	if (reiserfs_transaction_running(inode->i_sb)) {
+		th = current->journal_info;
+	}
+	reiserfs_commit_page(inode, page, from, to);
+
+	/* generic_commit_write does this for us, but does not update the
+	** transaction tracking stuff when the size changes.  So, we have
+	** to do the i_size updates here.
+	*/
+	if (pos > inode->i_size) {
+		struct reiserfs_transaction_handle myth ;
+		reiserfs_write_lock(inode->i_sb);
+		/* If the file has grown beyond the border where it
+		   can have a tail, unmark it as needing tail
+		   packing */
+		if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
+		     (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
+			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
+
+		ret = journal_begin(&myth, inode->i_sb, 1) ;
+		if (ret) {
+			reiserfs_write_unlock(inode->i_sb);
+			goto journal_error;
+		}
+		reiserfs_update_inode_transaction(inode) ;
+		inode->i_size = pos ;
+		reiserfs_update_sd(&myth, inode) ;
+		update_sd = 1;
+		ret = journal_end(&myth, inode->i_sb, 1) ;
+		reiserfs_write_unlock(inode->i_sb);
+		if (ret)
+			goto journal_error;
+	}
+	if (th) {
+		reiserfs_write_lock(inode->i_sb);
+		if (!update_sd)
+			reiserfs_update_sd(th, inode) ;
+		ret = reiserfs_end_persistent_transaction(th);
+		reiserfs_write_unlock(inode->i_sb);
+		if (ret)
+			goto out;
+	}
+
+	/* we test for O_SYNC here so we can commit the transaction
+	** for any packed tails the file might have had
+	*/
+	if (f && (f->f_flags & O_SYNC)) {
+		reiserfs_write_lock(inode->i_sb);
+		ret = reiserfs_commit_for_inode(inode) ;
+		reiserfs_write_unlock(inode->i_sb);
+	}
+out:
+	return ret ;
+
+journal_error:
+	/* ret already holds the journal_begin/journal_end failure. */
+	if (th) {
+		int err;
+
+		reiserfs_write_lock(inode->i_sb);
+		if (!update_sd)
+			reiserfs_update_sd(th, inode) ;
+		err = reiserfs_end_persistent_transaction(th);
+		reiserfs_write_unlock(inode->i_sb);
+		/* keep the first error; only report err if none is pending */
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
+/*
+ * Mirror the on-disk attribute bits in @sd_attrs into the in-core inode.
+ * SYNC, IMMUTABLE, APPEND and NOATIME go to inode->i_flags; NOTAIL goes to
+ * i_nopack_mask in the reiserfs-private i_flags.  Each bit is set or
+ * cleared explicitly.  Does nothing unless reiserfs_attrs() is true for
+ * the super block.
+ */
+void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
+{
+	if( !reiserfs_attrs( inode -> i_sb ) )
+		return;
+
+	if( sd_attrs & REISERFS_SYNC_FL ) {
+		inode -> i_flags |= S_SYNC;
+	} else {
+		inode -> i_flags &= ~S_SYNC;
+	}
+
+	if( sd_attrs & REISERFS_IMMUTABLE_FL ) {
+		inode -> i_flags |= S_IMMUTABLE;
+	} else {
+		inode -> i_flags &= ~S_IMMUTABLE;
+	}
+
+	if( sd_attrs & REISERFS_APPEND_FL ) {
+		inode -> i_flags |= S_APPEND;
+	} else {
+		inode -> i_flags &= ~S_APPEND;
+	}
+
+	if( sd_attrs & REISERFS_NOATIME_FL ) {
+		inode -> i_flags |= S_NOATIME;
+	} else {
+		inode -> i_flags &= ~S_NOATIME;
+	}
+
+	if( sd_attrs & REISERFS_NOTAIL_FL ) {
+		REISERFS_I(inode)->i_flags |= i_nopack_mask;
+	} else {
+		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
+	}
+}
+
+/*
+ * Reverse of sd_attrs_to_i_attrs(): fold the in-core inode state back
+ * into the attribute word at *@sd_attrs.  IMMUTABLE, SYNC, NOATIME and
+ * NOTAIL are each set or cleared; all other bits in *@sd_attrs are left
+ * alone.  Does nothing unless reiserfs_attrs() is true for the super block.
+ *
+ * NOTE(review): S_APPEND is not written back here although
+ * sd_attrs_to_i_attrs() handles REISERFS_APPEND_FL - confirm intentional.
+ */
+void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
+{
+	if( !reiserfs_attrs( inode -> i_sb ) )
+		return;
+
+	if( inode -> i_flags & S_IMMUTABLE ) {
+		*sd_attrs |= REISERFS_IMMUTABLE_FL;
+	} else {
+		*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
+	}
+
+	if( inode -> i_flags & S_SYNC ) {
+		*sd_attrs |= REISERFS_SYNC_FL;
+	} else {
+		*sd_attrs &= ~REISERFS_SYNC_FL;
+	}
+
+	if( inode -> i_flags & S_NOATIME ) {
+		*sd_attrs |= REISERFS_NOATIME_FL;
+	} else {
+		*sd_attrs &= ~REISERFS_NOATIME_FL;
+	}
+
+	if( REISERFS_I(inode)->i_flags & i_nopack_mask ) {
+		*sd_attrs |= REISERFS_NOTAIL_FL;
+	} else {
+		*sd_attrs &= ~REISERFS_NOTAIL_FL;
+	}
+}
+
+/*
+ * Decide whether @bh may be dropped during invalidatepage, or has to stay
+ * around for data logging or ordered-write purposes.
+ *
+ * Runs under j_dirty_buffers_lock.  Returns 1 if the buffer can go (any
+ * reiserfs_jh attached via b_private is freed first), 0 if it must be
+ * kept.
+ */
+static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+{
+	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
+	int can_drop = 1 ;
+
+	spin_lock(&j->j_dirty_buffers_lock) ;
+	if (buffer_mapped(bh)) {
+		/* the page is locked, and the only places that log a data
+		 * buffer also lock the page.
+		 */
+		if (reiserfs_file_data_log(inode)) {
+			/*
+			 * very conservative, leave the buffer pinned if
+			 * anyone might need it.
+			 */
+			if (buffer_journaled(bh) || buffer_journal_dirty(bh))
+				can_drop = 0 ;
+		} else if (buffer_dirty(bh) || buffer_locked(bh)) {
+			struct reiserfs_jh *jh = bh->b_private;
+			struct reiserfs_journal_list *jl = jh ? jh->jl : NULL;
+
+			/* why is this safe?
+			 * reiserfs_setattr updates i_size in the on disk
+			 * stat data before allowing vmtruncate to be called.
+			 *
+			 * If buffer was put onto the ordered list for this
+			 * transaction, we know for sure either this
+			 * transaction or an older one already has updated
+			 * i_size on disk, and this ordered data won't be
+			 * referenced in the file if we crash.
+			 *
+			 * if the buffer was put onto the ordered list for an
+			 * older transaction, we need to leave it around
+			 */
+			if (jl && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
+				can_drop = 0;
+		}
+	}
+	/* unmapped buffers fall straight through to here */
+	if (can_drop && bh->b_private)
+		reiserfs_free_jh(bh);
+	spin_unlock(&j->j_dirty_buffers_lock) ;
+	return can_drop ;
+}
+
+/* clm -- taken from fs/buffer.c:block_invalidate_page
+ *
+ * ->invalidatepage: walk the page's buffers and unmap every buffer that
+ * starts at or after @offset, provided invalidatepage_can_drop() allows
+ * it.  Returns 0 if any such buffer had to be kept; otherwise 1, or the
+ * result of try_to_release_page() when the whole page (@offset == 0) was
+ * invalidated.  The page must be locked.
+ */
+static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *first, *cur;
+	unsigned int start = 0;
+	int ret = 1;
+
+	BUG_ON(!PageLocked(page));
+
+	if (offset == 0)
+		ClearPageChecked(page);
+
+	if (!page_has_buffers(page))
+		return ret;
+
+	first = page_buffers(page);
+	cur = first;
+	do {
+		/* grab these before the buffer is possibly unmapped */
+		struct buffer_head *following = cur->b_this_page;
+		unsigned int end = start + cur->b_size;
+
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= start) {
+			if (!invalidatepage_can_drop(inode, cur))
+				ret = 0;
+			else
+				reiserfs_unmap_buffer(cur);
+		}
+		start = end;
+		cur = following;
+	} while (cur != first);
+
+	/*
+	 * We release buffers only if the entire page is being invalidated.
+	 * The get_block cached value has been unconditionally invalidated,
+	 * so real IO is not possible anymore.
+	 */
+	if (offset == 0 && ret)
+		ret = try_to_release_page(page, 0);
+	return ret;
+}
+
+/*
+ * ->set_page_dirty: for inodes with data logging enabled
+ * (reiserfs_file_data_log), mark the page Checked and dirty it without
+ * touching its buffers; otherwise use the buffer-based dirtying helper.
+ */
+static int reiserfs_set_page_dirty(struct page *page) {
+	struct inode *inode = page->mapping->host;
+
+	if (!reiserfs_file_data_log(inode))
+		return __set_page_dirty_buffers(page);
+
+	SetPageChecked(page);
+	return __set_page_dirty_nobuffers(page);
+}
+
+/*
+ * Returns 1 if the page's buffers were dropped.  The page is locked.
+ *
+ * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
+ * in the buffers at page_buffers(page).
+ *
+ * Every buffer carrying b_private that is neither dirty nor locked has
+ * its reiserfs_jh freed; the first one that is dirty or locked stops the
+ * walk and makes us return 0 without calling try_to_free_buffers().
+ *
+ * even in -o notail mode, we can't be sure an old mount without -o notail
+ * didn't create files with tails.
+ */
+static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
+{
+	struct inode *inode = page->mapping->host ;
+	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
+	struct buffer_head *first ;
+	struct buffer_head *cur ;
+	int ret = 1 ;
+
+	WARN_ON(PageChecked(page));
+	spin_lock(&j->j_dirty_buffers_lock) ;
+	first = page_buffers(page) ;
+	cur = first ;
+	do {
+		if (cur->b_private) {
+			if (buffer_dirty(cur) || buffer_locked(cur)) {
+				ret = 0 ;
+				break ;
+			}
+			reiserfs_free_jh(cur);
+		}
+		cur = cur->b_this_page ;
+	} while (cur != first) ;
+	if (ret)
+		ret = try_to_free_buffers(page) ;
+	spin_unlock(&j->j_dirty_buffers_lock) ;
+	return ret ;
+}
+
+/* We thank Mingming Cao for helping us understand in great detail what
+   to do in this section of the code.
+
+   ->direct_IO: pass the request to blockdev_direct_IO() against the
+   inode's block device, using reiserfs_get_blocks_direct_io for block
+   mapping and no end_io callback. */
+static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
+				  const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct block_device *bdev = inode->i_sb->s_bdev;
+
+	return blockdev_direct_IO(rw, iocb, inode, bdev, iov,
+				  offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
+}
+
+/*
+ * ->setattr for reiserfs inodes.  Runs entirely under the reiserfs write
+ * lock.
+ *
+ *  - ATTR_SIZE: rejects sizes above MAX_NON_LFS on 3.5-format items with
+ *    -EFBIG; on an expanding truncate fills in hole pointers via
+ *    generic_cont_expand() and discards any preallocated blocks.
+ *  - uid/gid that do not fit in 16 bits are rejected with -EINVAL on v1
+ *    stat data.
+ *  - uid/gid changes update xattrs, transfer quota and set the new ids
+ *    inside one transaction.
+ *  - remaining attributes go through inode_setattr(); a mode change is
+ *    followed by reiserfs_acl_chmod() when POSIX ACLs are enabled.
+ *
+ * Returns 0 or a negative error.
+ */
+int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
+	struct inode *inode = dentry->d_inode ;
+	int error ;
+	unsigned int ia_valid = attr->ia_valid;
+	reiserfs_write_lock(inode->i_sb);
+	if (attr->ia_valid & ATTR_SIZE) {
+		/* version 2 items will be caught by the s_maxbytes check
+		** done for us in vmtruncate
+		*/
+		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
+		    attr->ia_size > MAX_NON_LFS) {
+			error = -EFBIG ;
+			goto out;
+		}
+		/* fill in hole pointers in the expanding truncate case. */
+		if (attr->ia_size > inode->i_size) {
+			error = generic_cont_expand(inode, attr->ia_size) ;
+			if (REISERFS_I(inode)->i_prealloc_count > 0) {
+				int err;
+				struct reiserfs_transaction_handle th ;
+				/* we're changing at most 2 bitmaps, inode + super */
+				err = journal_begin(&th, inode->i_sb, 4) ;
+				if (!err) {
+					reiserfs_discard_prealloc (&th, inode);
+					err = journal_end(&th, inode->i_sb, 4) ;
+				}
+				if (err)
+					error = err;
+			}
+			if (error)
+				goto out;
+		}
+	}
+
+	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
+	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
+	    (get_inode_sd_version (inode) == STAT_DATA_V1)) {
+		/* stat data of format v3.5 has 16 bit uid and gid */
+		error = -EINVAL;
+		goto out;
+	}
+
+	error = inode_change_ok(inode, attr) ;
+	if (!error) {
+		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+			error = reiserfs_chown_xattrs (inode, attr);
+
+			if (!error) {
+				struct reiserfs_transaction_handle th;
+				/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
+				int jbegin_count = 4*REISERFS_QUOTA_INIT_BLOCKS+2;
+
+				/* without a started transaction the handle must not be used */
+				error = journal_begin(&th, inode->i_sb, jbegin_count);
+				if (error)
+					goto out;
+				error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+				if (error) {
+					/* already failing with -EDQUOT; just close the transaction */
+					journal_end(&th, inode->i_sb, jbegin_count);
+					goto out;
+				}
+				/* Update corresponding info in inode so that everything is in
+				 * one transaction */
+				if (attr->ia_valid & ATTR_UID)
+					inode->i_uid = attr->ia_uid;
+				if (attr->ia_valid & ATTR_GID)
+					inode->i_gid = attr->ia_gid;
+				mark_inode_dirty(inode);
+				/* report a failed commit instead of dropping it */
+				error = journal_end(&th, inode->i_sb, jbegin_count);
+			}
+		}
+		if (!error)
+			error = inode_setattr(inode, attr) ;
+	}
+
+
+	if (!error && reiserfs_posixacl (inode->i_sb)) {
+		if (attr->ia_valid & ATTR_MODE)
+			error = reiserfs_acl_chmod (inode);
+	}
+
+out:
+	reiserfs_write_unlock(inode->i_sb);
+	return error ;
+}
+
+
+
+/* Page-cache method table for reiserfs mappings. */
+struct address_space_operations reiserfs_address_space_operations = {
+	/* reading */
+	.readpage = reiserfs_readpage,
+	.readpages = reiserfs_readpages,
+	.sync_page = block_sync_page,
+	/* writing */
+	.writepage = reiserfs_writepage,
+	.prepare_write = reiserfs_prepare_write,
+	.commit_write = reiserfs_commit_write,
+	.set_page_dirty = reiserfs_set_page_dirty,
+	.direct_IO = reiserfs_direct_IO,
+	/* page teardown */
+	.releasepage = reiserfs_releasepage,
+	.invalidatepage = reiserfs_invalidatepage,
+	/* block mapping */
+	.bmap = reiserfs_aop_bmap,
+} ;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
new file mode 100644
index 00000000000..94dc42475a0
--- /dev/null
+++ b/fs/reiserfs/ioctl.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/fs.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/time.h>
+#include <asm/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+
+static int reiserfs_unpack (struct inode * inode, struct file * filp);
+
+/*
+** reiserfs_ioctl - handler for ioctl for inode
+** supported commands:
+** 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
+** and prevent packing file (argument arg has to be non-zero)
+** 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
+** 3) That's all for a while ...
+**
+** Returns 0 on success, -ENOTTY for unknown commands (and for UNPACK on
+** non-regular files), -EROFS / -EPERM / -EFAULT as checked below, or the
+** error from reiserfs_unpack().
+*/
+int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+		    unsigned long arg)
+{
+	unsigned int flags;
+
+	switch (cmd) {
+	case REISERFS_IOC_UNPACK:
+		if( S_ISREG( inode -> i_mode ) ) {
+			if (arg)
+				return reiserfs_unpack (inode, filp);
+			else
+				return 0;
+		} else
+			return -ENOTTY;
+	/* following two cases are taken from fs/ext2/ioctl.c by Remy
+	   Card (card@masi.ibp.fr) */
+	case REISERFS_IOC_GETFLAGS: {
+		__u16 sd_attrs;
+
+		flags = REISERFS_I(inode) -> i_attrs;
+		/* i_attrs_to_sd_attrs() works on a 16-bit word.  Give it a
+		 * real __u16 holding the low half of flags instead of
+		 * casting &flags: the cast would update the high half on
+		 * big-endian machines. */
+		sd_attrs = (__u16) flags;
+		i_attrs_to_sd_attrs( inode, &sd_attrs );
+		flags = (flags & ~0xffffU) | sd_attrs;
+		return put_user(flags, (int __user *) arg);
+	}
+	case REISERFS_IOC_SETFLAGS: {
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EPERM;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		/* changing IMMUTABLE or APPEND needs CAP_LINUX_IMMUTABLE */
+		if ( ( ( flags ^ REISERFS_I(inode) -> i_attrs) & ( REISERFS_IMMUTABLE_FL | REISERFS_APPEND_FL)) &&
+		     !capable( CAP_LINUX_IMMUTABLE ) )
+			return -EPERM;
+
+		/* NOTAIL on a regular file: unpack any existing tail first */
+		if( ( flags & REISERFS_NOTAIL_FL ) &&
+		    S_ISREG( inode -> i_mode ) ) {
+			int result;
+
+			result = reiserfs_unpack( inode, filp );
+			if( result )
+				return result;
+		}
+		sd_attrs_to_i_attrs( flags, inode );
+		REISERFS_I(inode) -> i_attrs = flags;
+		inode->i_ctime = CURRENT_TIME_SEC;
+		mark_inode_dirty(inode);
+		return 0;
+	}
+	case REISERFS_IOC_GETVERSION:
+		return put_user(inode->i_generation, (int __user *) arg);
+	case REISERFS_IOC_SETVERSION:
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EPERM;
+		if (IS_RDONLY(inode))
+			return -EROFS;
+		if (get_user(inode->i_generation, (int __user *) arg))
+			return -EFAULT;
+		inode->i_ctime = CURRENT_TIME_SEC;
+		mark_inode_dirty(inode);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+/*
+** reiserfs_unpack
+** Tries to convert the file's tail from a direct item into an indirect
+** one, and sets i_nopack_mask in REISERFS_I(inode)->i_flags.
+**
+** Returns 0 on success (including when there is nothing to do),
+** -ENOMEM if the tail page cannot be grabbed, or the error from the
+** mapping's prepare_write / commit_write.
+*/
+static int reiserfs_unpack (struct inode * inode, struct file * filp)
+{
+	int retval = 0;
+	/* page index: must be as wide as grab_cache_page()'s argument, an
+	 * int would truncate i_size >> PAGE_CACHE_SHIFT on huge files */
+	unsigned long index ;
+	struct page *page ;
+	struct address_space *mapping ;
+	unsigned long write_from ;
+	unsigned long blocksize = inode->i_sb->s_blocksize ;
+
+	if (inode->i_size == 0) {
+		REISERFS_I(inode)->i_flags |= i_nopack_mask;
+		return 0 ;
+	}
+	/* ioctl already done */
+	if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
+		return 0 ;
+	}
+	reiserfs_write_lock(inode->i_sb);
+
+	/* we need to make sure nobody is changing the file size beneath
+	** us
+	*/
+	down(&inode->i_sem) ;
+
+	write_from = inode->i_size & (blocksize - 1) ;
+	/* if we are on a block boundary, we are already unpacked. */
+	if ( write_from == 0) {
+		REISERFS_I(inode)->i_flags |= i_nopack_mask;
+		goto out ;
+	}
+
+	/* we unpack by finding the page with the tail, and calling
+	** reiserfs_prepare_write on that page. This will force a
+	** reiserfs_get_block to unpack the tail for us.
+	*/
+	index = inode->i_size >> PAGE_CACHE_SHIFT ;
+	mapping = inode->i_mapping ;
+	page = grab_cache_page(mapping, index) ;
+	retval = -ENOMEM;
+	if (!page) {
+		goto out ;
+	}
+	retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
+	if (retval)
+		goto out_unlock ;
+
+	/* conversion can change page contents, must flush */
+	flush_dcache_page(page) ;
+	retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
+	REISERFS_I(inode)->i_flags |= i_nopack_mask;
+
+out_unlock:
+	unlock_page(page) ;
+	page_cache_release(page) ;
+
+out:
+	up(&inode->i_sem) ;
+	reiserfs_write_unlock(inode->i_sb);
+	return retval;
+}
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
new file mode 100644
index 00000000000..9cf7c13b120
--- /dev/null
+++ b/fs/reiserfs/item_ops.c
@@ -0,0 +1,788 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/time.h>
+#include <linux/reiserfs_fs.h>
+
+// this contains item handlers for old item types: sd, direct,
+// indirect, directory
+
+/* and where are the comments? how about saying where we can find an
+ explanation of each item handler method? -Hans */
+
+//////////////////////////////////////////////////////////////////////////////
+// stat data functions
+//
+/* bytes_number handler for stat data: always 0; both arguments are ignored */
+static int sd_bytes_number (struct item_head * ih, int block_size)
+{
+	const int bytes = 0;
+
+	return bytes;
+}
+
+/*
+ * decrement_key handler for stat data: step back one object id, then set
+ * the key type to TYPE_ANY and the offset to (loff_t)-1.
+ */
+static void sd_decrement_key (struct cpu_key * key)
+{
+	key->on_disk_key.k_objectid -= 1;
+	set_cpu_key_k_type (key, TYPE_ANY);
+	set_cpu_key_k_offset (key, (loff_t)(-1));
+}
+
+/* is_left_mergeable handler for stat data: always answers 0 */
+static int sd_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+{
+	const int mergeable = 0;
+
+	return mergeable;
+}
+
+
+
+/*
+ * Format @t as a decimal number.  The result lives in a function-static
+ * buffer that every call overwrites, so it must be consumed before the
+ * next call.
+ */
+static char * print_time (time_t t)
+{
+	static char text[256];
+
+	sprintf (text, "%ld", t);
+	return &text[0];
+}
+
+
+/*
+ * print_item handler for stat data: print a header line, then one line
+ * of fields decoded as v1 or v2 stat data depending on stat_data_v1(ih).
+ * The fourth column is first_direct_byte for v1 and rdev for v2.
+ */
+static void sd_print_item (struct item_head * ih, char * item)
+{
+	printk ("\tmode | size | nlinks | first direct | mtime\n");
+	if (!stat_data_v1 (ih)) {
+		struct stat_data * sd2 = (struct stat_data *)item;
+
+		printk ("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd2),
+			(unsigned long long)sd_v2_size(sd2), sd_v2_nlink(sd2),
+			sd_v2_rdev(sd2), print_time(sd_v2_mtime(sd2)));
+	} else {
+		struct stat_data_v1 * sd1 = (struct stat_data_v1 *)item;
+
+		printk ("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd1),
+			sd_v1_size(sd1), sd_v1_nlink(sd1), sd_v1_first_direct_byte(sd1),
+			print_time( sd_v1_mtime(sd1) ) );
+	}
+}
+
+/* check_item handler for stat data: currently performs no checks */
+static void sd_check_item (struct item_head * ih, char * item)
+{
+	/* FIXME: type something here! */
+	return;
+}
+
+
+/*
+ * create_vi handler for stat data: tag the virtual item with
+ * TYPE_STAT_DATA.  Always returns 0; @vn, @is_affected and @insert_size
+ * are unused.
+ */
+static int sd_create_vi (struct virtual_node * vn,
+			 struct virtual_item * vi,
+			 int is_affected,
+			 int insert_size)
+{
+	const int used = 0;
+
+	vi->vi_index = TYPE_STAT_DATA;
+	/* vi->vi_type |= VI_TYPE_STAT_DATA;  -- not needed? */
+	return used;
+}
+
+
+/*
+ * check_left handler for stat data: BUG()s if either skip count is
+ * non-zero, otherwise always returns -1.
+ */
+static int sd_check_left (struct virtual_item * vi, int free,
+			  int start_skip, int end_skip)
+{
+	if (start_skip != 0 || end_skip != 0) {
+		BUG ();
+	}
+	return -1;
+}
+
+
+/* check_right handler for stat data: always returns -1 */
+static int sd_check_right (struct virtual_item * vi, int free)
+{
+	const int none = -1;
+
+	return none;
+}
+
+/* part_size handler for stat data: BUG()s on a non-zero @count, else returns 0 */
+static int sd_part_size (struct virtual_item * vi, int first, int count)
+{
+	if (count != 0) {
+		BUG ();
+	}
+	return 0;
+}
+
+/* unit_num handler for stat data: the item length minus IH_SIZE */
+static int sd_unit_num (struct virtual_item * vi)
+{
+	int units = vi->vi_item_len - IH_SIZE;
+
+	return units;
+}
+
+
+/* print_vi handler for stat data: log index, type and item head via reiserfs_warning() */
+static void sd_print_vi (struct virtual_item * vi)
+{
+	reiserfs_warning (NULL,
+			  "STATDATA, index %d, type 0x%x, %h",
+			  vi->vi_index,
+			  vi->vi_type,
+			  vi->vi_ih);
+}
+
+/* item handler table for stat data items */
+static struct item_operations stat_data_ops = {
+	/* handlers taking the item head / key */
+	.bytes_number = sd_bytes_number,
+	.decrement_key = sd_decrement_key,
+	.is_left_mergeable = sd_is_left_mergeable,
+	.print_item = sd_print_item,
+	.check_item = sd_check_item,
+
+	/* handlers taking a virtual item */
+	.create_vi = sd_create_vi,
+	.check_left = sd_check_left,
+	.check_right = sd_check_right,
+	.part_size = sd_part_size,
+	.unit_num = sd_unit_num,
+	.print_vi = sd_print_vi,
+};
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+// direct item functions
+//
+/* bytes_number handler for direct items: the item length; @block_size is unused */
+static int direct_bytes_number (struct item_head * ih, int block_size)
+{
+	int bytes = ih_item_len(ih);
+
+	return bytes;
+}
+
+
+// FIXME: this should probably switch to indirect as well
+/* decrement_key handler for direct items: decrease the offset; if it
+   becomes 0, change the key type to stat data */
+static void direct_decrement_key (struct cpu_key * key)
+{
+	cpu_key_k_offset_dec (key);
+	if (cpu_key_k_offset (key) != 0)
+		return;
+	set_cpu_key_k_type (key, TYPE_STAT_DATA);
+}
+
+
+/*
+ * is_left_mergeable handler for direct items: returns 0 when the key
+ * offset masked with (bsize - 1) equals 1, and 1 otherwise.
+ */
+static int direct_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+{
+	int version = le_key_version (key);
+
+	if ((le_key_k_offset (version, key) & (bsize - 1)) == 1)
+		return 0;
+	return 1;
+}
+
+
+/* print_item handler for direct items: dump the item body, one character
+   at a time, between double quotes */
+static void direct_print_item (struct item_head * ih, char * item)
+{
+	int pos;
+
+	printk ("\"");
+	for (pos = 0; pos < ih_item_len(ih); pos ++)
+		printk ("%c", item[pos]);
+	printk ("\"\n");
+}
+
+
+/* check_item handler for direct items: currently performs no checks */
+static void direct_check_item (struct item_head * ih, char * item)
+{
+	/* FIXME: type something here! */
+	return;
+}
+
+
+/*
+ * create_vi handler for direct items: tag the virtual item with
+ * TYPE_DIRECT.  Always returns 0; the other arguments are unused.
+ */
+static int direct_create_vi (struct virtual_node * vn,
+			     struct virtual_item * vi,
+			     int is_affected,
+			     int insert_size)
+{
+	const int used = 0;
+
+	vi->vi_index = TYPE_DIRECT;
+	/* vi->vi_type |= VI_TYPE_DIRECT; */
+	return used;
+}
+
+/*
+ * check_left handler for direct items: round @free down to a multiple of
+ * 8 and return it, or -1 when that leaves nothing.  @vi and the skip
+ * counts are unused.
+ */
+static int direct_check_left (struct virtual_item * vi, int free,
+			      int start_skip, int end_skip)
+{
+	int usable = free - free % 8;
+
+	if (usable == 0)
+		return -1;
+	return usable;
+}
+
+
+/* check_right handler for direct items: same computation as
+   direct_check_left() with both skip counts zero */
+static int direct_check_right (struct virtual_item * vi, int free)
+{
+	int bytes = direct_check_left (vi, free, 0, 0);
+
+	return bytes;
+}
+
+/* part_size handler for direct items: returns @count unchanged */
+static int direct_part_size (struct virtual_item * vi, int first, int count)
+{
+	int size = count;
+
+	return size;
+}
+
+
+/* unit_num handler for direct items: the item length minus IH_SIZE */
+static int direct_unit_num (struct virtual_item * vi)
+{
+	int units = vi->vi_item_len - IH_SIZE;
+
+	return units;
+}
+
+
+/* print_vi handler for direct items: log index, type and item head via reiserfs_warning() */
+static void direct_print_vi (struct virtual_item * vi)
+{
+	reiserfs_warning (NULL,
+			  "DIRECT, index %d, type 0x%x, %h",
+			  vi->vi_index,
+			  vi->vi_type,
+			  vi->vi_ih);
+}
+
+/* item handler table for direct items */
+static struct item_operations direct_ops = {
+	/* handlers taking the item head / key */
+	.bytes_number = direct_bytes_number,
+	.decrement_key = direct_decrement_key,
+	.is_left_mergeable = direct_is_left_mergeable,
+	.print_item = direct_print_item,
+	.check_item = direct_check_item,
+
+	/* handlers taking a virtual item */
+	.create_vi = direct_create_vi,
+	.check_left = direct_check_left,
+	.check_right = direct_check_right,
+	.part_size = direct_part_size,
+	.unit_num = direct_unit_num,
+	.print_vi = direct_print_vi,
+};
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+// indirect item functions
+//
+
+/* bytes_number handler for indirect items: the number of UNFM_P_SIZE-sized
+   pointers in the item times @block_size */
+static int indirect_bytes_number (struct item_head * ih, int block_size)
+{
+	return I_UNFM_NUM (ih) * block_size; //- get_ih_free_space (ih);
+}
+
+
+// decrement_key handler for indirect items: decrease offset, if it
+// becomes 0, change type to stat data
+static void indirect_decrement_key (struct cpu_key * key)
+{
+	cpu_key_k_offset_dec (key);
+	if (cpu_key_k_offset (key) != 0)
+		return;
+	set_cpu_key_k_type (key, TYPE_STAT_DATA);
+}
+
+
+// if it is not first item of the body (key offset 1), then it is mergeable
+static int indirect_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+{
+	int version = le_key_version (key);
+
+	if (le_key_k_offset (version, key) == 1)
+		return 0;
+	return 1;
+}
+
+
+// printing of indirect item
+/* begin a run for the indirect-item printer: the run starts at block
+   number @new and has length 1 */
+static void start_new_sequence (__u32 * start, int * len, __u32 new)
+{
+	start[0] = new;
+	len[0] = 1;
+}
+
+
+/*
+ * Decide whether block number @new continues the run described by
+ * (@start, *@len).  A run of zeros continues with another zero; a run of
+ * non-zero blocks continues with start + *len.  When the run continues,
+ * *@len is incremented and 0 is returned; otherwise 1 is returned and
+ * *@len is untouched.  @start == INT_MAX means "no run yet" and always
+ * yields 1 without reading *@len.
+ */
+static int sequence_finished (__u32 start, int * len, __u32 new)
+{
+	if (start == INT_MAX)
+		return 1;
+
+	if ((start == 0 && new == 0) ||
+	    (start != 0 && (start + *len) == new)) {
+		*len += 1;
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Print one run of the indirect-item dump: " start" for a single block,
+ * " start(len)" for a longer run.  @start == INT_MAX means there is no
+ * run to print.
+ */
+static void print_sequence (__u32 start, int len)
+{
+	if (start == INT_MAX)
+		return;
+
+	/* start is an unsigned 32-bit block number: %u, not %d, so values
+	 * with the top bit set do not come out negative */
+	if (len == 1)
+		printk (" %u", start);
+	else
+		printk (" %u(%d)", start, len);
+}
+
+
+/*
+ * print_item handler for indirect items: print the pointer count, then
+ * the block pointers collapsed into runs ("start" or "start(len)").
+ * Warns if the item length is not a multiple of UNFM_P_SIZE.
+ */
+static void indirect_print_item (struct item_head * ih, char * item)
+{
+	int j;
+	__u32 * unp, prev = INT_MAX;
+	/* initialised because it is passed by value to print_sequence()
+	 * even when the item holds no pointers and no run was started */
+	int num = 0;
+
+	unp = (__u32 *)item;
+
+	if (ih_item_len(ih) % UNFM_P_SIZE)
+		reiserfs_warning (NULL, "indirect_print_item: invalid item len");
+
+	printk ("%d pointers\n[ ", (int)I_UNFM_NUM (ih));
+	for (j = 0; j < I_UNFM_NUM (ih); j ++) {
+		if (sequence_finished (prev, &num, get_block_num(unp, j))) {
+			/* flush the finished run and start a new one here */
+			print_sequence (prev, num);
+			start_new_sequence (&prev, &num, get_block_num(unp, j));
+		}
+	}
+	print_sequence (prev, num);
+	printk ("]\n");
+}
+
+/* check_item handler for indirect items: currently performs no checks */
+static void indirect_check_item (struct item_head * ih, char * item)
+{
+	/* FIXME: type something here! */
+	return;
+}
+
+
+/*
+ * create_vi handler for indirect items: tag the virtual item with
+ * TYPE_INDIRECT.  Always returns 0; the other arguments are unused.
+ */
+static int indirect_create_vi (struct virtual_node * vn,
+			       struct virtual_item * vi,
+			       int is_affected,
+			       int insert_size)
+{
+	const int used = 0;
+
+	vi->vi_index = TYPE_INDIRECT;
+	/* vi->vi_type |= VI_TYPE_INDIRECT; */
+	return used;
+}
+
+/*
+ * check_left handler for indirect items: round @free down to a multiple
+ * of UNFM_P_SIZE and return it, or -1 when that leaves nothing.  @vi and
+ * the skip counts are unused.
+ */
+static int indirect_check_left (struct virtual_item * vi, int free,
+				int start_skip, int end_skip)
+{
+	int bytes;
+
+	bytes = free - free % UNFM_P_SIZE;
+	if (!bytes)
+		return -1;
+	return bytes;
+}
+
+
+/* check_right handler for indirect items: same computation as
+   indirect_check_left() with both skip counts zero */
+static int indirect_check_right (struct virtual_item * vi, int free)
+{
+	int bytes = indirect_check_left (vi, free, 0, 0);
+
+	return bytes;
+}
+
+
+
+// return size in bytes of 'units' units. If first == 0 - calculate from
+// the head (left), otherwise - from tail (right).  As written, 'first'
+// is not consulted and 'units' is returned unchanged.
+static int indirect_part_size (struct virtual_item * vi, int first, int units)
+{
+	// unit of indirect item is byte (yet)
+	int size = units;
+
+	return size;
+}
+
+/* unit_num handler for indirect items: the item length minus IH_SIZE */
+static int indirect_unit_num (struct virtual_item * vi)
+{
+	// unit of indirect item is byte (yet)
+	int units = vi->vi_item_len - IH_SIZE;
+
+	return units;
+}
+
+/* print_vi handler for indirect items: log index, type and item head via reiserfs_warning() */
+static void indirect_print_vi (struct virtual_item * vi)
+{
+	reiserfs_warning (NULL,
+			  "INDIRECT, index %d, type 0x%x, %h",
+			  vi->vi_index,
+			  vi->vi_type,
+			  vi->vi_ih);
+}
+
+/* item handler table for indirect items */
+static struct item_operations indirect_ops = {
+	/* handlers taking the item head / key */
+	.bytes_number = indirect_bytes_number,
+	.decrement_key = indirect_decrement_key,
+	.is_left_mergeable = indirect_is_left_mergeable,
+	.print_item = indirect_print_item,
+	.check_item = indirect_check_item,
+
+	/* handlers taking a virtual item */
+	.create_vi = indirect_create_vi,
+	.check_left = indirect_check_left,
+	.check_right = indirect_check_right,
+	.part_size = indirect_part_size,
+	.unit_num = indirect_unit_num,
+	.print_vi = indirect_print_vi,
+};
+
+
+//////////////////////////////////////////////////////////////////////////////
+// direntry functions
+//
+
+
+/* bytes_number handler for directory items: logs warning vs-16090 and
+   returns 0; both arguments are unused */
+static int direntry_bytes_number (struct item_head * ih, int block_size)
+{
+	const int bytes = 0;
+
+	reiserfs_warning (NULL, "vs-16090: direntry_bytes_number: "
+			  "bytes number is asked for direntry");
+	return bytes;
+}
+
+/* decrement_key handler for directory items: decrease the offset; if it
+   becomes 0, change the key type to stat data */
+static void direntry_decrement_key (struct cpu_key * key)
+{
+	cpu_key_k_offset_dec (key);
+	if (cpu_key_k_offset (key) != 0)
+		return;
+	set_cpu_key_k_type (key, TYPE_STAT_DATA);
+}
+
+
+/* is_left_mergeable handler for directory items: 0 when the v1 key
+   offset equals DOT_OFFSET, 1 otherwise; @bsize is unused */
+static int direntry_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+{
+	return le32_to_cpu (key->u.k_offset_v1.k_offset) != DOT_OFFSET;
+}
+
+
+/*
+ * print_item handler for directory items: print a header, then one line
+ * per entry with its quoted name, the key of the object it points to,
+ * hash value, generation number and HIDDEN/VISIBLE status.
+ *
+ * An entry's name runs from its own location up to the previous entry's
+ * location (or to the end of the item for entry 0).  Names are copied
+ * into a function-static 80-byte buffer and truncated to fit.
+ */
+static void direntry_print_item (struct item_head * ih, char * item)
+{
+	static char namebuf [80];
+	struct reiserfs_de_head * deh = (struct reiserfs_de_head *)item;
+	int idx;
+
+	printk ("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", "Key of pointed object", "Hash", "Gen number", "Status");
+
+	for (idx = 0; idx < I_ENTRY_COUNT (ih); idx ++, deh ++) {
+		int name_end = idx ? (deh_location(deh - 1)) : ih_item_len(ih);
+		int namelen = name_end - deh_location(deh);
+		char * name = item + deh_location(deh);
+
+		/* a trailing NUL means the stored name is padded */
+		if (name[namelen-1] == 0)
+			namelen = strlen (name);
+
+		namebuf[0] = '"';
+		if (namelen <= sizeof (namebuf) - 3) {
+			memcpy (namebuf + 1, name, namelen);
+			namebuf[namelen + 1] = '"';
+			namebuf[namelen + 2] = 0;
+		} else {
+			/* too long: keep what fits between the quotes */
+			strncpy (namebuf + 1, name, sizeof (namebuf) - 3);
+			namebuf[sizeof (namebuf) - 2] = '"';
+			namebuf[sizeof (namebuf) - 1] = 0;
+		}
+
+		printk ("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n",
+			idx, namebuf,
+			deh_dir_id(deh), deh_objectid(deh),
+			GET_HASH_VALUE (deh_offset (deh)), GET_GENERATION_NUMBER ((deh_offset (deh))),
+			(de_hidden (deh)) ? "HIDDEN" : "VISIBLE");
+	}
+}
+
+
+/* check_item handler for directory items: walks every entry head but
+   does not check anything yet */
+static void direntry_check_item (struct item_head * ih, char * item)
+{
+	struct reiserfs_de_head * deh = (struct reiserfs_de_head *)item;
+	int idx = 0;
+
+	/* FIXME: type something here! */
+	while (idx < I_ENTRY_COUNT (ih)) {
+		idx ++;
+		deh ++;
+	}
+}
+
+
+
+#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
+
+/*
+ * Map an entry number in the virtual item (virtual node) back to the
+ * entry number in the directory item of the real node.
+ *
+ * The number is unchanged for M_INSERT / M_DELETE, when this item is not
+ * the affected one, or for entries before @pos_in_item.  From
+ * @pos_in_item on, a cut shifts the old number up by one and a paste
+ * shifts it down by one.
+ */
+static inline int old_entry_num (int is_affected, int virtual_entry_num, int pos_in_item, int mode)
+{
+	/* the last two tests cover "cut or paste is applied to another
+	   item" and "entry lies before the change" */
+	if (mode == M_INSERT || mode == M_DELETE ||
+	    !is_affected ||
+	    virtual_entry_num < pos_in_item)
+		return virtual_entry_num;
+
+	if (mode == M_CUT)
+		return virtual_entry_num + 1;
+
+	RFALSE( mode != M_PASTE || virtual_entry_num == 0,
+		"vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", mode);
+
+	return virtual_entry_num - 1;
+}
+
+
+
+
+/* Create an array of sizes of directory entries for virtual
+   item. Return space used by an item. FIXME: no control over
+   consuming of space used by this item handler
+
+   The array lives in vi->vi_uarea (struct direntry_uarea).  Its
+   entry_count is the real item's entry count, adjusted by -1 / +1 when
+   this item is the one affected by a cut / paste.  Each entry size is the
+   name length plus DEH_SIZE; a pasted entry gets @insert_size.  BUG()s
+   if the virtual item has no item head or no item body. */
+static int direntry_create_vi (struct virtual_node * vn,
+			       struct virtual_item * vi,
+			       int is_affected,
+			       int insert_size)
+{
+	struct direntry_uarea * dir_u = vi->vi_uarea;
+	struct reiserfs_de_head * deh;
+	int size = sizeof (struct direntry_uarea);
+	int delta = 0;
+	int i;
+
+	vi->vi_index = TYPE_DIRENTRY;
+
+	if (!(vi->vi_ih) || !vi->vi_item)
+		BUG ();
+
+	dir_u->flags = 0;
+	if (le_ih_k_offset (vi->vi_ih) == DOT_OFFSET)
+		dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
+
+	deh = (struct reiserfs_de_head *)(vi->vi_item);
+
+	/* virtual directory item have this amount of entry after */
+	if (is_affected) {
+		if (vn->vn_mode == M_CUT)
+			delta = -1;
+		else if (vn->vn_mode == M_PASTE)
+			delta = 1;
+	}
+	dir_u->entry_count = ih_entry_count (vi->vi_ih) + delta;
+
+	for (i = 0; i < dir_u->entry_count; i ++) {
+		int j = old_entry_num (is_affected, i, vn->vn_pos_in_item, vn->vn_mode);
+		/* a name ends where the previous entry's name starts,
+		   or at the end of the item for the first entry */
+		int name_end = j ? deh_location( &(deh[j - 1]) ) :
+				   ih_item_len (vi->vi_ih);
+
+		dir_u->entry_sizes[i] = name_end -
+			deh_location( &(deh[j])) + DEH_SIZE;
+	}
+
+	size += (dir_u->entry_count * sizeof (short));
+
+	/* set size of pasted entry */
+	if (is_affected && vn->vn_mode == M_PASTE)
+		dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
+
+#ifdef CONFIG_REISERFS_CHECK
+	/* compare total size of entries with item length */
+	{
+		int k;
+		int total = 0;
+		int expected = vi->vi_item_len;
+
+		for (k = 0; k < dir_u->entry_count; k ++)
+			total += dir_u->entry_sizes[k];
+
+		if (is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT))
+			expected += insert_size;
+
+		if (total + IH_SIZE != expected) {
+			reiserfs_panic (NULL, "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item",
+					vn->vn_mode, insert_size);
+		}
+	}
+#endif
+
+	return size;
+}
+
+
+//
+// Count how many entries, taken in order starting at start_skip and
+// stopping end_skip entries before the end, fit into `free' bytes.
+// Returns that count, or -1 if not even one entry fits.
+//
+static int direntry_check_left (struct virtual_item * vi, int free,
+				int start_skip, int end_skip)
+{
+	struct direntry_uarea * dir_u = vi->vi_uarea;
+	int limit = dir_u->entry_count - end_skip;
+	int pos = start_skip;
+	int fit = 0;
+
+	/* stop at the first entry that no longer fits */
+	while (pos < limit && dir_u->entry_sizes[pos] <= free) {
+		free -= dir_u->entry_sizes[pos];
+		fit ++;
+		pos ++;
+	}
+
+	/* the caller never expects the whole item to fit */
+	if (fit == dir_u->entry_count) {
+		reiserfs_panic (NULL, "free space %d, entry_count %d\n", free, dir_u->entry_count);
+	}
+
+	/* "." and ".." can not be separated from each other */
+	if (fit < 2 && start_skip == 0 &&
+	    (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM))
+		fit = 0;
+
+	return fit ? fit : -1;
+}
+
+
+/*
+ * Count how many entries, taken from the end of the item backwards, fit
+ * into `free' bytes.  Returns that count, or -1 if it is zero.
+ */
+static int direntry_check_right (struct virtual_item * vi, int free)
+{
+	struct direntry_uarea * dir_u = vi->vi_uarea;
+	int pos = dir_u->entry_count;
+	int fit = 0;
+
+	/* walk from the last entry down until one does not fit */
+	while (pos > 0 && dir_u->entry_sizes[pos - 1] <= free) {
+		pos --;
+		free -= dir_u->entry_sizes[pos];
+		fit ++;
+	}
+
+	/* the caller never expects the whole item to fit */
+	if (fit == dir_u->entry_count)
+		BUG ();
+
+	/* "." and ".." can not be separated from each other */
+	if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) &&
+	    fit > dir_u->entry_count - 2)
+		fit = dir_u->entry_count - 2;
+
+	return fit ? fit : -1;
+}
+
+
+/*
+ * Sum of the sizes of `count' consecutive entries: the first `count'
+ * entries when first == 0, otherwise the last `count' entries.
+ */
+static int direntry_part_size (struct virtual_item * vi, int first, int count)
+{
+	struct direntry_uarea * dir_u = vi->vi_uarea;
+	int start = (first == 0) ? 0 : dir_u->entry_count - count;
+	int total = 0;
+	int k;
+
+	for (k = 0; k < count; k ++)
+		total += dir_u->entry_sizes[start + k];
+
+	return total;
+}
+
+/* Number of units in a directory item: its entry count. */
+static int direntry_unit_num (struct virtual_item * vi)
+{
+	return ((struct direntry_uarea *)vi->vi_uarea)->entry_count;
+}
+
+
+
+/* Debug dump of a virtual directory item: header line, then every entry size. */
+static void direntry_print_vi (struct virtual_item * vi)
+{
+	struct direntry_uarea * dir_u = vi->vi_uarea;
+	int n = 0;
+
+	reiserfs_warning (NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
+			  vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
+	printk ("%d entries: ", dir_u->entry_count);
+	while (n < dir_u->entry_count) {
+		printk ("%d ", dir_u->entry_sizes[n]);
+		n ++;
+	}
+	printk ("\n");
+}
+
+/* Operation table for directory entry items (TYPE_DIRENTRY). */
+static struct item_operations direntry_ops = {
+	/* operations on the real item */
+	.bytes_number		= direntry_bytes_number,
+	.decrement_key		= direntry_decrement_key,
+	.is_left_mergeable	= direntry_is_left_mergeable,
+	.print_item		= direntry_print_item,
+	.check_item		= direntry_check_item,
+
+	/* operations on the virtual item */
+	.create_vi		= direntry_create_vi,
+	.check_left		= direntry_check_left,
+	.check_right		= direntry_check_right,
+	.part_size		= direntry_part_size,
+	.unit_num		= direntry_unit_num,
+	.print_vi		= direntry_print_vi
+};
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Error catching functions to catch errors caused by incorrect item types.
+//
+/* bytes_number handler for an invalid item type: warn and report 0 bytes. */
+static int errcatch_bytes_number (struct item_head * ih, int block_size)
+{
+	reiserfs_warning (NULL,
+			  "green-16001: Invalid item type observed, run fsck ASAP");
+
+	return 0;
+}
+
+/* decrement_key handler for an invalid item type: warn, leave the key alone. */
+static void errcatch_decrement_key (struct cpu_key * key)
+{
+	reiserfs_warning (NULL,
+			  "green-16002: Invalid item type observed, run fsck ASAP");
+}
+
+
+/* is_left_mergeable handler for an invalid item type: warn, never mergeable. */
+static int errcatch_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+{
+	reiserfs_warning (NULL,
+			  "green-16003: Invalid item type observed, run fsck ASAP");
+
+	return 0;
+}
+
+
+/* print_item handler for an invalid item type: only emits the warning. */
+static void errcatch_print_item (struct item_head * ih, char * item)
+{
+	reiserfs_warning (NULL,
+			  "green-16004: Invalid item type observed, run fsck ASAP");
+}
+
+
+/* check_item handler for an invalid item type: only emits the warning. */
+static void errcatch_check_item (struct item_head * ih, char * item)
+{
+	reiserfs_warning (NULL,
+			  "green-16005: Invalid item type observed, run fsck ASAP");
+}
+
+/*
+ * create_vi handler for an invalid item type: warn and report that no
+ * space was used.
+ */
+static int errcatch_create_vi (struct virtual_node * vn,
+			       struct virtual_item * vi,
+			       int is_affected,
+			       int insert_size)
+{
+	reiserfs_warning (NULL,
+			  "green-16006: Invalid item type observed, run fsck ASAP");
+
+	// We might return -1 here as well, but it won't help as
+	// create_virtual_node() from where this operation is called from
+	// is of return type void.
+	return 0;
+}
+
+/* check_left handler for an invalid item type: warn, nothing fits (-1). */
+static int errcatch_check_left (struct virtual_item * vi, int free,
+				int start_skip, int end_skip)
+{
+	reiserfs_warning (NULL,
+			  "green-16007: Invalid item type observed, run fsck ASAP");
+
+	return -1;
+}
+
+
+/* check_right handler for an invalid item type: warn, nothing fits (-1). */
+static int errcatch_check_right (struct virtual_item * vi, int free)
+{
+	reiserfs_warning (NULL,
+			  "green-16008: Invalid item type observed, run fsck ASAP");
+
+	return -1;
+}
+
+/* part_size handler for an invalid item type: warn and report size 0. */
+static int errcatch_part_size (struct virtual_item * vi, int first, int count)
+{
+	reiserfs_warning (NULL,
+			  "green-16009: Invalid item type observed, run fsck ASAP");
+
+	return 0;
+}
+
+/* unit_num handler for an invalid item type: warn and report 0 units. */
+static int errcatch_unit_num (struct virtual_item * vi)
+{
+	reiserfs_warning (NULL,
+			  "green-16010: Invalid item type observed, run fsck ASAP");
+
+	return 0;
+}
+
+/* print_vi handler for an invalid item type: only emits the warning. */
+static void errcatch_print_vi (struct virtual_item * vi)
+{
+	reiserfs_warning (NULL,
+			  "green-16011: Invalid item type observed, run fsck ASAP");
+}
+
+/*
+ * Operation table installed for invalid item types.  Every handler only
+ * warns and returns a harmless value.  Designated initializers are used,
+ * as in direntry_ops, so the table does not depend on the field order of
+ * struct item_operations.
+ */
+static struct item_operations errcatch_ops = {
+	.bytes_number		= errcatch_bytes_number,
+	.decrement_key		= errcatch_decrement_key,
+	.is_left_mergeable	= errcatch_is_left_mergeable,
+	.print_item		= errcatch_print_item,
+	.check_item		= errcatch_check_item,
+
+	.create_vi		= errcatch_create_vi,
+	.check_left		= errcatch_check_left,
+	.check_right		= errcatch_check_right,
+	.part_size		= errcatch_part_size,
+	.unit_num		= errcatch_unit_num,
+	.print_vi		= errcatch_print_vi
+};
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//
+#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
+ do not compile
+#endif
+
+/*
+ * Item operation tables indexed by item type.  Slots are named by their
+ * type constant instead of by position, so the table stays correct
+ * without hand-counting NULL fillers; every slot not listed is NULL.
+ */
+struct item_operations * item_ops [TYPE_ANY + 1] = {
+	[TYPE_STAT_DATA]	= &stat_data_ops,
+	[TYPE_INDIRECT]		= &indirect_ops,
+	[TYPE_DIRECT]		= &direct_ops,
+	[TYPE_DIRENTRY]		= &direntry_ops,
+	/* This is to catch errors with invalid type */
+	[TYPE_ANY]		= &errcatch_ops
+};
+
+
+
+
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
new file mode 100644
index 00000000000..c9ad3a7849f
--- /dev/null
+++ b/fs/reiserfs/journal.c
@@ -0,0 +1,3876 @@
+/*
+** Write ahead logging implementation copyright Chris Mason 2000
+**
+** The background commits make this code very interrelated, and
+** overly complex. I need to rethink things a bit....The major players:
+**
+** journal_begin -- call with the number of blocks you expect to log.
+** If the current transaction is too
+** old, it will block until the current transaction is
+** finished, and then start a new one.
+** Usually, your transaction will get joined in with
+** previous ones for speed.
+**
+** journal_join -- same as journal_begin, but won't block on the current
+** transaction regardless of age. Don't ever call
+** this. Ever. There are only two places it should be
+** called from, and they are both inside this file.
+**
+** journal_mark_dirty -- adds blocks into this transaction. clears any flags
+** that might make them get sent to disk
+** and then marks them BH_JDirty. Puts the buffer head
+** into the current transaction hash.
+**
+** journal_end -- if the current transaction is batchable, it does nothing
+** otherwise, it could do an async/synchronous commit, or
+** a full flush of all log and real blocks in the
+** transaction.
+**
+** flush_old_commits -- if the current transaction is too old, it is ended and
+** commit blocks are sent to disk. Forces commit blocks
+** to disk for all backgrounded commits that have been
+** around too long.
+** -- Note, if you call this as an immediate flush
+** from within kupdate, it will ignore the immediate flag
+*/
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/time.h>
+#include <asm/semaphore.h>
+
+#include <linux/vmalloc.h>
+#include <linux/reiserfs_fs.h>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+
+
+/* gets a struct reiserfs_journal_list * from a list head */
+#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+ j_list))
+#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+ j_working_list))
+
+/* the number of mounted filesystems. This is used to decide when to
+** start and kill the commit workqueue
+*/
+static int reiserfs_mounted_fs_count;
+
+static struct workqueue_struct *commit_wq;
+
+#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit
+ structs at 4k */
+#define BUFNR 64 /*read ahead */
+
+/* cnode stat bits. Move these into reiserfs_fs.h */
+
+#define BLOCK_FREED 2 /* this block was freed, and can't be written. */
+#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */
+
+#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */
+#define BLOCK_DIRTIED 5
+
+
+/* journal list state bits */
+#define LIST_TOUCHED 1
+#define LIST_DIRTY 2
+#define LIST_COMMIT_PENDING 4 /* someone will commit this list */
+
+/* flags for do_journal_end */
+#define FLUSH_ALL 1 /* flush commit and real blocks */
+#define COMMIT_NOW 2 /* end and commit this transaction */
+#define WAIT 4 /* wait for the log blocks to hit the disk*/
+
+static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
+static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
+static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
+static int can_dirty(struct reiserfs_journal_cnode *cn) ;
+static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks);
+static int release_journal_dev( struct super_block *super,
+ struct reiserfs_journal *journal );
+static int dirty_one_transaction(struct super_block *s,
+ struct reiserfs_journal_list *jl);
+static void flush_async_commits(void *p);
+static void queue_log_writer(struct super_block *s);
+
+/* values for join in do_journal_begin_r */
+enum {
+ JBEGIN_REG = 0, /* regular journal begin */
+ JBEGIN_JOIN = 1, /* join the running transaction if at all possible */
+ JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */
+};
+
+static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
+ struct super_block * p_s_sb,
+ unsigned long nblocks,int join);
+
+/* Empty the current-transaction hash table by zeroing every bucket. */
+static void init_journal_hash(struct super_block *p_s_sb) {
+	struct reiserfs_journal *j = SB_JOURNAL (p_s_sb);
+
+	memset(j->j_hash_table, 0,
+	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
+}
+
+/*
+** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to
+** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for
+** more details.
+**
+** A NULL bh is accepted and ignored.  Always returns 0.
+*/
+static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
+	if (!bh)
+		return 0 ;
+
+	clear_buffer_dirty(bh);
+	clear_buffer_journal_test(bh);
+	return 0 ;
+}
+
+/* Clear the REISERFS_BARRIER_FLUSH mount option bit and log that we did. */
+static void disable_barrier(struct super_block *s)
+{
+	REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
+
+	printk("reiserfs: disabling flush barriers on %s\n",
+	       reiserfs_bdevname(s));
+}
+
+/*
+ * Allocate one bitmap node plus a zeroed data buffer of one filesystem
+ * block.  Each node gets the next value of a function-local id counter.
+ * Returns NULL if either allocation fails.
+ */
+static struct reiserfs_bitmap_node *
+allocate_bitmap_node(struct super_block *p_s_sb) {
+	static int id;
+	struct reiserfs_bitmap_node *node ;
+
+	node = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, p_s_sb) ;
+	if (!node)
+		return NULL ;
+
+	node->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb) ;
+	if (!node->data) {
+		/* don't leak the node header when the data block fails */
+		reiserfs_kfree(node, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
+		return NULL ;
+	}
+
+	node->id = id++ ;
+	memset(node->data, 0, p_s_sb->s_blocksize) ;
+	INIT_LIST_HEAD(&node->list) ;
+	return node ;
+}
+
+/*
+ * Get a zeroed bitmap node, preferring the journal's list of free nodes
+ * and falling back to a fresh allocation.  Never returns NULL: when the
+ * allocation fails it yields and tries again.
+ *
+ * The head of the free list is looked at again on every attempt, so a
+ * node handed back by free_bitmap_node() while we yielded is picked up
+ * instead of retrying the allocation only.
+ */
+static struct reiserfs_bitmap_node *
+get_bitmap_node(struct super_block *p_s_sb) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	struct reiserfs_bitmap_node *bn ;
+	struct list_head *entry ;
+
+	journal->j_used_bitmap_nodes++ ;
+	for (;;) {
+		entry = journal->j_bitmap_nodes.next ;
+		if (entry != &journal->j_bitmap_nodes) {
+			/* reuse a cached node; wipe whatever it held */
+			bn = list_entry(entry, struct reiserfs_bitmap_node, list) ;
+			list_del(entry) ;
+			memset(bn->data, 0, p_s_sb->s_blocksize) ;
+			journal->j_free_bitmap_nodes-- ;
+			return bn ;
+		}
+
+		bn = allocate_bitmap_node(p_s_sb) ;
+		if (bn)
+			return bn ;
+
+		yield();
+	}
+}
+/*
+ * Give a bitmap node back.  It is cached on the journal's free list
+ * unless more than REISERFS_MAX_BITMAP_NODES are cached already, in which
+ * case its memory is released.
+ */
+static inline void free_bitmap_node(struct super_block *p_s_sb,
+				    struct reiserfs_bitmap_node *bn) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+
+	journal->j_used_bitmap_nodes-- ;
+
+	if (journal->j_free_bitmap_nodes <= REISERFS_MAX_BITMAP_NODES) {
+		list_add(&bn->list, &journal->j_bitmap_nodes) ;
+		journal->j_free_bitmap_nodes++ ;
+		return ;
+	}
+
+	reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ;
+	reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
+}
+
+/*
+ * Pre-fill the journal's free list with up to REISERFS_MIN_BITMAP_NODES
+ * nodes.  Stops quietly at the first allocation failure.
+ */
+static void allocate_bitmap_nodes(struct super_block *p_s_sb) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	struct reiserfs_bitmap_node *node ;
+	int made ;
+
+	for (made = 0 ; made < REISERFS_MIN_BITMAP_NODES ; made++) {
+		node = allocate_bitmap_node(p_s_sb) ;
+		if (!node)
+			break ; // this is ok, we'll try again when more are needed
+
+		list_add(&node->list, &journal->j_bitmap_nodes) ;
+		journal->j_free_bitmap_nodes++ ;
+	}
+}
+
+/*
+ * Set the bit for `block' in the list bitmap jb, fetching a bitmap node
+ * for the covering slot first if it has none yet.  Always returns 0.
+ */
+static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block,
+				  struct reiserfs_list_bitmap *jb) {
+	int bmap_nr = block / (p_s_sb->s_blocksize << 3) ;
+	int bit_nr = block % (p_s_sb->s_blocksize << 3) ;
+	struct reiserfs_bitmap_node **slot = &jb->bitmaps[bmap_nr] ;
+
+	if (*slot == NULL)
+		*slot = get_bitmap_node(p_s_sb) ;
+
+	set_bit(bit_nr, (unsigned long *)(*slot)->data) ;
+	return 0 ;
+}
+
+/*
+ * Hand every bitmap node held by jb back via free_bitmap_node() and
+ * clear the slots.  Does nothing if jb has no slot array.
+ */
+static void cleanup_bitmap_list(struct super_block *p_s_sb,
+				struct reiserfs_list_bitmap *jb) {
+	int slot;
+
+	if (jb->bitmaps == NULL)
+		return;
+
+	for (slot = 0 ; slot < SB_BMAP_NR(p_s_sb) ; slot++) {
+		if (!jb->bitmaps[slot])
+			continue ;
+
+		free_bitmap_node(p_s_sb, jb->bitmaps[slot]) ;
+		jb->bitmaps[slot] = NULL ;
+	}
+}
+
+/*
+** only call this on FS unmount.
+**
+** Detaches each of the JOURNAL_NUM_BITMAPS list bitmaps from its journal
+** list, releases its nodes and vfrees its slot array.  Always returns 0.
+*/
+static int free_list_bitmaps(struct super_block *p_s_sb,
+			     struct reiserfs_list_bitmap *jb_array) {
+	struct reiserfs_list_bitmap *cur = jb_array ;
+	struct reiserfs_list_bitmap *end = jb_array + JOURNAL_NUM_BITMAPS ;
+
+	for ( ; cur < end ; cur++) {
+		cur->journal_list = NULL ;
+		cleanup_bitmap_list(p_s_sb, cur) ;
+		vfree(cur->bitmaps) ;
+		cur->bitmaps = NULL ;
+	}
+	return 0;
+}
+
+/* Release every node cached on the journal's free list.  Always returns 0. */
+static int free_bitmap_nodes(struct super_block *p_s_sb) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	struct list_head *head = &journal->j_bitmap_nodes ;
+	struct reiserfs_bitmap_node *node ;
+
+	while (!list_empty(head)) {
+		node = list_entry(head->next, struct reiserfs_bitmap_node, list) ;
+		list_del(&node->list) ;
+		reiserfs_kfree(node->data, p_s_sb->s_blocksize, p_s_sb) ;
+		reiserfs_kfree(node, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
+		journal->j_free_bitmap_nodes-- ;
+	}
+
+	return 0 ;
+}
+
+/*
+** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
+** jb_array is the array to be filled in.
+**
+** Each list bitmap gets a zeroed array of bmap_nr node pointers.  If any
+** allocation fails, everything is released through free_list_bitmaps()
+** and -1 is returned; otherwise 0.
+*/
+int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb,
+				   struct reiserfs_list_bitmap *jb_array,
+				   int bmap_nr) {
+	int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *) ;
+	struct reiserfs_list_bitmap *jb ;
+	int n ;
+
+	for (n = 0 ; n < JOURNAL_NUM_BITMAPS ; n++) {
+		jb = &jb_array[n] ;
+		jb->journal_list = NULL ;
+		jb->bitmaps = vmalloc( mem ) ;
+		if (!jb->bitmaps) {
+			reiserfs_warning(p_s_sb, "clm-2000, unable to allocate bitmaps for journal lists") ;
+			free_list_bitmaps(p_s_sb, jb_array) ;
+			return -1 ;
+		}
+		memset(jb->bitmaps, 0, mem) ;
+	}
+	return 0 ;
+}
+
+/*
+** find an available list bitmap. If you can't find one, flush a commit list
+** and try again
+**
+** Candidates are tried round-robin starting at j_list_bitmap_index, for at
+** most three passes over the array.  Returns the bitmap now bound to jl,
+** or NULL if the last candidate was still in use after flushing.
+*/
+static struct reiserfs_list_bitmap *
+get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	struct reiserfs_list_bitmap *jb = NULL ;
+	int tries, idx ;
+
+	for (tries = 0 ; tries < (JOURNAL_NUM_BITMAPS * 3) ; tries++) {
+		idx = journal->j_list_bitmap_index ;
+		journal->j_list_bitmap_index = (idx + 1) % JOURNAL_NUM_BITMAPS ;
+		jb = &journal->j_list_bitmap[idx] ;
+
+		/* unused bitmap: take it */
+		if (!jb->journal_list)
+			break ;
+
+		/* in use: flush its owner and see whether that released it */
+		flush_commit_list(p_s_sb, jb->journal_list, 1) ;
+		if (!jb->journal_list)
+			break ;
+	}
+
+	/* double check to make sure if flushed correctly */
+	if (jb->journal_list)
+		return NULL ;
+
+	jb->journal_list = jl ;
+	return jb ;
+}
+
+/*
+** allocates a new chunk of X nodes, and links them all together as a list.
+** Uses the cnode->next and cnode->prev pointers
+** returns NULL on failure
+**
+** A num_cnodes of zero or less is treated as a failure.
+*/
+static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) {
+	struct reiserfs_journal_cnode *nodes ;
+	int n ;
+
+	if (num_cnodes <= 0)
+		return NULL ;
+
+	nodes = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)) ;
+	if (!nodes)
+		return NULL ;
+
+	memset(nodes, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)) ;
+
+	/* chain neighbours; the two ends point at NULL */
+	for (n = 0 ; n < num_cnodes ; n++) {
+		nodes[n].prev = (n > 0) ? nodes + (n - 1) : NULL ;
+		nodes[n].next = (n + 1 < num_cnodes) ? nodes + (n + 1) : NULL ;
+	}
+	return nodes ;
+}
+
+/*
+** pulls a cnode off the free list, or returns NULL on failure
+**
+** The returned cnode is zeroed.  The used/free counters are only touched
+** once a cnode has actually been taken, so an unexpectedly empty free
+** list no longer skews the accounting.
+*/
+static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) {
+	struct reiserfs_journal_cnode *cn ;
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+
+	reiserfs_check_lock_depth(p_s_sb, "get_cnode") ;
+
+	if (journal->j_cnode_free <= 0) {
+		return NULL ;
+	}
+	cn = journal->j_cnode_free_list ;
+	if (!cn) {
+		/* counter says there are free cnodes but the list is empty:
+		** hand out nothing and leave the counters alone */
+		return NULL ;
+	}
+	journal->j_cnode_used++ ;
+	journal->j_cnode_free-- ;
+
+	/* unlink the head of the free list */
+	if (cn->next) {
+		cn->next->prev = NULL ;
+	}
+	journal->j_cnode_free_list = cn->next ;
+	memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ;
+	return cn ;
+}
+
+/*
+** returns a cnode to the free list
+**
+** The cnode becomes the new head of the list; its contents other than
+** the next/prev links are left as they were.
+*/
+static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode *cn) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	struct reiserfs_journal_cnode *old_head ;
+
+	reiserfs_check_lock_depth(p_s_sb, "free_cnode") ;
+
+	journal->j_cnode_used-- ;
+	journal->j_cnode_free++ ;
+
+	old_head = journal->j_cnode_free_list ;
+	cn->next = old_head ;
+	if (old_head)
+		old_head->prev = cn ;
+	cn->prev = NULL ; /* the cnode is not zeroed here, so reset the link explicitly */
+	journal->j_cnode_free_list = cn ;
+}
+
+/* Drop the journal "prepared" and "restore dirty" flags of a buffer. */
+static void clear_prepared_bits(struct buffer_head *bh) {
+	clear_buffer_journal_restore_dirty (bh);
+	clear_buffer_journal_prepared (bh);
+}
+
+/* utility function to force a BUG if it is called without the big
+** kernel lock held. caller is the string printed just before calling BUG()
+**
+** The check is only compiled in on CONFIG_SMP builds; otherwise this is
+** an empty function.
+*/
+void reiserfs_check_lock_depth(struct super_block *sb, char *caller) {
+#ifdef CONFIG_SMP
+	if (current->lock_depth >= 0)
+		return ;
+
+	reiserfs_panic (sb, "%s called without kernel lock held", caller) ;
+#endif
+}
+
+/* return the cnode in table that has the same super block and block
+** number, or null if not found */
+static inline struct reiserfs_journal_cnode *
+get_journal_hash_dev(struct super_block *sb,
+		     struct reiserfs_journal_cnode **table,
+		     long bl)
+{
+	struct reiserfs_journal_cnode *cn ;
+
+	/* walk the chain of the bucket this block hashes to */
+	for (cn = journal_hash(table, sb, bl) ; cn ; cn = cn->hnext) {
+		if (cn->blocknr == bl && cn->sb == sb)
+			return cn ;
+	}
+	return NULL ;
+}
+
+/*
+** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated
+** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever
+** being overwritten by a replay after crashing.
+**
+** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting
+** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make
+** sure you never write the block without logging it.
+**
+** next_zero_bit is a suggestion about the next block to try for find_forward.
+** when bl is rejected because it is set in a journal list bitmap, we search
+** for the next zero bit in the bitmap that rejected bl. Then, we return that
+** through next_zero_bit for find_forward to try.
+**
+** Just because we return something in next_zero_bit does not mean we won't
+** reject it on the next call to reiserfs_in_journal
+**
+*/
+/* Returns 1 if the block at (bmap_nr, bit_nr) must not be reused yet,
+** 0 if it is safe for reuse.  *next_zero_bit is always reset to 0 and is
+** only set to a hint when a journal list bitmap rejected the block.
+*/
+int reiserfs_in_journal(struct super_block *p_s_sb,
+			int bmap_nr, int bit_nr, int search_all,
+			b_blocknr_t *next_zero_bit) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	struct reiserfs_list_bitmap *jb ;
+	unsigned long bl;
+	int i ;
+
+	*next_zero_bit = 0 ; /* always start this at zero. */
+
+	PROC_INFO_INC( p_s_sb, journal.in_journal );
+	/* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
+	** if we crash before the transaction that freed it commits, this transaction won't
+	** have committed either, and the block will never be written
+	*/
+	if (search_all) {
+		for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
+			PROC_INFO_INC( p_s_sb, journal.in_journal_bitmap );
+			jb = journal->j_list_bitmap + i ;
+
+			/* skip bitmaps that are unused or have no node here */
+			if (!jb->journal_list || !jb->bitmaps[bmap_nr])
+				continue ;
+			if (!test_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data))
+				continue ;
+
+			*next_zero_bit = find_next_zero_bit((unsigned long *)
+						(jb->bitmaps[bmap_nr]->data),
+						p_s_sb->s_blocksize << 3, bit_nr+1) ;
+			return 1 ;
+		}
+	}
+
+	bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr;
+
+	/* is it in any old transactions? */
+	if (search_all && get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))
+		return 1;
+
+	/* is it in the current transaction. This should never happen */
+	if (get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl)) {
+		BUG();
+		return 1;
+	}
+
+	PROC_INFO_INC( p_s_sb, journal.in_journal_reusable );
+	/* safe for reuse */
+	return 0 ;
+}
+
+/* insert cn into table, at the head of the chain of its hash bucket
+*/
+static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) {
+	struct reiserfs_journal_cnode **bucket = &journal_hash(table, cn->sb, cn->blocknr) ;
+	struct reiserfs_journal_cnode *old_first = *bucket ;
+
+	cn->hprev = NULL ;
+	cn->hnext = old_first ;
+	if (old_first)
+		old_first->hprev = cn ;
+	*bucket = cn ;
+}
+
+/* lock the current transaction: count the call, then take j_lock */
+inline static void lock_journal(struct super_block *p_s_sb) {
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+
+	PROC_INFO_INC( p_s_sb, journal.lock_journal );
+	down(&journal->j_lock);
+}
+
+/* unlock the current transaction: release j_lock */
+inline static void unlock_journal(struct super_block *p_s_sb) {
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+
+	up(&journal->j_lock);
+}
+
+/* Take one more reference on a journal list. */
+static inline void get_journal_list(struct reiserfs_journal_list *jl)
+{
+	jl->j_refcount += 1;
+}
+
+/*
+ * Drop one reference on a journal list and free it when that was the
+ * last one.  Panics if the reference count was already below 1.
+ */
+static inline void put_journal_list(struct super_block *s,
+				    struct reiserfs_journal_list *jl)
+{
+	if (jl->j_refcount < 1) {
+		reiserfs_panic (s, "trans id %lu, refcount at %d", jl->j_trans_id,
+				jl->j_refcount);
+	}
+
+	jl->j_refcount--;
+	if (jl->j_refcount == 0)
+		reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
+}
+
+/*
+** this used to be much more involved, and I'm keeping it just in case things get ugly again.
+** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
+** transaction.
+**
+** Releases the bitmap nodes of the list bitmap attached to jl and breaks
+** the link between the two in both directions.  A journal list without a
+** list bitmap is left untouched instead of being dereferenced.
+*/
+static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) {
+
+	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap ;
+
+	if (!jb)
+		return ;
+
+	cleanup_bitmap_list(p_s_sb, jb) ;
+	jb->journal_list = NULL ;
+	jl->j_list_bitmap = NULL ;
+}
+
+/*
+ * Returns 1 if the first journal list on j_journal_list has a trans id
+ * no greater than trans_id, 0 otherwise (including when the list of
+ * journal lists is empty).
+ */
+static int journal_list_still_alive(struct super_block *s,
+				    unsigned long trans_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL (s);
+	struct list_head *head = &journal->j_journal_list;
+	struct reiserfs_journal_list *first;
+
+	if (list_empty(head))
+		return 0;
+
+	first = JOURNAL_LIST_ENTRY(head->next);
+	return (first->j_trans_id <= trans_id) ? 1 : 0;
+}
+
+/*
+ * I/O completion handler installed by submit_logged_buffer().  Warns if
+ * the buffer is still marked journaled, records the I/O result in the
+ * uptodate flag, then unlocks the buffer and drops a reference.
+ */
+static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
+	char b[BDEVNAME_SIZE];
+
+	if (buffer_journaled(bh)) {
+		reiserfs_warning(NULL, "clm-2084: pinned buffer %lu:%s sent to disk",
+				 bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
+	}
+
+	if (!uptodate)
+		clear_buffer_uptodate(bh) ;
+	else
+		set_buffer_uptodate(bh) ;
+
+	unlock_buffer(bh) ;
+	put_bh(bh) ;
+}
+
+/*
+ * I/O completion handler for ordered and barrier writes: records the I/O
+ * result in the uptodate flag, unlocks the buffer and drops a reference.
+ */
+static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) {
+	if (!uptodate)
+		clear_buffer_uptodate(bh) ;
+	else
+		set_buffer_uptodate(bh) ;
+
+	unlock_buffer(bh) ;
+	put_bh(bh) ;
+}
+
+/*
+ * Submit a logged buffer for writing.  Takes a reference for the I/O,
+ * clears the journal_new and dirty flags, and BUGs unless the
+ * journal_test flag was set and the buffer is uptodate.
+ */
+static void submit_logged_buffer(struct buffer_head *bh) {
+	int had_test_bit ;
+
+	get_bh(bh) ;
+	bh->b_end_io = reiserfs_end_buffer_io_sync ;
+	clear_buffer_journal_new (bh);
+	clear_buffer_dirty(bh) ;
+
+	had_test_bit = test_clear_buffer_journal_test (bh);
+	if (!had_test_bit)
+		BUG();
+	if (!buffer_uptodate(bh))
+		BUG();
+
+	submit_bh(WRITE, bh) ;
+}
+
+/*
+ * Submit an ordered buffer for writing.  Takes a reference for the I/O,
+ * clears the dirty flag and BUGs if the buffer is not uptodate.
+ */
+static void submit_ordered_buffer(struct buffer_head *bh) {
+	get_bh(bh) ;
+	bh->b_end_io = reiserfs_end_ordered_io;
+	clear_buffer_dirty(bh) ;
+
+	if (buffer_uptodate(bh)) {
+		submit_bh(WRITE, bh) ;
+		return ;
+	}
+	BUG();
+	submit_bh(WRITE, bh) ;
+}
+
+/*
+ * Same preparation as submit_ordered_buffer(), but the buffer is
+ * submitted with WRITE_BARRIER and the result of submit_bh() is returned.
+ */
+static int submit_barrier_buffer(struct buffer_head *bh) {
+	int ret ;
+
+	get_bh(bh) ;
+	bh->b_end_io = reiserfs_end_ordered_io;
+	clear_buffer_dirty(bh) ;
+	if (!buffer_uptodate(bh))
+		BUG();
+
+	ret = submit_bh(WRITE_BARRIER, bh) ;
+	return ret ;
+}
+
+/*
+ * If the buffer is flagged eopnotsupp, clear that flag, turn barriers off
+ * for this filesystem and write the buffer again synchronously.
+ */
+static void check_barrier_completion(struct super_block *s,
+				     struct buffer_head *bh) {
+	if (!buffer_eopnotsupp(bh))
+		return ;
+
+	clear_buffer_eopnotsupp(bh);
+	disable_barrier(s);
+
+	/* mark the buffer valid and dirty again so the sync write sends it */
+	set_buffer_uptodate(bh);
+	set_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+}
+
+#define CHUNK_SIZE 32 /* capacity of one buffer_chunk; add_to_chunk() submits the chunk once nr reaches it */
+struct buffer_chunk { /* small batch of buffer heads collected before being submitted together */
+ struct buffer_head *bh[CHUNK_SIZE]; /* collected buffers; slots 0 .. nr-1 are in use */
+ int nr; /* number of buffers currently held; reset to 0 by write_chunk()/write_ordered_chunk() */
+};
+
+/* Submit every buffer of the chunk as a logged buffer, then empty the chunk. */
+static void write_chunk(struct buffer_chunk *chunk) {
+	int idx = 0;
+
+	while (idx < chunk->nr) {
+		submit_logged_buffer(chunk->bh[idx]) ;
+		idx++;
+	}
+	chunk->nr = 0;
+}
+
+/* Submit every buffer of the chunk as an ordered buffer, then empty the chunk. */
+static void write_ordered_chunk(struct buffer_chunk *chunk) {
+	int idx = 0;
+
+	while (idx < chunk->nr) {
+		submit_ordered_buffer(chunk->bh[idx]) ;
+		idx++;
+	}
+	chunk->nr = 0;
+}
+
+/*
+ * Append bh to the chunk.  When that fills the chunk, fn(chunk) is called
+ * to submit it; if a lock was passed it is released around that call and
+ * taken again afterwards.  BUGs if the chunk is already full on entry.
+ *
+ * Returns 1 if the chunk was submitted, 0 otherwise.
+ */
+static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
+			spinlock_t *lock,
+			void (fn)(struct buffer_chunk *))
+{
+	if (chunk->nr >= CHUNK_SIZE)
+		BUG();
+
+	chunk->bh[chunk->nr++] = bh;
+	if (chunk->nr < CHUNK_SIZE)
+		return 0;
+
+	if (lock)
+		spin_unlock(lock);
+	fn(chunk);
+	if (lock)
+		spin_lock(lock);
+	return 1;
+}
+
+
+static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
+/*
+ * Allocate a reiserfs_jh and count it in nr_reiserfs_jh.  Never fails:
+ * yields and retries until kmalloc succeeds.
+ */
+static struct reiserfs_jh *alloc_jh(void) {
+	struct reiserfs_jh *jh;
+
+	for (;;) {
+		jh = kmalloc(sizeof(*jh), GFP_NOFS);
+		if (jh)
+			break;
+		yield();
+	}
+	atomic_inc(&nr_reiserfs_jh);
+	return jh;
+}
+
+/*
+ * we want to free the jh when the buffer has been written
+ * and waited on
+ *
+ * Detaches the jh stored in bh->b_private (if any), unlinks and frees it,
+ * and drops a buffer reference.  Does nothing when b_private is NULL.
+ */
+void reiserfs_free_jh(struct buffer_head *bh) {
+	struct reiserfs_jh *jh = bh->b_private;
+
+	if (!jh)
+		return;
+
+	bh->b_private = NULL;
+	jh->bh = NULL;
+	list_del_init(&jh->list);
+	kfree(jh);
+
+	/* the global count must still cover the jh we just released */
+	if (atomic_read(&nr_reiserfs_jh) <= 0)
+		BUG();
+	atomic_dec(&nr_reiserfs_jh);
+	put_bh(bh);
+}
+/*
+ * Put bh on one of the buffer lists of the current journal list
+ * (j->j_current_jl): j_tail_bh_list when tail is non-zero, j_bh_list
+ * otherwise.  A jh already attached through bh->b_private is unlinked
+ * from its list and reused; otherwise a buffer reference is taken and a
+ * new jh is allocated and attached.  List changes are made under
+ * j->j_dirty_buffers_lock.  Always returns 0.
+ */
+static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
+ int tail)
+{
+ struct reiserfs_jh *jh;
+
+ if (bh->b_private) {
+ spin_lock(&j->j_dirty_buffers_lock);
+ if (!bh->b_private) { /* re-check under the lock: b_private was cleared since the unlocked test */
+ spin_unlock(&j->j_dirty_buffers_lock);
+ goto no_jh;
+ }
+ jh = bh->b_private;
+ list_del_init(&jh->list); /* unlink from whatever list it is on; re-added below */
+ } else {
+no_jh:
+ get_bh(bh);
+ jh = alloc_jh(); /* called before taking the spinlock; alloc_jh() may yield() */
+ spin_lock(&j->j_dirty_buffers_lock);
+ /* buffer must be locked for __add_jh, should be able to have
+ * two adds at the same time
+ * NOTE(review): the BUG() below fires if b_private got set in the
+ * meantime, so "should not be able" was presumably meant - confirm
+ */
+ if (bh->b_private)
+ BUG();
+ jh->bh = bh;
+ bh->b_private = jh;
+ }
+ jh->jl = j->j_current_jl;
+ if (tail)
+ list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
+ else {
+ list_add_tail(&jh->list, &jh->jl->j_bh_list);
+ }
+ spin_unlock(&j->j_dirty_buffers_lock);
+ return 0;
+}
+
+/* Queue bh on the tail list of the current journal list of inode's filesystem. */
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) {
+	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
+
+	return __add_jh(journal, bh, 1);
+}
+/* Queue bh on the ordered list of the current journal list of inode's filesystem. */
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) {
+	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
+
+	return __add_jh(journal, bh, 0);
+}
+/*
+ * write_ordered_buffers: write out and wait for every buffer queued on
+ * `list'.
+ *
+ * Phase 1 walks `list' under `lock'.  Dirty buffers are moved to a
+ * private list and collected in a chunk, which is submitted whenever it
+ * fills up and once more after the walk.  Clean buffers have their jh
+ * freed.  Phase 2 walks the private list from its tail, frees each jh,
+ * waits for buffers that are still locked and records -EIO for any
+ * buffer that is not uptodate afterwards.
+ *
+ * `lock' is dropped and retaken around submission and waiting.  The `jl'
+ * argument is not used in the body.
+ *
+ * Returns j->j_errno as sampled on entry, or -EIO if a buffer failed.
+ */
+#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) /* reiserfs_jh that embeds list head l */
+static int write_ordered_buffers(spinlock_t *lock,
+ struct reiserfs_journal *j,
+ struct reiserfs_journal_list *jl,
+ struct list_head *list)
+{
+ struct buffer_head *bh;
+ struct reiserfs_jh *jh;
+ int ret = j->j_errno;
+ struct buffer_chunk chunk;
+ struct list_head tmp; /* buffers that still have to be waited on in phase 2 */
+ INIT_LIST_HEAD(&tmp);
+
+ chunk.nr = 0;
+ spin_lock(lock);
+ while(!list_empty(list)) {
+ jh = JH_ENTRY(list->next);
+ bh = jh->bh;
+ get_bh(bh); /* hold the buffer across the lock drops below; released at loop_next */
+ if (test_set_buffer_locked(bh)) { /* presumably non-zero means the buffer lock was already held - confirm */
+ if (!buffer_dirty(bh)) {
+ list_del_init(&jh->list);
+ list_add(&jh->list, &tmp);
+ goto loop_next;
+ }
+ spin_unlock(lock);
+ if (chunk.nr)
+ write_ordered_chunk(&chunk); /* submit what we hold before sleeping on this buffer */
+ wait_on_buffer(bh);
+ cond_resched();
+ spin_lock(lock);
+ goto loop_next; /* jh was not moved, so the next iteration looks at the list head again */
+ }
+ if (buffer_dirty(bh)) {
+ list_del_init(&jh->list);
+ list_add(&jh->list, &tmp);
+ add_to_chunk(&chunk, bh, lock, write_ordered_chunk); /* submits, dropping lock, once the chunk is full */
+ } else {
+ reiserfs_free_jh(bh);
+ unlock_buffer(bh);
+ }
+loop_next:
+ put_bh(bh);
+ cond_resched_lock(lock);
+ }
+ if (chunk.nr) { /* submit the partially filled last chunk */
+ spin_unlock(lock);
+ write_ordered_chunk(&chunk);
+ spin_lock(lock);
+ }
+ while(!list_empty(&tmp)) {
+ jh = JH_ENTRY(tmp.prev);
+ bh = jh->bh;
+ get_bh(bh);
+ reiserfs_free_jh(bh); /* unlinks jh from tmp, so the loop makes progress */
+
+ if (buffer_locked(bh)) {
+ spin_unlock(lock);
+ wait_on_buffer(bh);
+ spin_lock(lock);
+ }
+ if (!buffer_uptodate(bh)) {
+ ret = -EIO;
+ }
+ put_bh(bh);
+ cond_resched_lock(lock);
+ }
+ spin_unlock(lock);
+ return ret;
+}
+/*
+ * Flush the commit blocks of journal lists that come before jl on
+ * j_journal_list and still have commit work outstanding.
+ *
+ * Returns 1 if jl itself was found to be gone after one of the flushes
+ * (journal_list_still_alive() failed for its trans id), so the caller
+ * must not touch it any more.  Returns 0 otherwise.
+ */
+static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
+ struct reiserfs_journal *journal = SB_JOURNAL (s);
+ struct reiserfs_journal_list *other_jl;
+ struct reiserfs_journal_list *first_jl;
+ struct list_head *entry;
+ unsigned long trans_id = jl->j_trans_id; /* saved up front; compared after flushes instead of reading jl again */
+ unsigned long other_trans_id;
+ unsigned long first_trans_id; /* assigned below but never read in this function */
+
+find_first:
+ /*
+ * first we walk backwards to find the oldest uncommitted transaction
+ */
+ first_jl = jl;
+ entry = jl->j_list.prev;
+ while(1) {
+ other_jl = JOURNAL_LIST_ENTRY(entry);
+ if (entry == &journal->j_journal_list ||
+ atomic_read(&other_jl->j_older_commits_done)) /* stop at the list head or at a list whose older commits are done */
+ break;
+
+ first_jl = other_jl;
+ entry = other_jl->j_list.prev;
+ }
+
+ /* if we didn't find any older uncommitted transactions, return now */
+ if (first_jl == jl) {
+ return 0;
+ }
+
+ first_trans_id = first_jl->j_trans_id;
+
+ entry = &first_jl->j_list;
+ while(1) { /* walk forward from first_jl towards jl */
+ other_jl = JOURNAL_LIST_ENTRY(entry);
+ other_trans_id = other_jl->j_trans_id;
+
+ if (other_trans_id < trans_id) {
+ if (atomic_read(&other_jl->j_commit_left) != 0) {
+ flush_commit_list(s, other_jl, 0);
+
+ /* list we were called with is gone, return */
+ if (!journal_list_still_alive(s, trans_id))
+ return 1;
+
+ /* the one we just flushed is gone, this means all
+ * older lists are also gone, so first_jl is no longer
+ * valid either. Go back to the beginning.
+ */
+ if (!journal_list_still_alive(s, other_trans_id)) {
+ goto find_first;
+ }
+ }
+ entry = entry->next;
+ if (entry == &journal->j_journal_list)
+ return 0;
+ } else {
+ return 0; /* reached a list that is not older than jl */
+ }
+ }
+ return 0;
+}
+/*
+** If j_async_throttle is non-zero (it is raised while flush_commit_list()
+** submits log blocks), sleep in blk_congestion_wait() for at most HZ/10.
+** Always returns 0.
+*/
+int reiserfs_async_progress_wait(struct super_block *s) {
+	struct reiserfs_journal *j = SB_JOURNAL(s);
+
+	if (atomic_read(&j->j_async_throttle))
+		blk_congestion_wait(WRITE, HZ/10);
+	return 0;
+}
+
+/*
+** if this journal list still has commit blocks unflushed, send them to disk.
+**
+** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
+** Before the commit block can be written, every other log block must be safely on disk
+**
+** flushall: when non-zero, flush_older_commits() is run first and
+** j_older_commits_done is set on jl once this list has been dealt with.
+**
+** returns 0, or a negative error (-EIO or journal->j_errno).  A non-zero
+** result is also passed to reiserfs_abort() before returning.
+*/
+static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
+ int i;
+ int bn ;
+ struct buffer_head *tbh = NULL ;
+ unsigned long trans_id = jl->j_trans_id;
+ struct reiserfs_journal *journal = SB_JOURNAL (s);
+ int barrier = 0;
+ int retval = 0;
+
+ reiserfs_check_lock_depth(s, "flush_commit_list") ;
+
+ if (atomic_read(&jl->j_older_commits_done)) {
+ return 0 ;
+ }
+
+ /* before we can put our commit blocks on disk, we have to make sure everyone older than
+ ** us is on disk too
+ */
+ BUG_ON (jl->j_len <= 0);
+ BUG_ON (trans_id == journal->j_trans_id);
+
+ get_journal_list(jl); /* paired with put_journal_list() at put_jl */
+ if (flushall) {
+ if (flush_older_commits(s, jl) == 1) {
+ /* list disappeared during flush_older_commits. return */
+ goto put_jl;
+ }
+ }
+
+ /* make sure nobody is trying to flush this one at the same time */
+ down(&jl->j_commit_lock);
+ if (!journal_list_still_alive(s, trans_id)) {
+ up(&jl->j_commit_lock);
+ goto put_jl;
+ }
+ BUG_ON (jl->j_trans_id == 0);
+
+ /* this commit is done, exit */
+ if (atomic_read(&(jl->j_commit_left)) <= 0) {
+ if (flushall) {
+ atomic_set(&(jl->j_older_commits_done), 1) ;
+ }
+ up(&jl->j_commit_lock);
+ goto put_jl;
+ }
+
+ if (!list_empty(&jl->j_bh_list)) {
+ unlock_kernel();
+ write_ordered_buffers(&journal->j_dirty_buffers_lock,
+ journal, jl, &jl->j_bh_list); /* NOTE(review): the return value is discarded here */
+ lock_kernel();
+ }
+ BUG_ON (!list_empty(&jl->j_bh_list));
+ /*
+ * for the description block and all the log blocks, submit any buffers
+ * that haven't already reached the disk
+ */
+ atomic_inc(&journal->j_async_throttle);
+ for (i = 0 ; i < (jl->j_len + 1) ; i++) {
+ bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
+ SB_ONDISK_JOURNAL_SIZE(s);
+ tbh = journal_find_get_block(s, bn) ; /* NOTE(review): result is used without a NULL check */
+ if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */
+ ll_rw_block(WRITE, 1, &tbh) ;
+ put_bh(tbh) ;
+ }
+ atomic_dec(&journal->j_async_throttle);
+
+ /* wait on everything written so far before writing the commit
+ * if we are in barrier mode, send the commit down now
+ */
+ barrier = reiserfs_barrier_flush(s);
+ if (barrier) {
+ int ret;
+ lock_buffer(jl->j_commit_bh);
+ ret = submit_barrier_buffer(jl->j_commit_bh);
+ if (ret == -EOPNOTSUPP) { /* barriers not supported: fall back to the plain sync write below */
+ set_buffer_uptodate(jl->j_commit_bh);
+ disable_barrier(s);
+ barrier = 0;
+ }
+ }
+ for (i = 0 ; i < (jl->j_len + 1) ; i++) {
+ bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
+ (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
+ tbh = journal_find_get_block(s, bn) ;
+ wait_on_buffer(tbh) ;
+ // since we're using ll_rw_blk above, it might have skipped over
+ // a locked buffer. Double check here
+ //
+ if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */
+ sync_dirty_buffer(tbh);
+ if (unlikely (!buffer_uptodate(tbh))) {
+#ifdef CONFIG_REISERFS_CHECK
+ reiserfs_warning(s, "journal-601, buffer write failed") ;
+#endif
+ retval = -EIO;
+ }
+ put_bh(tbh) ; /* once for journal_find_get_block */
+ put_bh(tbh) ; /* once due to original getblk in do_journal_end */
+ atomic_dec(&(jl->j_commit_left)) ;
+ }
+
+ BUG_ON (atomic_read(&(jl->j_commit_left)) != 1); /* the last count is dropped further down, after the commit block is handled */
+
+ if (!barrier) {
+ if (buffer_dirty(jl->j_commit_bh))
+ BUG();
+ mark_buffer_dirty(jl->j_commit_bh) ;
+ sync_dirty_buffer(jl->j_commit_bh) ;
+ } else
+ wait_on_buffer(jl->j_commit_bh);
+
+ check_barrier_completion(s, jl->j_commit_bh);
+
+ /* If there was a write error in the journal - we can't commit this
+ * transaction - it will be invalid and, if successful, will just end
+ * up propagating the write error out to the filesystem. */
+ if (unlikely (!buffer_uptodate(jl->j_commit_bh))) {
+#ifdef CONFIG_REISERFS_CHECK
+ reiserfs_warning(s, "journal-615: buffer write failed") ;
+#endif
+ retval = -EIO;
+ }
+ bforget(jl->j_commit_bh) ;
+ if (journal->j_last_commit_id != 0 &&
+ (jl->j_trans_id - journal->j_last_commit_id) != 1) { /* commits are expected with consecutive transaction ids */
+ reiserfs_warning(s, "clm-2200: last commit %lu, current %lu",
+ journal->j_last_commit_id,
+ jl->j_trans_id);
+ }
+ journal->j_last_commit_id = jl->j_trans_id;
+
+ /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */
+ cleanup_freed_for_journal_list(s, jl) ;
+
+ retval = retval ? retval : journal->j_errno; /* a local write error wins over an earlier journal error */
+
+ /* mark the metadata dirty */
+ if (!retval)
+ dirty_one_transaction(s, jl);
+ atomic_dec(&(jl->j_commit_left)) ;
+
+ if (flushall) {
+ atomic_set(&(jl->j_older_commits_done), 1) ;
+ }
+ up(&jl->j_commit_lock);
+put_jl:
+ put_journal_list(s, jl);
+
+ if (retval)
+ reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
+ return retval;
+}
+
+/*
+** Follow the hprev links from cn and return the journal list of the first
+** cnode found for the same super block and block number that still has a
+** jlist attached.  Returns NULL when the chain holds no such cnode.
+** flush_journal_list uses this to find a newer transaction for a block.
+*/
+static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) {
+	struct super_block *sb = cn->sb;
+	b_blocknr_t blocknr = cn->blocknr ;
+	struct reiserfs_journal_cnode *scan;
+
+	for (scan = cn->hprev; scan != NULL; scan = scan->hprev) {
+		if (scan->sb != sb || scan->blocknr != blocknr)
+			continue;
+		if (scan->jlist)
+			return scan->jlist;
+	}
+	return NULL;
+}
+
+static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **,
+struct reiserfs_journal_list *, unsigned long, int);
+
+/*
+** Release every cnode hanging off jl->j_realblock.  A cnode that still has a
+** block number is first cleared and taken out of the journal hash table
+** (including entries marked freed); then the cnode is handed to free_cnode().
+** With debug set, each such cnode is reported through reiserfs_warning().
+** Meant to be called once all the real blocks have been flushed.
+*/
+static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, int debug) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	struct reiserfs_journal_cnode *cur = jl->j_realblock;
+	struct reiserfs_journal_cnode *following;
+
+	/* the hash is updated one cnode at a time via remove_journal_hash */
+	for (; cur != NULL; cur = following) {
+		if (cur->blocknr != 0) {
+			if (debug)
+				reiserfs_warning (p_s_sb, "block %u, bh is %d, state %ld", cur->blocknr,
+						  cur->bh ? 1: 0, cur->state) ;
+			cur->state = 0;
+			remove_journal_hash(p_s_sb, journal->j_list_hash_table, jl, cur->blocknr, 1);
+		}
+		/* grab the successor before the cnode is given back */
+		following = cur->next;
+		free_cnode(p_s_sb, cur);
+	}
+	jl->j_realblock = NULL;
+}
+
+/*
+** Record (offset, trans_id) in the journal header block and write it out,
+** unless trans_id is older than the last one written.
+**
+** called by flush_journal_list, before it calls remove_all_from_journal_list
+**
+** returns 0 on success (or when nothing had to be written), -EIO when the
+** journal is aborted or the header buffer is not uptodate after I/O.
+*/
+static int _update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) {
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	struct reiserfs_journal_header *hdr;
+	int need_sync = 1;
+
+	if (reiserfs_is_journal_aborted (journal))
+		return -EIO;
+
+	/* nothing to do for a transaction older than the recorded one */
+	if (trans_id < journal->j_last_flush_trans_id)
+		return 0;
+
+	/* let a header write that is still in flight finish first */
+	if (buffer_locked((journal->j_header_bh))) {
+		wait_on_buffer((journal->j_header_bh));
+		if (unlikely (!buffer_uptodate(journal->j_header_bh))) {
+#ifdef CONFIG_REISERFS_CHECK
+			reiserfs_warning (p_s_sb, "journal-699: buffer write failed") ;
+#endif
+			return -EIO;
+		}
+	}
+
+	/* update the in-memory copy, then the on-disk image */
+	journal->j_last_flush_trans_id = trans_id;
+	journal->j_first_unflushed_offset = offset;
+	hdr = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
+	hdr->j_last_flush_trans_id = cpu_to_le32(trans_id);
+	hdr->j_first_unflushed_offset = cpu_to_le32(offset);
+	hdr->j_mount_id = cpu_to_le32(journal->j_mount_id);
+
+	if (reiserfs_barrier_flush(p_s_sb)) {
+		int ret;
+
+		lock_buffer(journal->j_header_bh);
+		ret = submit_barrier_buffer(journal->j_header_bh);
+		if (ret == -EOPNOTSUPP) {
+			/* no barrier support: switch it off and use the plain write */
+			set_buffer_uptodate(journal->j_header_bh);
+			disable_barrier(p_s_sb);
+		} else {
+			wait_on_buffer(journal->j_header_bh);
+			check_barrier_completion(p_s_sb, journal->j_header_bh);
+			need_sync = 0;
+		}
+	}
+	if (need_sync) {
+		set_buffer_dirty(journal->j_header_bh);
+		sync_dirty_buffer(journal->j_header_bh);
+	}
+
+	if (!buffer_uptodate(journal->j_header_bh)) {
+		reiserfs_warning (p_s_sb, "journal-837: IO error during journal replay");
+		return -EIO;
+	}
+	return 0;
+}
+
+static int update_journal_header_block(struct super_block *p_s_sb,
+ unsigned long offset,
+ unsigned long trans_id) { /* plain pass-through: returns whatever _update_journal_header_block() returns */
+ return _update_journal_header_block(p_s_sb, offset, trans_id);
+}
+/*
+** Flush, oldest first, every journal list whose transaction id is lower
+** than jl's.  Each pass looks at the front of j_journal_list again because
+** flush_journal_list() removes the list it was given.
+** can only be called from flush_journal_list.  Always returns 0.
+*/
+static int flush_older_journal_lists(struct super_block *p_s_sb,
+				     struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+	unsigned long trans_id = jl->j_trans_id;
+	struct reiserfs_journal_list *oldest;
+	struct list_head *front;
+
+	/* we know we are the only ones flushing things, no extra race
+	 * protection is required.
+	 */
+	for (;;) {
+		front = journal->j_journal_list.next;
+		/* list is empty: nothing older is left */
+		if (front == &journal->j_journal_list)
+			break;
+
+		oldest = JOURNAL_LIST_ENTRY(front);
+		if (oldest->j_trans_id >= trans_id)
+			break;
+
+		BUG_ON (oldest->j_refcount <= 0);
+		/* flushall == 0: do not recurse into older lists again */
+		flush_journal_list(p_s_sb, oldest, 0);
+	}
+	return 0;
+}
+
+/*
+ * Take jl off the working list, if it is on one, and keep the journal's
+ * j_num_work_lists counter in step.
+ */
+static void del_from_work_list(struct super_block *s,
+			       struct reiserfs_journal_list *jl) {
+	struct reiserfs_journal *journal = SB_JOURNAL (s);
+
+	if (list_empty(&jl->j_working_list))
+		return;
+
+	list_del_init(&jl->j_working_list);
+	journal->j_num_work_lists--;
+}
+
+/* flush a journal list, both commit and real blocks
+**
+** always set flushall to 1, unless you are calling from inside
+** flush_journal_list
+**
+** IMPORTANT. This can only be called while there are no journal writers,
+** and the journal is locked. That means it can only be called from
+** do_journal_end, or by journal_release
+**
+** With flushall set, j_flush_sem is taken here and released before
+** returning, older journal lists are flushed too, and the journal header
+** block is updated.  The list is unlinked from j_journal_list and its
+** reference dropped in every case.
+**
+** returns journal->j_errno as read after the flush, or, when that is 0 and
+** flushall is set, the result of update_journal_header_block().
+*/
+static int flush_journal_list(struct super_block *s,
+ struct reiserfs_journal_list *jl, int flushall) {
+ struct reiserfs_journal_list *pjl ;
+ struct reiserfs_journal_cnode *cn, *last ;
+ int count ;
+ int was_jwait = 0 ;
+ int was_dirty = 0 ;
+ struct buffer_head *saved_bh ;
+ unsigned long j_len_saved = jl->j_len ;
+ struct reiserfs_journal *journal = SB_JOURNAL (s);
+ int err = 0;
+
+ BUG_ON (j_len_saved <= 0);
+
+ if (atomic_read(&journal->j_wcount) != 0) {
+ reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d",
+ atomic_read(&journal->j_wcount)) ;
+ }
+ BUG_ON (jl->j_trans_id == 0);
+
+ /* if flushall == 0, the lock is already held */
+ if (flushall) {
+ down(&journal->j_flush_sem);
+ } else if (!down_trylock(&journal->j_flush_sem)) { /* the trylock succeeded, so the caller did not hold the semaphore */
+ BUG();
+ }
+
+ count = 0 ;
+ if (j_len_saved > journal->j_trans_max) {
+ reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id);
+ return 0 ; /* NOTE(review): reached with j_flush_sem still held; presumably reiserfs_panic() does not return */
+ }
+
+ /* if all the work is already done, get out of here */
+ if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
+ atomic_read(&(jl->j_commit_left)) <= 0) {
+ goto flush_older_and_return ;
+ }
+
+ /* start by putting the commit list on disk. This will also flush
+ ** the commit lists of any older transactions
+ */
+ flush_commit_list(s, jl, 1) ; /* return value not checked; errors are picked up from journal->j_errno below */
+
+ if (!(jl->j_state & LIST_DIRTY) && !reiserfs_is_journal_aborted (journal))
+ BUG();
+
+ /* are we done now? */
+ if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
+ atomic_read(&(jl->j_commit_left)) <= 0) {
+ goto flush_older_and_return ;
+ }
+
+ /* loop through each cnode, see if we need to write it,
+ ** or wait on a more recent transaction, or just ignore it
+ */
+ if (atomic_read(&(journal->j_wcount)) != 0) {
+ reiserfs_panic(s, "journal-844: panic journal list is flushing, wcount is not 0\n") ;
+ }
+ cn = jl->j_realblock ;
+ while(cn) {
+ was_jwait = 0 ;
+ was_dirty = 0 ;
+ saved_bh = NULL ;
+ /* blocknr of 0 is no longer in the hash, ignore it */
+ if (cn->blocknr == 0) {
+ goto free_cnode ;
+ }
+
+ /* This transaction failed commit. Don't write out to the disk */
+ if (!(jl->j_state & LIST_DIRTY))
+ goto free_cnode;
+
+ pjl = find_newer_jl_for_cn(cn) ;
+ /* the order is important here. We check pjl to make sure we
+ ** don't clear BH_JDirty_wait if we aren't the one writing this
+ ** block to disk
+ */
+ if (!pjl && cn->bh) {
+ saved_bh = cn->bh ;
+
+ /* we do this to make sure nobody releases the buffer while
+ ** we are working with it
+ */
+ get_bh(saved_bh) ;
+
+ if (buffer_journal_dirty(saved_bh)) {
+ BUG_ON (!can_dirty (cn));
+ was_jwait = 1 ;
+ was_dirty = 1 ;
+ } else if (can_dirty(cn)) {
+ /* everything with !pjl && jwait should be writable */
+ BUG();
+ }
+ }
+
+ /* if someone has this block in a newer transaction, just make
+ ** sure they are committed, and don't try writing it to disk
+ */
+ if (pjl) {
+ if (atomic_read(&pjl->j_commit_left))
+ flush_commit_list(s, pjl, 1) ;
+ goto free_cnode ;
+ }
+
+ /* bh == NULL when the block got to disk on its own, OR,
+ ** the block got freed in a future transaction
+ */
+ if (saved_bh == NULL) {
+ goto free_cnode ;
+ }
+
+ /* this should never happen. kupdate_one_transaction has this list
+ ** locked while it works, so we should never see a buffer here that
+ ** is not marked JDirty_wait
+ */
+ if ((!was_jwait) && !buffer_locked(saved_bh)) {
+ reiserfs_warning (s, "journal-813: BAD! buffer %llu %cdirty %cjwait, "
+ "not in a newer tranasction",
+ (unsigned long long)saved_bh->b_blocknr,
+ was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ;
+ }
+ if (was_dirty) {
+ /* we inc again because saved_bh gets decremented at free_cnode */
+ get_bh(saved_bh) ;
+ set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; /* marks the cnode for the wait loop after this one */
+ lock_buffer(saved_bh);
+ BUG_ON (cn->blocknr != saved_bh->b_blocknr);
+ if (buffer_dirty(saved_bh))
+ submit_logged_buffer(saved_bh) ;
+ else
+ unlock_buffer(saved_bh);
+ count++ ;
+ } else {
+ reiserfs_warning (s, "clm-2082: Unable to flush buffer %llu in %s",
+ (unsigned long long)saved_bh->b_blocknr, __FUNCTION__);
+ }
+free_cnode: /* only advances to the next cnode; the cnodes themselves are freed later by remove_all_from_journal_list() */
+ last = cn ;
+ cn = cn->next ;
+ if (saved_bh) {
+ /* we incremented this to keep others from taking the buffer head away */
+ put_bh(saved_bh) ;
+ if (atomic_read(&(saved_bh->b_count)) < 0) {
+ reiserfs_warning (s, "journal-945: saved_bh->b_count < 0");
+ }
+ }
+ }
+ if (count > 0) { /* second pass: wait for everything submitted above */
+ cn = jl->j_realblock ;
+ while(cn) {
+ if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
+ if (!cn->bh) {
+ reiserfs_panic(s, "journal-1011: cn->bh is NULL\n") ;
+ }
+ wait_on_buffer(cn->bh) ;
+ if (!cn->bh) {
+ reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ;
+ }
+ if (unlikely (!buffer_uptodate(cn->bh))) {
+#ifdef CONFIG_REISERFS_CHECK
+ reiserfs_warning(s, "journal-949: buffer write failed\n") ;
+#endif
+ err = -EIO;
+ }
+ /* note, we must clear the JDirty_wait bit after the up to date
+ ** check, otherwise we race against our flushpage routine
+ */
+ BUG_ON (!test_clear_buffer_journal_dirty (cn->bh));
+
+ /* undo the inc from journal_mark_dirty */
+ put_bh(cn->bh) ;
+ brelse(cn->bh) ; /* drops the extra reference taken in the was_dirty branch above */
+ }
+ cn = cn->next ;
+ }
+ }
+
+ if (err)
+ reiserfs_abort (s, -EIO, "Write error while pushing transaction to disk in %s", __FUNCTION__);
+flush_older_and_return:
+
+
+ /* before we can update the journal header block, we _must_ flush all
+ ** real blocks from all older transactions to disk. This is because
+ ** once the header block is updated, this transaction will not be
+ ** replayed after a crash
+ */
+ if (flushall) {
+ flush_older_journal_lists(s, jl);
+ }
+
+ err = journal->j_errno; /* overwrites the local -EIO above; NOTE(review): presumably reiserfs_abort() has recorded the error in j_errno */
+ /* before we can remove everything from the hash tables for this
+ ** transaction, we must make sure it can never be replayed
+ **
+ ** since we are only called from do_journal_end, we know for sure there
+ ** are no allocations going on while we are flushing journal lists. So,
+ ** we only need to update the journal header block for the last list
+ ** being flushed
+ */
+ if (!err && flushall) {
+ err = update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
+ if (err)
+ reiserfs_abort (s, -EIO, "Write error while updating journal header in %s", __FUNCTION__);
+ }
+ remove_all_from_journal_list(s, jl, 0) ;
+ list_del_init(&jl->j_list);
+ journal->j_num_lists--;
+ del_from_work_list(s, jl);
+
+ if (journal->j_last_flush_id != 0 &&
+ (jl->j_trans_id - journal->j_last_flush_id) != 1) { /* lists are expected to be flushed with consecutive transaction ids */
+ reiserfs_warning(s, "clm-2201: last flush %lu, current %lu",
+ journal->j_last_flush_id,
+ jl->j_trans_id);
+ }
+ journal->j_last_flush_id = jl->j_trans_id;
+
+ /* not strictly required since we are freeing the list, but it should
+ * help find code using dead lists later on
+ */
+ jl->j_len = 0 ;
+ atomic_set(&(jl->j_nonzerolen), 0) ;
+ jl->j_start = 0 ;
+ jl->j_realblock = NULL ;
+ jl->j_commit_bh = NULL ;
+ jl->j_trans_id = 0 ;
+ jl->j_state = 0;
+ put_journal_list(s, jl);
+ if (flushall)
+ up(&journal->j_flush_sem);
+ return err ;
+}
+
+/*
+ * Queue the dirty, still-writable buffers of one journal list onto chunk.
+ *
+ * The list is marked LIST_TOUCHED and taken off the working list first.
+ * Returns the number of buffers handed to add_to_chunk(); 0 when the list
+ * is empty or has no buffers left.
+ */
+static int write_one_transaction(struct super_block *s,
+				 struct reiserfs_journal_list *jl,
+				 struct buffer_chunk *chunk)
+{
+	struct reiserfs_journal_cnode *cn;
+	struct buffer_head *pinned;
+	int queued = 0;
+
+	jl->j_state |= LIST_TOUCHED;
+	del_from_work_list(s, jl);
+	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0)
+		return 0;
+
+	cn = jl->j_realblock;
+	while (cn) {
+		/* a blocknr of 0 means the cnode has been cleared from the
+		 * hash; such cnodes are skipped
+		 */
+		if (cn->blocknr != 0 && cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
+			/* we can race against journal_mark_freed when we try
+			 * to lock the buffer, so pin it first and look at
+			 * everything again once the lock is ours
+			 */
+			pinned = cn->bh;
+			get_bh(pinned);
+			lock_buffer(pinned);
+			if (!cn->bh || !can_dirty(cn) || !buffer_dirty(pinned)) {
+				/* lost the race; cn->bh might be null now */
+				unlock_buffer(pinned);
+			} else {
+				if (!buffer_journal_dirty(pinned) ||
+				    buffer_journal_prepared(pinned))
+					BUG();
+				add_to_chunk(chunk, pinned, NULL, write_chunk);
+				queued++;
+			}
+			put_bh(pinned);
+		}
+		cn = cn->next;
+		cond_resched();
+	}
+	return queued;
+}
+
+/*
+ * used by flush_commit_list
+ *
+ * Marks jl LIST_DIRTY and dirties every logged buffer for which this list
+ * is the most recent transaction.  Always returns 0.
+ */
+static int dirty_one_transaction(struct super_block *s,
+				 struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal_cnode *cn;
+
+	jl->j_state |= LIST_DIRTY;
+	for (cn = jl->j_realblock; cn; cn = cn->next) {
+		/* only the most recent transaction holding a buffer is
+		 * allowed to send that buffer to disk
+		 */
+		if (find_newer_jl_for_cn(cn))
+			continue;
+		if (!cn->blocknr || !cn->bh || !buffer_journal_dirty(cn->bh))
+			continue;
+
+		BUG_ON (!can_dirty(cn));
+		clear_buffer_journal_new (cn->bh);
+		if (buffer_journal_prepared (cn->bh)) {
+			/* a prepared buffer will either be logged or
+			 * restored; if restored it must end up dirty
+			 */
+			set_buffer_journal_restore_dirty (cn->bh);
+		} else {
+			set_buffer_journal_test (cn->bh);
+			mark_buffer_dirty(cn->bh);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Starting at jl, write out the dirty buffers of consecutive journal lists
+ * while holding j_flush_sem.
+ *
+ * The walk is limited by num_trans lists when num_trans is non-zero, and
+ * by num_blocks queued buffers otherwise.  next_jl and next_trans_id are
+ * not used by the body.
+ *
+ * Returns the result of the last write_one_transaction() call, or 0 when
+ * none was made.
+ */
+static int kupdate_transactions(struct super_block *s,
+				struct reiserfs_journal_list *jl,
+				struct reiserfs_journal_list **next_jl,
+				unsigned long *next_trans_id,
+				int num_blocks,
+				int num_trans) {
+	struct reiserfs_journal *journal = SB_JOURNAL (s);
+	unsigned long start_id = jl->j_trans_id;
+	struct buffer_chunk chunk;
+	struct list_head *following;
+	int lists_done = 0;
+	int blocks_done = 0;
+	int ret = 0;
+
+	chunk.nr = 0;
+
+	down(&journal->j_flush_sem);
+	if (!journal_list_still_alive(s, start_id))
+		goto done;
+
+	/* we've got j_flush_sem held, nobody is going to delete any
+	 * of these lists out from underneath us
+	 */
+	for (;;) {
+		/* budget used up? */
+		if (num_trans ? lists_done >= num_trans : blocks_done >= num_blocks)
+			break;
+
+		/* stop at a list that is empty, already visited, not yet
+		 * committed or not dirty
+		 */
+		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
+		    atomic_read(&jl->j_commit_left) || !(jl->j_state & LIST_DIRTY)) {
+			del_from_work_list(s, jl);
+			break;
+		}
+
+		ret = write_one_transaction(s, jl, &chunk);
+		if (ret < 0)
+			goto done;
+		lists_done++;
+		blocks_done += ret;
+
+		/* stop when the end of the journal list is reached */
+		following = jl->j_list.next;
+		if (following == &journal->j_journal_list)
+			break;
+		jl = JOURNAL_LIST_ENTRY(following);
+
+		/* don't bother with older transactions */
+		if (jl->j_trans_id <= start_id)
+			break;
+	}
+	if (chunk.nr)
+		write_chunk(&chunk);
+
+done:
+	up(&journal->j_flush_sem);
+	return ret;
+}
+
+/* for o_sync and fsync heavy applications, they tend to use
+** all the journal list slots with tiny transactions. These
+** trigger lots and lots of calls to update the header block, which
+** adds seeks and slows things down.
+**
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
+** list updates the header block
+**
+** Starting at jl it collects up to 256 committed lists (or until their
+** buffer counts reach the block limit), writes their buffers through
+** kupdate_transactions() when more than one list was collected, and then
+** flushes the newest collected list with flushall set.  Always returns 0.
+*/
+static int flush_used_journal_lists(struct super_block *s,
+				    struct reiserfs_journal_list *jl) {
+	unsigned long len = 0;
+	unsigned long cur_len;
+	int i;
+	int limit = 256;
+	struct reiserfs_journal_list *tjl;
+	struct reiserfs_journal_list *flush_jl;
+	unsigned long trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL (s);
+
+	flush_jl = tjl = jl;
+
+	/* in data logging mode, try harder to flush a lot of blocks */
+	if (reiserfs_data_log(s))
+		limit = 1024;
+	/* flush for 256 transactions or limit blocks, whichever comes first */
+	for(i = 0 ; i < 256 && len < limit ; i++) {
+		/* stop at a list that is not committed yet or is older than jl */
+		if (atomic_read(&tjl->j_commit_left) ||
+		    tjl->j_trans_id < jl->j_trans_id) {
+			break;
+		}
+		cur_len = atomic_read(&tjl->j_nonzerolen);
+		if (cur_len > 0) {
+			/* let kupdate_transactions() visit this list again */
+			tjl->j_state &= ~LIST_TOUCHED;
+		}
+		len += cur_len;
+		flush_jl = tjl;
+		if (tjl->j_list.next == &journal->j_journal_list)
+			break;
+		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+	}
+	/* try to find a group of blocks we can flush across all the
+	** transactions, but only bother if we've actually spanned
+	** across multiple lists
+	*/
+	if (flush_jl != jl) {
+		/* the result was never looked at; the flush below goes ahead regardless */
+		kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+	}
+	flush_journal_list(s, flush_jl, 1);
+	return 0;
+}
+
+/*
+** Take out of the hash chain for 'block' every cnode that matches block and
+** sb, belongs to jl (any list when jl is NULL), and is either not marked
+** BLOCK_FREED or remove_freed is set.
+** Only the neighbours' hnext/hprev links and the cnode's own
+** blocknr/sb/state/bh/jlist fields are changed; the cnode is not freed here.
+*/
+void remove_journal_hash(struct super_block *sb,
+			 struct reiserfs_journal_cnode **table,
+			 struct reiserfs_journal_list *jl,
+			 unsigned long block, int remove_freed)
+{
+	struct reiserfs_journal_cnode **bucket = &(journal_hash(table, sb, block));
+	struct reiserfs_journal_cnode *node;
+
+	if (!bucket)
+		return;
+
+	for (node = *bucket; node; node = node->hnext) {
+		if (node->blocknr != block || node->sb != sb)
+			continue;
+		if (jl != NULL && jl != node->jlist)
+			continue;
+		if (test_bit(BLOCK_FREED, &node->state) && !remove_freed)
+			continue;
+
+		/* splice the node out; its own hnext stays valid for the walk */
+		if (node->hnext)
+			node->hnext->hprev = node->hprev;
+		if (node->hprev)
+			node->hprev->hnext = node->hnext;
+		else
+			*bucket = node->hnext;
+
+		node->blocknr = 0;
+		node->sb = NULL;
+		node->state = 0;
+		/* anybody who clears the bh will also dec the nonzerolen */
+		if (node->bh && node->jlist)
+			atomic_dec(&(node->jlist->j_nonzerolen));
+		node->bh = NULL;
+		node->jlist = NULL;
+	}
+}
+
+/*
+ * Give back the memory and the device held by the journal of p_s_sb:
+ * the current journal list, the cnode pool, the list bitmaps and bitmap
+ * nodes, the header buffer, the journal device and the journal itself.
+ */
+static void free_journal_ram(struct super_block *p_s_sb) {
+	struct reiserfs_journal *jnl = SB_JOURNAL(p_s_sb);
+
+	reiserfs_kfree(jnl->j_current_jl,
+		       sizeof(struct reiserfs_journal_list), p_s_sb);
+	jnl->j_num_lists--;
+
+	vfree(jnl->j_cnode_free_orig);
+	free_list_bitmaps(p_s_sb, jnl->j_list_bitmap);
+	/* must be after free_list_bitmaps */
+	free_bitmap_nodes(p_s_sb);
+
+	/* j_header_bh lives on the journal dev, so it has to be released
+	 * before the journal dev itself is let go
+	 */
+	if (jnl->j_header_bh)
+		brelse(jnl->j_header_bh);
+	release_journal_dev(p_s_sb, jnl);
+
+	vfree(jnl);
+}
+
+/*
+** call on unmount. Only set error to 1 if you haven't made your way out
+** of read_super() yet. Any other caller must keep error at 0.
+**
+** With error == 0 on a writable mount, the running transaction th is ended
+** and one more transaction touching the super block buffer is logged, both
+** with FLUSH_ALL.  After that the delayed commit work is cancelled, the
+** commit workqueue is flushed (and destroyed when the last reiserfs mount
+** goes away) and the journal memory is freed.  Always returns 0.
+*/
+static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) {
+	struct reiserfs_transaction_handle myth ;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+
+	/* we only want to flush out transactions if we were called with error == 0
+	 */
+	if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
+		/* end the current trans */
+		BUG_ON (!th->t_trans_id);
+		do_journal_end(th, p_s_sb,10, FLUSH_ALL) ;
+
+		/* make sure something gets logged to force our way into the flush code */
+		if (!journal_join(&myth, p_s_sb, 1)) {
+			reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+			journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+			do_journal_end(&myth, p_s_sb,1, FLUSH_ALL) ;
+		}
+	}
+
+	/* this also catches errors during the do_journal_end above */
+	if (!error && reiserfs_is_journal_aborted(journal)) {
+		memset(&myth, 0, sizeof(myth));
+		if (!journal_join_abort(&myth, p_s_sb, 1)) {
+			reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+			journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+			do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL) ;
+		}
+	}
+
+	reiserfs_mounted_fs_count-- ;
+	/* wait for all commits to finish */
+	cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work);
+	flush_workqueue(commit_wq);
+	/* the workqueue is shared by all mounts; the last one tears it down */
+	if (!reiserfs_mounted_fs_count) {
+		destroy_workqueue(commit_wq);
+		commit_wq = NULL;
+	}
+
+	free_journal_ram(p_s_sb) ;
+
+	return 0 ;
+}
+
+/*
+** call on unmount. flush all journal trans, release all alloc'd ram
+**
+** forwards to do_journal_release() with error == 0 and returns its result,
+** which is always 0.
+*/
+int journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) {
+ return do_journal_release(th, p_s_sb, 0) ;
+}
+/*
+** only call from an error condition inside reiserfs_read_super!
+**
+** forwards to do_journal_release() with error == 1, which skips ending and
+** flushing transactions and only tears the journal down.  Returns 0.
+*/
+int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) {
+ return do_journal_release(th, p_s_sb, 1) ;
+}
+
+/*
+** Check a description block against its commit block.
+** returns 1 when they do not belong together (transaction id or length
+** differ) or when the commit length is zero or larger than j_trans_max;
+** returns 0 when they match.
+*/
+static int journal_compare_desc_commit(struct super_block *p_s_sb, struct reiserfs_journal_desc *desc,
+				       struct reiserfs_journal_commit *commit) {
+	/* both blocks must name the same transaction */
+	if (get_commit_trans_id (commit) != get_desc_trans_id (desc))
+		return 1;
+	if (get_commit_trans_len (commit) != get_desc_trans_len (desc))
+		return 1;
+
+	/* and the length must be within the journal's limits */
+	if (get_commit_trans_len (commit) > SB_JOURNAL(p_s_sb)->j_trans_max)
+		return 1;
+	if (get_commit_trans_len (commit) <= 0)
+		return 1;
+
+	return 0;
+}
+/*
+** Decide whether the block in d_bh starts a transaction that can be replayed.
+**
+** returns  0 if d_bh is NULL or is not a description block, if its trans id
+**            is above *oldest_invalid_trans_id, or if the commit block cannot
+**            be read
+** returns -1 if the mount id is older than *newest_mount_id, the length is
+**            out of range, or the commit block does not match (in that case
+**            *oldest_invalid_trans_id is set to this trans id)
+** returns  1 if both desc and commit were valid
+*/
+static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, unsigned long *oldest_invalid_trans_id, unsigned long *newest_mount_id) {
+	struct reiserfs_journal_desc *desc ;
+	struct reiserfs_journal_commit *commit ;
+	struct buffer_head *c_bh ;
+	unsigned long offset ;
+
+	if (!d_bh)
+		return 0 ;
+
+	/* not a description block at all? */
+	desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
+	if (!(get_desc_trans_len(desc) > 0) ||
+	    memcmp(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8))
+		return 0 ;
+
+	if (oldest_invalid_trans_id && *oldest_invalid_trans_id && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-986: transaction "
+			       "is valid returning because trans_id %d is greater than "
+			       "oldest_invalid %lu", get_desc_trans_id(desc),
+			       *oldest_invalid_trans_id);
+		return 0 ;
+	}
+	if (newest_mount_id && *newest_mount_id > get_desc_mount_id (desc)) {
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1087: transaction "
+			       "is valid returning because mount_id %d is less than "
+			       "newest_mount_id %lu", get_desc_mount_id (desc),
+			       *newest_mount_id) ;
+		return -1 ;
+	}
+	if ( get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max ) {
+		reiserfs_warning(p_s_sb, "journal-2018: Bad transaction length %d encountered, ignoring transaction", get_desc_trans_len(desc));
+		return -1 ;
+	}
+
+	/* ok, we have a journal description block, lets see if the transaction was valid */
+	offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
+	c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+			     ((offset + get_desc_trans_len(desc) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
+	if (!c_bh)
+		return 0 ;
+
+	commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
+	if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+			       "journal_transaction_is_valid, commit offset %ld had bad "
+			       "time %d or length %d",
+			       c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+			       get_commit_trans_id (commit),
+			       get_commit_trans_len(commit));
+		brelse(c_bh) ;
+		/* remember this id so later, newer transactions are rejected */
+		if (oldest_invalid_trans_id) {
+			*oldest_invalid_trans_id = get_desc_trans_id(desc) ;
+			reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1004: "
+				       "transaction_is_valid setting oldest invalid trans_id "
+				       "to %d", get_desc_trans_id(desc)) ;
+		}
+		return -1;
+	}
+
+	brelse(c_bh) ;
+	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid "
+		       "transaction start offset %llu, len %d id %d",
+		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+		       get_desc_trans_len(desc), get_desc_trans_id(desc)) ;
+	return 1 ;
+}
+
+/* call brelse() on the first num entries of heads, in order */
+static void brelse_array(struct buffer_head **heads, int num) {
+	struct buffer_head **cursor = heads;
+	int remaining = num;
+
+	while (remaining-- > 0)
+		brelse(*cursor++);
+}
+
+/*
+** given the start, and values for the oldest acceptable transactions,
+** this either reads in and replays a transaction, or returns because the transaction
+** is invalid, or too old.
+*/
+static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cur_dblock, unsigned long oldest_start,
+ unsigned long oldest_trans_id, unsigned long newest_mount_id) {
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+ struct reiserfs_journal_desc *desc ;
+ struct reiserfs_journal_commit *commit ;
+ unsigned long trans_id = 0 ;
+ struct buffer_head *c_bh ;
+ struct buffer_head *d_bh ;
+ struct buffer_head **log_blocks = NULL ;
+ struct buffer_head **real_blocks = NULL ;
+ unsigned long trans_offset ;
+ int i;
+ int trans_half;
+
+ d_bh = journal_bread(p_s_sb, cur_dblock) ;
+ if (!d_bh)
+ return 1 ;
+ desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
+ trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
+ "journal_read_transaction, offset %llu, len %d mount_id %d",
+ d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+ get_desc_trans_len(desc), get_desc_mount_id(desc)) ;
+ if (get_desc_trans_id(desc) < oldest_trans_id) {
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: "
+ "journal_read_trans skipping because %lu is too old",
+ cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ;
+ brelse(d_bh) ;
+ return 1 ;
+ }
+ if (get_desc_mount_id(desc) != newest_mount_id) {
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: "
+ "journal_read_trans skipping because %d is != "
+ "newest_mount_id %lu", get_desc_mount_id(desc),
+ newest_mount_id) ;
+ brelse(d_bh) ;
+ return 1 ;
+ }
+ c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+ ((trans_offset + get_desc_trans_len(desc) + 1) %
+ SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
+ if (!c_bh) {
+ brelse(d_bh) ;
+ return 1 ;
+ }
+ commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
+ if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, "
+ "commit offset %llu had bad time %d or length %d",
+ c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+ get_commit_trans_id(commit), get_commit_trans_len(commit));
+ brelse(c_bh) ;
+ brelse(d_bh) ;
+ return 1;
+ }
+ trans_id = get_desc_trans_id(desc) ;
+ /* now we know we've got a good transaction, and it was inside the valid time ranges */
+ log_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ;
+ real_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ;
+ if (!log_blocks || !real_blocks) {
+ brelse(c_bh) ;
+ brelse(d_bh) ;
+ reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
+ reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
+ reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS") ;
+ return -1 ;
+ }
+ /* get all the buffer heads */
+ trans_half = journal_trans_half (p_s_sb->s_blocksize) ;
+ for(i = 0 ; i < get_desc_trans_len(desc) ; i++) {
+ log_blocks[i] = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + (trans_offset + 1 + i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+ if (i < trans_half) {
+ real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(desc->j_realblock[i])) ;
+ } else {
+ real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(commit->j_realblock[i - trans_half])) ;
+ }
+ if ( real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb) ) {
+ reiserfs_warning(p_s_sb, "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem");
+ goto abort_replay;
+ }
+ /* make sure we don't try to replay onto log or reserved area */
+ if (is_block_in_log_or_reserved_area(p_s_sb, real_blocks[i]->b_blocknr)) {
+ reiserfs_warning(p_s_sb, "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block") ;
+abort_replay:
+ brelse_array(log_blocks, i) ;
+ brelse_array(real_blocks, i) ;
+ brelse(c_bh) ;
+ brelse(d_bh) ;
+ reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
+ reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
+ return -1 ;
+ }
+ }
+ /* read in the log blocks, memcpy to the corresponding real block */
+ ll_rw_block(READ, get_desc_trans_len(desc), log_blocks) ;
+ for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
+ wait_on_buffer(log_blocks[i]) ;
+ if (!buffer_uptodate(log_blocks[i])) {
+ reiserfs_warning(p_s_sb, "journal-1212: REPLAY FAILURE fsck required! buffer write failed") ;
+ brelse_array(log_blocks + i, get_desc_trans_len(desc) - i) ;
+ brelse_array(real_blocks, get_desc_trans_len(desc)) ;
+ brelse(c_bh) ;
+ brelse(d_bh) ;
+ reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
+ reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
+ return -1 ;
+ }
+ memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, real_blocks[i]->b_size) ;
+ set_buffer_uptodate(real_blocks[i]) ;
+ brelse(log_blocks[i]) ;
+ }
+ /* flush out the real blocks */
+ for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
+ set_buffer_dirty(real_blocks[i]) ;
+ ll_rw_block(WRITE, 1, real_blocks + i) ;
+ }
+ for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
+ wait_on_buffer(real_blocks[i]) ;
+ if (!buffer_uptodate(real_blocks[i])) {
+ reiserfs_warning(p_s_sb, "journal-1226: REPLAY FAILURE, fsck required! buffer write failed") ;
+ brelse_array(real_blocks + i, get_desc_trans_len(desc) - i) ;
+ brelse(c_bh) ;
+ brelse(d_bh) ;
+ reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
+ reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
+ return -1 ;
+ }
+ brelse(real_blocks[i]) ;
+ }
+ cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((trans_offset + get_desc_trans_len(desc) + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ;
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal "
+ "start to offset %ld",
+ cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ;
+
+ /* init starting values for the first transaction, in case this is the last transaction to be replayed. */
+ journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
+ journal->j_last_flush_trans_id = trans_id ;
+ journal->j_trans_id = trans_id + 1;
+ brelse(c_bh) ;
+ brelse(d_bh) ;
+ reiserfs_kfree(log_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ;
+ reiserfs_kfree(real_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ;
+ return 0 ;
+}
+
+/* Read-ahead helper: returns the buffer for `block`, and while doing so
+   submits reads for the blocks that follow it, up to (but not including)
+   max_block and never more than BUFNR buffers in one go. Read-ahead stops
+   early at the first following block that is already uptodate.
+   This proved to improve mounting speed on self-rebuilding raid5 arrays
+   at least.
+   Returns the buffer for `block` if it is uptodate after the read, NULL
+   otherwise. The read-ahead buffers are released without being waited on.
+   Right now it is only used from journal code. But later we might use it
+   from other places.
+   Note: Do not use journal_getblk/sb_getblk functions here! */
+static struct buffer_head * reiserfs_breada (struct block_device *dev, int block, int bufsize,
+					     unsigned int max_block)
+{
+	struct buffer_head *batch[BUFNR];
+	struct buffer_head *first;
+	struct buffer_head *ahead;
+	unsigned int limit;
+	int queued;
+	int k;
+
+	first = __getblk (dev, block, bufsize);
+	if (buffer_uptodate (first))
+		return first;
+
+	/* clamp the batch so it does not run past max_block */
+	limit = (block + BUFNR > max_block) ? max_block - block : BUFNR;
+
+	batch[0] = first;
+	queued = 1;
+	for (k = 1; k < limit; k++) {
+		ahead = __getblk (dev, block + k, bufsize);
+		if (buffer_uptodate (ahead)) {
+			brelse (ahead);
+			break;
+		}
+		batch[queued++] = ahead;
+	}
+
+	ll_rw_block (READ, queued, batch);
+	for (k = 1; k < queued; k++)
+		brelse (batch[k]);
+
+	wait_on_buffer (first);
+	if (!buffer_uptodate (first)) {
+		brelse (first);
+		return NULL;
+	}
+	return first;
+}
+
+/*
+** read and replay the log
+** on a clean unmount, the journal header's next unflushed pointer will be to an invalid
+** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast.
+**
+** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid.
+**
+** On exit, it sets things up so the first transaction will work correctly.
+**
+** returns 0 on success, 1 if the journal header block could not be read,
+** -1 if the device is read-only when the full log scan would be needed or if
+** updating the journal header block fails, and otherwise the negative value
+** returned by journal_read_transaction.
+*/
+static int journal_read(struct super_block *p_s_sb) {
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+ struct reiserfs_journal_desc *desc ;
+ unsigned long oldest_trans_id = 0;
+ unsigned long oldest_invalid_trans_id = 0 ;
+ time_t start ;
+ unsigned long oldest_start = 0;
+ unsigned long cur_dblock = 0 ;
+ unsigned long newest_mount_id = 9 ;
+ struct buffer_head *d_bh ;
+ struct reiserfs_journal_header *jh ;
+ int valid_journal_header = 0 ;
+ int replay_count = 0 ;
+ int continue_replay = 1 ;
+ int ret ;
+ char b[BDEVNAME_SIZE];
+
+ cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
+ reiserfs_info (p_s_sb, "checking transaction log (%s)\n",
+ bdevname(journal->j_dev_bd, b));
+ start = get_seconds();
+
+ /* step 1, read in the journal header block (it sits right after the
+ ** last log block). Check the transaction it says
+ ** is the first unflushed, and if that transaction is not valid,
+ ** replay is done
+ */
+ journal->j_header_bh = journal_bread(p_s_sb,
+ SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+ SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+ if (!journal->j_header_bh) {
+ return 1 ;
+ }
+ jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ;
+ if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 &&
+ le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(p_s_sb) &&
+ le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
+ oldest_start = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+ le32_to_cpu(jh->j_first_unflushed_offset) ;
+ oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
+ newest_mount_id = le32_to_cpu(jh->j_mount_id);
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1153: found in "
+ "header: first_unflushed_offset %d, last_flushed_trans_id "
+ "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
+ le32_to_cpu(jh->j_last_flush_trans_id)) ;
+ valid_journal_header = 1 ;
+
+ /* now, we try to read the first unflushed offset. If it is not valid,
+ ** there is nothing more we can do, and it makes no sense to read
+ ** through the whole log.
+ **
+ ** NOTE(review): the goto at the end of this branch jumps past the
+ ** bdev_read_only() check below, so with a usable header the replay loop
+ ** is entered without that check. Confirm whether this is intended.
+ */
+ d_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + le32_to_cpu(jh->j_first_unflushed_offset)) ;
+ ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL) ;
+ if (!ret) {
+ continue_replay = 0 ;
+ }
+ brelse(d_bh) ;
+ goto start_log_replay;
+ }
+
+ if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) {
+ reiserfs_warning (p_s_sb,
+ "clm-2076: device is readonly, unable to replay log") ;
+ return -1 ;
+ }
+
+ /* only reached when the header did not pass the checks above. start with
+ ** the first log block, find all the valid transactions, and pick out the
+ ** oldest (smallest trans_id) as the place to begin replay.
+ */
+ while(continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb))) {
+ /* Note that it is required for blocksize of primary fs device and journal
+ device to be the same (the fs block size is passed for the journal device) */
+ d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, p_s_sb->s_blocksize,
+ SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ;
+ ret = journal_transaction_is_valid(p_s_sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id) ;
+ if (ret == 1) {
+ desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
+ if (oldest_start == 0) { /* init all oldest_ values */
+ oldest_trans_id = get_desc_trans_id(desc) ;
+ oldest_start = d_bh->b_blocknr ;
+ newest_mount_id = get_desc_mount_id(desc) ;
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting "
+ "oldest_start to offset %llu, trans_id %lu",
+ oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+ oldest_trans_id) ;
+ } else if (oldest_trans_id > get_desc_trans_id(desc)) {
+ /* one we just read was older */
+ oldest_trans_id = get_desc_trans_id(desc) ;
+ oldest_start = d_bh->b_blocknr ;
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting "
+ "oldest_start to offset %lu, trans_id %lu",
+ oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+ oldest_trans_id) ;
+ }
+ if (newest_mount_id < get_desc_mount_id(desc)) {
+ newest_mount_id = get_desc_mount_id(desc) ;
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
+ "newest_mount_id to %d", get_desc_mount_id(desc));
+ }
+ cur_dblock += get_desc_trans_len(desc) + 2 ; /* skip desc block, logged blocks and commit block */
+ } else {
+ cur_dblock++ ;
+ }
+ brelse(d_bh) ;
+ }
+
+start_log_replay:
+ cur_dblock = oldest_start ;
+ if (oldest_trans_id) {
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay "
+ "from offset %llu, trans_id %lu",
+ cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+ oldest_trans_id) ;
+
+ }
+ replay_count = 0 ;
+ while(continue_replay && oldest_trans_id > 0) {
+ ret = journal_read_transaction(p_s_sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id) ;
+ if (ret < 0) {
+ return ret ;
+ } else if (ret != 0) {
+ break ;
+ }
+ cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start ; /* j_start was advanced by journal_read_transaction */
+ replay_count++ ;
+ if (cur_dblock == oldest_start) /* back at the block where replay began */
+ break;
+ }
+
+ if (oldest_trans_id == 0) {
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1225: No valid "
+ "transactions found") ;
+ }
+ /* j_start does not get set correctly if we don't replay any transactions.
+ ** if we had a valid journal_header, set j_start to the first unflushed transaction value,
+ ** copy the trans_id from the header. In every case the mount id used from
+ ** now on is one more than the newest one seen.
+ */
+ if (valid_journal_header && replay_count == 0) {
+ journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset) ;
+ journal->j_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
+ journal->j_last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) ;
+ journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
+ } else {
+ journal->j_mount_id = newest_mount_id + 1 ;
+ }
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
+ "newest_mount_id to %lu", journal->j_mount_id) ;
+ journal->j_first_unflushed_offset = journal->j_start ;
+ if (replay_count > 0) {
+ reiserfs_info (p_s_sb, "replayed %d transactions in %lu seconds\n",
+ replay_count, get_seconds() - start) ;
+ }
+ if (!bdev_read_only(p_s_sb->s_bdev) &&
+ _update_journal_header_block(p_s_sb, journal->j_start,
+ journal->j_last_flush_trans_id))
+ {
+ /* writing the journal header failed, caller must call free_journal_ram
+ ** and abort the mount
+ */
+ return -1 ;
+ }
+ return 0 ;
+}
+
+/*
+** allocate a zeroed journal list with its list heads and commit lock
+** initialised. Never returns NULL: when the allocation fails it yields
+** and tries again. Also bumps the journal's j_num_lists counter and calls
+** get_journal_list() on the new list.
+*/
+static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
+{
+	struct reiserfs_journal_list *new_jl;
+
+	for (;;) {
+		new_jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
+		if (new_jl)
+			break;
+		yield();
+	}
+
+	memset(new_jl, 0, sizeof(*new_jl));
+	INIT_LIST_HEAD(&new_jl->j_list);
+	INIT_LIST_HEAD(&new_jl->j_working_list);
+	INIT_LIST_HEAD(&new_jl->j_tail_bh_list);
+	INIT_LIST_HEAD(&new_jl->j_bh_list);
+	sema_init(&new_jl->j_commit_lock, 1);
+	SB_JOURNAL(s)->j_num_lists++;
+	get_journal_list(new_jl);
+	return new_jl;
+}
+
+/*
+** give the journal its first current journal list.
+*/
+static void journal_list_init(struct super_block *p_s_sb) {
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+
+	journal->j_current_jl = alloc_journal_list(p_s_sb);
+}
+
+/*
+** drop whatever journal_init_dev acquired: close the device file if the
+** journal was opened by name, otherwise put the block device. Both
+** pointers are cleared. Returns the result of the close/put (0 if there
+** was nothing to release) and warns when it is non-zero.
+*/
+static int release_journal_dev( struct super_block *super,
+				struct reiserfs_journal *journal )
+{
+	int err = 0;
+
+	if (journal->j_dev_file) {
+		err = filp_close(journal->j_dev_file, NULL);
+		journal->j_dev_file = NULL;
+		journal->j_dev_bd = NULL;
+	} else if (journal->j_dev_bd) {
+		err = blkdev_put(journal->j_dev_bd);
+		journal->j_dev_bd = NULL;
+	}
+
+	if (err)
+		reiserfs_warning(super, "sh-457: release_journal_dev: Cannot release journal device: %i", err );
+	return err;
+}
+
+/*
+** open the journal device and store it in journal->j_dev_bd (and
+** journal->j_dev_file when it was opened by name).
+**
+** without a jdev_name the device number recorded in the super block is
+** used, or the filesystem's own device when none is recorded. With a
+** jdev_name the named file is opened and must be a block device.
+**
+** returns 0 on success or a negative errno. On failure both j_dev_bd and
+** j_dev_file are left NULL, and the "journal device" info line is not
+** printed (it would have to name a NULL block device).
+*/
+static int journal_init_dev( struct super_block *super,
+			     struct reiserfs_journal *journal,
+			     const char *jdev_name )
+{
+	int result;
+	dev_t jdev;
+	int blkdev_mode = FMODE_READ | FMODE_WRITE;
+	char b[BDEVNAME_SIZE];
+
+	result = 0;
+
+	journal -> j_dev_bd = NULL;
+	journal -> j_dev_file = NULL;
+	jdev = SB_ONDISK_JOURNAL_DEVICE( super ) ?
+		new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
+
+	if (bdev_read_only(super->s_bdev))
+		blkdev_mode = FMODE_READ;
+
+	/* there is no "jdev" option: open by device number */
+	if( ( !jdev_name || !jdev_name[ 0 ] ) ) {
+		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+		if (IS_ERR(journal->j_dev_bd)) {
+			result = PTR_ERR(journal->j_dev_bd);
+			journal->j_dev_bd = NULL;
+			reiserfs_warning (super, "sh-458: journal_init_dev: "
+					  "cannot init journal device '%s': %i",
+					  __bdevname(jdev, b), result );
+			return result;
+		} else if (jdev != super->s_dev)
+			set_blocksize(journal->j_dev_bd, super->s_blocksize);
+		return 0;
+	}
+
+	journal -> j_dev_file = filp_open( jdev_name, 0, 0 );
+	if( !IS_ERR( journal -> j_dev_file ) ) {
+		struct inode *jdev_inode = journal->j_dev_file->f_mapping->host;
+		if( !S_ISBLK( jdev_inode -> i_mode ) ) {
+			reiserfs_warning (super, "journal_init_dev: '%s' is "
+					  "not a block device", jdev_name );
+			result = -ENOTBLK;
+		} else {
+			/* ok */
+			journal->j_dev_bd = I_BDEV(jdev_inode);
+			set_blocksize(journal->j_dev_bd, super->s_blocksize);
+		}
+	} else {
+		result = PTR_ERR( journal -> j_dev_file );
+		journal -> j_dev_file = NULL;
+		reiserfs_warning (super,
+				  "journal_init_dev: Cannot open '%s': %i",
+				  jdev_name, result );
+	}
+	if( result != 0 ) {
+		/* release_journal_dev leaves j_dev_bd NULL, so return before the
+		** info line below hands it to bdevname()
+		*/
+		release_journal_dev( super, journal );
+		return result;
+	}
+	reiserfs_info(super, "journal_init_dev: journal device: %s\n",
+		      bdevname(journal->j_dev_bd, b));
+	return 0;
+}
+
+/*
+** must be called once on fs mount. calls journal_read for you
+**
+** allocates and initialises the in-memory journal for p_s_sb, opens the
+** journal device (j_dev_name may be NULL or empty), takes the tunables
+** from the on-disk journal header (falling back to defaults and clamping
+** them), and replays the log. commit_max_age, when non-zero, overrides
+** both the max commit age and the max transaction age.
+**
+** returns 0 on success and 1 on failure. Except when the journal
+** structure itself cannot be allocated, failure paths call
+** free_journal_ram() first.
+*/
+int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_format, unsigned int commit_max_age) {
+	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2 ;
+	struct buffer_head *bhjh;
+	struct reiserfs_super_block * rs;
+	struct reiserfs_journal_header *jh;
+	struct reiserfs_journal *journal;
+	struct reiserfs_journal_list *jl;
+	char b[BDEVNAME_SIZE];
+
+	journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
+	if (!journal) {
+		reiserfs_warning (p_s_sb, "journal-1256: unable to get memory for journal structure") ;
+		return 1 ;
+	}
+	memset(journal, 0, sizeof(struct reiserfs_journal)) ;
+	INIT_LIST_HEAD(&journal->j_bitmap_nodes) ;
+	INIT_LIST_HEAD (&journal->j_prealloc_list);
+	INIT_LIST_HEAD(&journal->j_working_list);
+	INIT_LIST_HEAD(&journal->j_journal_list);
+	journal->j_persistent_trans = 0;
+	if (reiserfs_allocate_list_bitmaps(p_s_sb,
+					   journal->j_list_bitmap,
+					   SB_BMAP_NR(p_s_sb)))
+		goto free_and_return ;
+	allocate_bitmap_nodes(p_s_sb) ;
+
+	/* reserved for journal area support */
+	SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ?
+						 REISERFS_OLD_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize +
+						 SB_BMAP_NR(p_s_sb) + 1 :
+						 REISERFS_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + 2);
+
+	/* Sanity check to see if the standard journal fits within the first
+	   bitmap (relevant for small blocksizes) */
+	if ( !SB_ONDISK_JOURNAL_DEVICE( p_s_sb ) &&
+	     (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8) ) {
+		reiserfs_warning (p_s_sb, "journal-1393: journal does not fit for area "
+				  "addressed by first of bitmap blocks. It starts at "
+				  "%u and its size is %u. Block size %ld",
+				  SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb),
+				  SB_ONDISK_JOURNAL_SIZE(p_s_sb), p_s_sb->s_blocksize);
+		goto free_and_return;
+	}
+
+	if( journal_init_dev( p_s_sb, journal, j_dev_name ) != 0 ) {
+		reiserfs_warning (p_s_sb, "sh-462: unable to initialize jornal device");
+		goto free_and_return;
+	}
+
+	rs = SB_DISK_SUPER_BLOCK(p_s_sb);
+
+	/* read journal header */
+	bhjh = journal_bread(p_s_sb,
+			     SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+	if (!bhjh) {
+		reiserfs_warning (p_s_sb, "sh-459: unable to read journal header");
+		goto free_and_return;
+	}
+	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
+
+	/* make sure that journal matches to the super block. The header is
+	** on-disk data, so its magic is converted like the other jh_journal
+	** fields read below before it is compared or printed.
+	** NOTE(review): assumes sb_jp_journal_magic() returns a cpu-order
+	** value - confirm against its definition.
+	*/
+	if (is_reiserfs_jr(rs) && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != sb_jp_journal_magic(rs))) {
+		reiserfs_warning (p_s_sb, "sh-460: journal header magic %x "
+				  "(device %s) does not match to magic found in super "
+				  "block %x",
+				  le32_to_cpu(jh->jh_journal.jp_journal_magic),
+				  bdevname( journal->j_dev_bd, b),
+				  sb_jp_journal_magic(rs));
+		brelse (bhjh);
+		goto free_and_return;
+	}
+
+	journal->j_trans_max = le32_to_cpu (jh->jh_journal.jp_journal_trans_max);
+	journal->j_max_batch = le32_to_cpu (jh->jh_journal.jp_journal_max_batch);
+	journal->j_max_commit_age = le32_to_cpu (jh->jh_journal.jp_journal_max_commit_age);
+	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
+
+	if (journal->j_trans_max) {
+		/* the header supplied a max transaction size: clamp it to the
+		   allowed range and derive max batch from the result */
+		__u32 initial = journal->j_trans_max;
+		__u32 ratio = 1;
+
+		if (p_s_sb->s_blocksize < 4096)
+			ratio = 4096 / p_s_sb->s_blocksize;
+
+		if (SB_ONDISK_JOURNAL_SIZE(p_s_sb)/journal->j_trans_max < JOURNAL_MIN_RATIO)
+			journal->j_trans_max = SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO;
+		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio)
+			journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT / ratio;
+		if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio)
+			journal->j_trans_max = JOURNAL_TRANS_MIN_DEFAULT / ratio;
+
+		if (journal->j_trans_max != initial)
+			reiserfs_warning (p_s_sb, "sh-461: journal_init: wrong transaction max size (%u). Changed to %u",
+					  initial, journal->j_trans_max);
+
+		journal->j_max_batch = journal->j_trans_max*
+			JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT;
+	}
+
+	if (!journal->j_trans_max) {
+		/* the file system was created by an old version of mkreiserfs,
+		   so this field contains zero value */
+		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT ;
+		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT ;
+		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE ;
+
+		/* for blocksize >= 4096 - max transaction size is 1024. For block size < 4096
+		   trans max size is decreased proportionally */
+		if (p_s_sb->s_blocksize < 4096) {
+			journal->j_trans_max /= (4096 / p_s_sb->s_blocksize) ;
+			journal->j_max_batch = (journal->j_trans_max) * 9 / 10 ;
+		}
+	}
+
+	journal->j_default_max_commit_age = journal->j_max_commit_age;
+
+	if (commit_max_age != 0) {
+		journal->j_max_commit_age = commit_max_age;
+		journal->j_max_trans_age = commit_max_age;
+	}
+
+	reiserfs_info (p_s_sb, "journal params: device %s, size %u, "
+		       "journal first block %u, max trans len %u, max batch %u, "
+		       "max commit age %u, max trans age %u\n",
+		       bdevname( journal->j_dev_bd, b),
+		       SB_ONDISK_JOURNAL_SIZE(p_s_sb),
+		       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+		       journal->j_trans_max,
+		       journal->j_max_batch,
+		       journal->j_max_commit_age,
+		       journal->j_max_trans_age);
+
+	brelse (bhjh);
+
+	journal->j_list_bitmap_index = 0 ;
+	journal_list_init(p_s_sb) ;
+
+	memset(journal->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
+
+	INIT_LIST_HEAD(&journal->j_dirty_buffers) ;
+	spin_lock_init(&journal->j_dirty_buffers_lock) ;
+
+	journal->j_start = 0 ;
+	journal->j_len = 0 ;
+	journal->j_len_alloc = 0 ;
+	atomic_set(&(journal->j_wcount), 0) ;
+	atomic_set(&(journal->j_async_throttle), 0) ;
+	journal->j_bcount = 0 ;
+	journal->j_trans_start_time = 0 ;
+	journal->j_last = NULL ;
+	journal->j_first = NULL ;
+	init_waitqueue_head(&(journal->j_join_wait)) ;
+	sema_init(&journal->j_lock, 1);
+	sema_init(&journal->j_flush_sem, 1);
+
+	/* starting ids; journal_read overwrites them from what it finds in the log */
+	journal->j_trans_id = 10 ;
+	journal->j_mount_id = 10 ;
+	journal->j_state = 0 ;
+	atomic_set(&(journal->j_jlock), 0) ;
+	journal->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
+	journal->j_cnode_free_orig = journal->j_cnode_free_list ;
+	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0 ;
+	journal->j_cnode_used = 0 ;
+	journal->j_must_wait = 0 ;
+
+	init_journal_hash(p_s_sb) ;
+	jl = journal->j_current_jl;
+	jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
+	if (!jl->j_list_bitmap) {
+		reiserfs_warning(p_s_sb, "journal-2005, get_list_bitmap failed for journal list 0") ;
+		goto free_and_return;
+	}
+	if (journal_read(p_s_sb) < 0) {
+		reiserfs_warning(p_s_sb, "Replay Failure, unable to mount") ;
+		goto free_and_return;
+	}
+
+	/* the commit workqueue is created by the first mounted filesystem */
+	reiserfs_mounted_fs_count++ ;
+	if (reiserfs_mounted_fs_count <= 1)
+		commit_wq = create_workqueue("reiserfs");
+
+	INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb);
+	return 0 ;
+free_and_return:
+	free_journal_ram(p_s_sb);
+	return 1;
+}
+
+/*
+** test for a polite end of the current transaction. Used by file_write, and should
+** be used by delete to make sure they don't write more than can fit inside a single
+** transaction
+**
+** returns 1 when the caller should end the transaction, 0 otherwise. A
+** nested handle (t_refcount > 1) always gets 0.
+*/
+int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
+	struct reiserfs_journal *journal = SB_JOURNAL (th->t_super);
+	time_t now = get_seconds() ;
+
+	BUG_ON (!th->t_trans_id);
+
+	/* cannot restart while nested */
+	if (th->t_refcount > 1)
+		return 0 ;
+
+	/* someone asked writers to wait */
+	if (journal->j_must_wait > 0)
+		return 1 ;
+	/* the extra blocks would reach the batch limit */
+	if ((journal->j_len_alloc + new_alloc) >= journal->j_max_batch)
+		return 1 ;
+	/* the journal lock count is non-zero */
+	if (atomic_read(&(journal->j_jlock)))
+		return 1 ;
+	/* the transaction is older than allowed */
+	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age)
+		return 1 ;
+	/* running low on free cnodes */
+	if (journal->j_cnode_free < (journal->j_trans_max * 3))
+		return 1 ;
+
+	return 0 ;
+}
+
+/* this must be called inside a transaction, and requires the
+** kernel_lock to be held
+**
+** sets j_must_wait and the J_WRITERS_BLOCKED state bit on the journal.
+*/
+void reiserfs_block_writes(struct reiserfs_transaction_handle *th) {
+	struct reiserfs_journal *jrnl = SB_JOURNAL (th->t_super);
+
+	BUG_ON (!th->t_trans_id);
+	jrnl->j_must_wait = 1 ;
+	set_bit(J_WRITERS_BLOCKED, &jrnl->j_state) ;
+}
+
+/* this must be called without a transaction started, and does not
+** require BKL
+**
+** clears the J_WRITERS_BLOCKED state bit and wakes everyone sleeping on
+** j_join_wait.
+*/
+void reiserfs_allow_writes(struct super_block *s) {
+	struct reiserfs_journal *jrnl = SB_JOURNAL (s);
+
+	clear_bit(J_WRITERS_BLOCKED, &jrnl->j_state) ;
+	wake_up(&jrnl->j_join_wait) ;
+}
+
+/* this must be called without a transaction started, and does not
+** require BKL
+**
+** sleeps on j_join_wait until the J_WRITERS_BLOCKED state bit is clear.
+*/
+void reiserfs_wait_on_write_block(struct super_block *s) {
+	struct reiserfs_journal *jrnl = SB_JOURNAL (s);
+
+	wait_event(jrnl->j_join_wait,
+		   !test_bit(J_WRITERS_BLOCKED, &jrnl->j_state)) ;
+}
+
+/*
+** mark the journal as having queued writers and sleep on j_join_wait at
+** most once, unless J_WRITERS_QUEUED has already been cleared again by
+** the time this task is on the wait queue.
+*/
+static void queue_log_writer(struct super_block *s) {
+	struct reiserfs_journal *journal = SB_JOURNAL (s);
+	wait_queue_t wq_entry;
+
+	set_bit(J_WRITERS_QUEUED, &journal->j_state);
+
+	/*
+	 * wait_event is not used here because it would loop;
+	 * this only waits a single time.
+	 */
+	init_waitqueue_entry(&wq_entry, current);
+	add_wait_queue(&journal->j_join_wait, &wq_entry);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
+		schedule();
+	}
+	current->state = TASK_RUNNING;
+	remove_wait_queue(&journal->j_join_wait, &wq_entry);
+}
+
+/*
+** if J_WRITERS_QUEUED was set, clear it and wake the sleepers on
+** j_join_wait; otherwise do nothing.
+*/
+static void wake_queued_writers(struct super_block *s) {
+	struct reiserfs_journal *jrnl = SB_JOURNAL (s);
+
+	if (!test_and_clear_bit(J_WRITERS_QUEUED, &jrnl->j_state))
+		return;
+	wake_up(&jrnl->j_join_wait);
+}
+
+/*
+** give transaction trans_id a chance to pick up more writers: sleep for
+** a tick, flag the current journal list LIST_COMMIT_PENDING, and queue
+** behind active writers. Stops as soon as the journal has moved on to
+** another transaction or j_bcount did not change during a round.
+*/
+static void let_transaction_grow(struct super_block *sb,
+				 unsigned long trans_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL (sb);
+	unsigned long seen_bcount = journal->j_bcount;
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(1);
+		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
+
+		/* wait while there are writers or the journal lock is held */
+		while ((atomic_read(&journal->j_wcount) > 0 ||
+			atomic_read(&journal->j_jlock)) &&
+		       journal->j_trans_id == trans_id) {
+			queue_log_writer(sb);
+		}
+
+		if (journal->j_trans_id != trans_id ||
+		    seen_bcount == journal->j_bcount)
+			break;
+		seen_bcount = journal->j_bcount;
+	}
+}
+
+/* join == true if you must join an existing transaction.
+** join == false if you can deal with waiting for others to finish
+**
+** this will block until the transaction is joinable. send the number of blocks you
+** expect to use in nblocks.
+**
+** returns 0 with th filled in for the current transaction. On failure
+** (journal aborted and join != JBEGIN_ABORT, or an error from
+** journal_join/do_journal_end) th is zeroed except for t_super and the
+** error is returned.
+*/
+static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
+	time_t now ;
+	/* unsigned long like the trans_id values compared with j_trans_id
+	** elsewhere in this file; a plain int copy would stop comparing
+	** equal to j_trans_id once the id no longer fits in an int
+	*/
+	unsigned long old_trans_id ;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_transaction_handle myth;
+	int retval;
+
+	reiserfs_check_lock_depth(p_s_sb, "journal_begin") ;
+
+	PROC_INFO_INC( p_s_sb, journal.journal_being );
+	/* set here for journal_join */
+	th->t_refcount = 1;
+	th->t_super = p_s_sb ;
+
+relock:
+	lock_journal(p_s_sb) ;
+	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted (journal)) {
+		unlock_journal (p_s_sb);
+		retval = journal->j_errno;
+		goto out_fail;
+	}
+	journal->j_bcount++;
+
+	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
+		unlock_journal(p_s_sb) ;
+		reiserfs_wait_on_write_block(p_s_sb) ;
+		PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
+		goto relock ;
+	}
+	now = get_seconds();
+
+	/* if there is no room in the journal OR
+	** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
+	** we don't sleep if there aren't other writers
+	*/
+
+	if ( (!join && journal->j_must_wait > 0) ||
+	     ( !join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) ||
+	     (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 &&
+	      (now - journal->j_trans_start_time) > journal->j_max_trans_age) ||
+	     (!join && atomic_read(&journal->j_jlock)) ||
+	     (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
+
+		old_trans_id = journal->j_trans_id;
+		unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
+
+		/* much more was reserved than actually logged: with many
+		** writers active, wait for them instead of forcing a commit
+		*/
+		if (!join && (journal->j_len_alloc + nblocks + 2) >=
+		    journal->j_max_batch &&
+		    ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75))
+		{
+			if (atomic_read(&journal->j_wcount) > 10) {
+				queue_log_writer(p_s_sb);
+				goto relock;
+			}
+		}
+		/* don't mess with joining the transaction if all we have to do is
+		 * wait for someone else to do a commit
+		 */
+		if (atomic_read(&journal->j_jlock)) {
+			while (journal->j_trans_id == old_trans_id &&
+			       atomic_read(&journal->j_jlock)) {
+				queue_log_writer(p_s_sb);
+			}
+			goto relock;
+		}
+		retval = journal_join(&myth, p_s_sb, 1) ;
+		if (retval)
+			goto out_fail;
+
+		/* someone might have ended the transaction while we joined */
+		if (old_trans_id != journal->j_trans_id) {
+			retval = do_journal_end(&myth, p_s_sb, 1, 0) ;
+		} else {
+			retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
+		}
+
+		if (retval)
+			goto out_fail;
+
+		PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
+		goto relock ;
+	}
+	/* we are the first writer, set the transaction start time */
+	if (journal->j_trans_start_time == 0) {
+		journal->j_trans_start_time = get_seconds();
+	}
+	atomic_inc(&(journal->j_wcount)) ;
+	journal->j_len_alloc += nblocks ;
+	th->t_blocks_logged = 0 ;
+	th->t_blocks_allocated = nblocks ;
+	th->t_trans_id = journal->j_trans_id ;
+	unlock_journal(p_s_sb) ;
+	INIT_LIST_HEAD (&th->t_list);
+	return 0 ;
+
+out_fail:
+	memset (th, 0, sizeof (*th));
+	/* Re-set th->t_super, so we can properly keep track of how many
+	 * persistent transactions there are. We need to do this so if this
+	 * call is part of a failed restart_transaction, we can free it later */
+	th->t_super = p_s_sb;
+	return retval;
+}
+
+/*
+** get a transaction handle that outlives the caller's stack frame.
+** When a transaction is already running on s, that handle is returned
+** with its refcount bumped. Otherwise a handle is allocated and
+** journal_begin is called for nblocks. Returns NULL when the allocation
+** or journal_begin fails.
+*/
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *s, int nblocks) {
+	struct reiserfs_transaction_handle *handle ;
+
+	/* if we're nesting into an existing transaction. It will be
+	** persistent on its own
+	*/
+	if (reiserfs_transaction_running(s)) {
+		handle = current->journal_info ;
+		handle->t_refcount++ ;
+		BUG_ON (handle->t_refcount < 2);
+		return handle ;
+	}
+
+	handle = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
+	if (!handle)
+		return NULL;
+
+	if (journal_begin(handle, s, nblocks)) {
+		reiserfs_kfree(handle, sizeof(struct reiserfs_transaction_handle), s) ;
+		return NULL;
+	}
+
+	SB_JOURNAL(s)->j_persistent_trans++;
+	return handle ;
+}
+
+/*
+** end a handle obtained from reiserfs_persistent_transaction. Returns
+** the result of journal_end, or -EIO when the handle has no trans_id.
+** Once the handle's refcount is zero it is freed and the journal's
+** persistent transaction count is dropped.
+*/
+int
+reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) {
+	struct super_block *s = th->t_super;
+	int ret;
+
+	if (!th->t_trans_id)
+		ret = -EIO;
+	else
+		ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+
+	if (th->t_refcount != 0)
+		return ret;
+
+	SB_JOURNAL(s)->j_persistent_trans--;
+	reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+	return ret;
+}
+
+/*
+** begin th in JBEGIN_JOIN mode. The task's current handle is remembered
+** in th->t_handle_save; it is a BUG if that handle is nested
+** (t_refcount > 1).
+*/
+static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
+	struct reiserfs_transaction_handle *running = current->journal_info;
+
+	/* this keeps do_journal_end from NULLing out the current->journal_info
+	** pointer
+	*/
+	th->t_handle_save = running ;
+	BUG_ON (running && running->t_refcount > 1);
+
+	return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN) ;
+}
+
+/*
+** same as journal_join, but begins th in JBEGIN_ABORT mode, which
+** do_journal_begin_r lets through even when the journal is aborted.
+*/
+int journal_join_abort(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
+	struct reiserfs_transaction_handle *running = current->journal_info;
+
+	/* this keeps do_journal_end from NULLing out the current->journal_info
+	** pointer
+	*/
+	th->t_handle_save = running ;
+	BUG_ON (running && running->t_refcount > 1);
+
+	return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT) ;
+}
+
+/*
+** start (or nest into) a transaction on p_s_sb that expects to log
+** nblocks blocks.
+**
+** if the task already has a handle for this filesystem, its refcount is
+** bumped, it is copied into th and 0 is returned. Otherwise th becomes
+** current->journal_info and do_journal_begin_r is called; its result is
+** returned. On failure current->journal_info is put back to what it was
+** on entry.
+*/
+int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) {
+	struct reiserfs_transaction_handle *cur_th = current->journal_info ;
+	/* handle to restore on failure; kept in a local because
+	** do_journal_begin_r zeroes *th (including t_handle_save) when it fails
+	*/
+	struct reiserfs_transaction_handle *saved_th = NULL ;
+	int ret ;
+
+	th->t_handle_save = NULL ;
+	if (cur_th) {
+		/* we are nesting into the current transaction */
+		if (cur_th->t_super == p_s_sb) {
+			BUG_ON (!cur_th->t_refcount);
+			cur_th->t_refcount++ ;
+			memcpy(th, cur_th, sizeof(*th));
+			if (th->t_refcount <= 1)
+				reiserfs_warning (p_s_sb, "BAD: refcount <= 1, but journal_info != 0");
+			return 0;
+		} else {
+			/* we've ended up with a handle from a different filesystem.
+			** save it and restore on journal_end. This should never
+			** really happen...
+			*/
+			reiserfs_warning(p_s_sb, "clm-2100: nesting info a different FS") ;
+			saved_th = cur_th ;
+			th->t_handle_save = cur_th ;
+			current->journal_info = th;
+		}
+	} else {
+		current->journal_info = th;
+	}
+	ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG) ;
+	if (current->journal_info != th)
+		BUG() ;
+
+	/* I guess this boils down to being the reciprocal of clm-2100 above.
+	 * If do_journal_begin_r fails, we need to put it back, since journal_end
+	 * won't be called to do it. */
+	if (ret)
+		current->journal_info = saved_th;
+	else
+		BUG_ON (!th->t_refcount);
+
+	return ret ;
+}
+
+/*
+** puts bh into the current transaction: a new cnode is hashed for it and appended
+** to the journal's j_first/j_last list.
+**
+** NOTE(review): the original wording of this comment said that a buffer already in the
+** transaction is reordered (old pointers removed from the hash and new ones put in, to make
+** sure replay happens in the right order), and that a dirty buffer is cleaned and filed onto
+** the clean list. The code below instead returns 0 right away when the buffer is already
+** journaled, and only warns when the buffer is dirty or was not prepared -- confirm which
+** behavior is intended. The buffer can't be allowed to be dirty again until the
+** transaction is committed.
+**
+** if j_len is bigger than j_len_alloc, it pushes j_len_alloc to j_len + JOURNAL_PER_BALANCE_CNT
+** (the original comment gave this as 10 + j_len).
+**
+** returns 0 on success or when bh was already in this transaction, 1 when j_wcount is <= 0.
+*/
+int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+ struct reiserfs_journal_cnode *cn = NULL;
+ int count_already_incd = 0 ;
+ int prepared = 0 ;
+ BUG_ON (!th->t_trans_id);
+
+ PROC_INFO_INC( p_s_sb, journal.mark_dirty );
+ if (th->t_trans_id != journal->j_trans_id) {
+ reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
+ th->t_trans_id, journal->j_trans_id);
+ }
+
+ p_s_sb->s_dirt = 1;
+
+ prepared = test_clear_buffer_journal_prepared (bh);
+ clear_buffer_journal_restore_dirty (bh);
+ /* already in this transaction, we are done */
+ if (buffer_journaled(bh)) {
+ PROC_INFO_INC( p_s_sb, journal.mark_dirty_already );
+ return 0 ;
+ }
+
+ /* this must be turned into a panic instead of a warning. We can't allow
+ ** a dirty or journal_dirty or locked buffer to be logged, as some changes
+ ** could get to disk too early. NOT GOOD.
+ */
+ if (!prepared || buffer_dirty(bh)) {
+ reiserfs_warning (p_s_sb, "journal-1777: buffer %llu bad state "
+ "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
+ (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!',
+ buffer_locked(bh) ? ' ' : '!',
+ buffer_dirty(bh) ? ' ' : '!',
+ buffer_journal_dirty(bh) ? ' ' : '!') ;
+ }
+
+ if (atomic_read(&(journal->j_wcount)) <= 0) {
+ reiserfs_warning (p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d", atomic_read(&(journal->j_wcount))) ;
+ return 1 ;
+ }
+ /* this error means I've screwed up, and we've overflowed the transaction.
+ ** Nothing can be done here, except make the FS readonly or panic.
+ */
+ if (journal->j_len >= journal->j_trans_max) {
+ reiserfs_panic(th->t_super, "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", journal->j_len) ;
+ }
+
+ if (buffer_journal_dirty(bh)) {
+ count_already_incd = 1 ; /* skips the get_bh() below for a buffer that was journal dirty */
+ PROC_INFO_INC( p_s_sb, journal.mark_dirty_notjournal );
+ clear_buffer_journal_dirty (bh);
+ }
+
+ if (journal->j_len > journal->j_len_alloc) {
+ journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT ;
+ }
+
+ set_buffer_journaled (bh);
+
+ /* now put this guy on the end */
+ if (!cn) { /* cn is still NULL here (never assigned above), so this branch always runs */
+ cn = get_cnode(p_s_sb) ;
+ if (!cn) {
+ reiserfs_panic(p_s_sb, "get_cnode failed!\n");
+ }
+
+ if (th->t_blocks_logged == th->t_blocks_allocated) { /* handle used up its allocation: grow it and the journal's by one balance count */
+ th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT ;
+ journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT ;
+ }
+ th->t_blocks_logged++ ;
+ journal->j_len++ ;
+
+ cn->bh = bh ;
+ cn->blocknr = bh->b_blocknr ;
+ cn->sb = p_s_sb;
+ cn->jlist = NULL ;
+ insert_journal_hash(journal->j_hash_table, cn) ;
+ if (!count_already_incd) {
+ get_bh(bh) ;
+ }
+ }
+ cn->next = NULL ;
+ cn->prev = journal->j_last ;
+ cn->bh = bh ;
+ if (journal->j_last) {
+ journal->j_last->next = cn ;
+ journal->j_last = cn ;
+ } else { /* list was empty: cn is both first and last */
+ journal->j_first = cn ;
+ journal->j_last = cn ;
+ }
+ return 0 ;
+}
+
+int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { /* drops one reference on th; calls do_journal_end() only when the count reaches zero. Returns -EIO if th has no t_trans_id */
+ if (!current->journal_info && th->t_refcount > 1)
+ reiserfs_warning (p_s_sb, "REISER-NESTING: th NULL, refcount %d",
+ th->t_refcount);
+
+ if (!th->t_trans_id) {
+ WARN_ON (1);
+ return -EIO;
+ }
+
+ th->t_refcount--;
+ if (th->t_refcount > 0) { /* still nested: the transaction stays open */
+ struct reiserfs_transaction_handle *cur_th = current->journal_info ;
+
+ /* we aren't allowed to close a nested transaction on a different
+ ** filesystem from the one in the task struct
+ */
+ if (cur_th->t_super != th->t_super)
+ BUG() ;
+
+ if (th != cur_th) { /* th is a separate copy: write its state back to the task's handle and invalidate the copy */
+ memcpy(current->journal_info, th, sizeof(*th));
+ th->t_trans_id = 0;
+ }
+ return 0;
+ } else {
+ return do_journal_end(th, p_s_sb, nblocks, 0) ;
+ }
+}
+
+/* removes blocknr's cnode from the current transaction, releasing it and decrementing
+** the j_len and j_len_alloc counters.
+** also files the removed buffer directly onto the clean list
+**
+** called by journal_mark_freed when a block has been deleted
+**
+** returns 1 if it cleaned and released the buffer (only done when already_cleaned is 0).
+** returns 0 otherwise, including when the block has no cnode with a bh in the current hash
+*/
+static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t blocknr, int already_cleaned) {
+ struct buffer_head *bh ;
+ struct reiserfs_journal_cnode *cn ;
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+ int ret = 0;
+
+ cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr) ;
+ if (!cn || !cn->bh) {
+ return ret ;
+ }
+ bh = cn->bh ;
+ if (cn->prev) { /* unlink cn from the doubly linked j_first/j_last list */
+ cn->prev->next = cn->next ;
+ }
+ if (cn->next) {
+ cn->next->prev = cn->prev ;
+ }
+ if (cn == journal->j_first) {
+ journal->j_first = cn->next ;
+ }
+ if (cn == journal->j_last) {
+ journal->j_last = cn->prev ;
+ }
+ if (bh) /* always true here: the !cn->bh case returned above */
+ remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, bh->b_blocknr, 0) ;
+ clear_buffer_journaled (bh); /* don't log this one */
+
+ if (!already_cleaned) {
+ clear_buffer_journal_dirty (bh);
+ clear_buffer_dirty(bh);
+ clear_buffer_journal_test (bh);
+ put_bh(bh) ;
+ if (atomic_read(&(bh->b_count)) < 0) {
+ reiserfs_warning (p_s_sb, "journal-1752: remove from trans, b_count < 0");
+ }
+ ret = 1 ;
+ }
+ journal->j_len-- ;
+ journal->j_len_alloc-- ;
+ free_cnode(p_s_sb, cn) ;
+ return ret ;
+}
+
+/*
+** for any cnode in a journal list, it can only be dirtied if all the
+** transactions that include it are committed to disk.
+** this checks through each transaction, and returns 1 if you are allowed to dirty,
+** and 0 if you aren't
+**
+** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log
+** blocks for a given transaction on disk
+**
+** the walk follows cn's hprev and hnext chains and stops at the first conflicting node
+*/
+static int can_dirty(struct reiserfs_journal_cnode *cn) {
+ struct super_block *sb = cn->sb;
+ b_blocknr_t blocknr = cn->blocknr ;
+ struct reiserfs_journal_cnode *cur = cn->hprev ;
+ int can_dirty = 1 ;
+
+ /* first test hprev. These are all newer than cn, so any node here
+ ** with the same block number and dev means this node can't be sent
+ ** to disk right now.
+ */
+ while(cur && can_dirty) {
+ if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
+ cur->blocknr == blocknr) {
+ can_dirty = 0 ;
+ }
+ cur = cur->hprev ;
+ }
+ /* then test hnext. These are all older than cn. As long as they
+ ** are committed to the log, it is safe to write cn to disk
+ */
+ cur = cn->hnext ;
+ while(cur && can_dirty) {
+ if (cur->jlist && cur->jlist->j_len > 0 &&
+ atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh &&
+ cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { /* same block in an older list that still has j_commit_left > 0 */
+ can_dirty = 0 ;
+ }
+ cur = cur->hnext ;
+ }
+ return can_dirty ;
+}
+
+/* syncs the commit blocks, but does not force the real buffers to disk
+** will wait until the current transaction is done/committed before returning
+**
+** returns the result of do_journal_end() called with COMMIT_NOW | WAIT
+*/
+int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+
+ BUG_ON (!th->t_trans_id);
+ /* you can't sync while nested, very, very bad */
+ if (th->t_refcount > 1) {
+ BUG() ;
+ }
+ if (journal->j_len == 0) { /* nothing logged yet: log the super block buffer so j_len is not 0 at commit */
+ reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+ journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+ }
+ return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ;
+}
+
+/*
+** writeback the pending async commits to disk
+**
+** p is the struct super_block pointer of the filesystem to flush
+*/
+static void flush_async_commits(void *p) {
+ struct super_block *p_s_sb = p;
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+ struct reiserfs_journal_list *jl;
+ struct list_head *entry;
+
+ lock_kernel();
+ if (!list_empty(&journal->j_journal_list)) {
+ /* last entry is the youngest, commit it and you get everything */
+ entry = journal->j_journal_list.prev;
+ jl = JOURNAL_LIST_ENTRY(entry);
+ flush_commit_list(p_s_sb, jl, 1);
+ }
+ unlock_kernel();
+ /*
+ * this is a little racy, but there's no harm in missing
+ * the filemap_fdatawrite
+ */
+ if (!atomic_read(&journal->j_async_throttle) && !reiserfs_is_journal_aborted (journal)) {
+ atomic_inc(&journal->j_async_throttle); /* counter is raised only for the duration of the writeback call */
+ filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
+ atomic_dec(&journal->j_async_throttle);
+ }
+}
+
+/*
+** flushes any old transactions to disk
+** ends the current transaction if it is too old
+**
+** returns 0 when j_journal_list is empty, otherwise the super block's s_dirt flag
+*/
+int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
+ time_t now ;
+ struct reiserfs_transaction_handle th ;
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+
+ now = get_seconds();
+ /* safety check so we don't flush while we are replaying the log during
+ * mount
+ */
+ if (list_empty(&journal->j_journal_list)) {
+ return 0 ;
+ }
+
+ /* check the current transaction. If there are no writers, and it is
+ * too old, finish it, and force the commit blocks to disk
+ */
+ if (atomic_read(&journal->j_wcount) <= 0 &&
+ journal->j_trans_start_time > 0 &&
+ journal->j_len > 0 &&
+ (now - journal->j_trans_start_time) > journal->j_max_trans_age)
+ {
+ if (!journal_join(&th, p_s_sb, 1)) { /* journal_join returned 0, i.e. the join succeeded */
+ reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+ journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+
+ /* we're only being called from kreiserfsd, it makes no sense to do
+ ** an async commit so that kreiserfsd can do it later
+ */
+ do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
+ }
+ }
+ return p_s_sb->s_dirt;
+}
+
+/*
+** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
+**
+** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
+** the writers are done. By the time it wakes up, the transaction it was called for has already ended, so it just
+** flushes the commit list and returns 0.
+**
+** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait.
+**
+** Note, we can't allow the journal_end to proceed while there are still writers in the log.
+**
+** Locking as visible here: do_journal_end calls lock_journal() before calling this function. Every path
+** that returns 0 calls unlock_journal() first; the path that returns 1 does not unlock.
+*/
+static int check_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,
+ unsigned long nblocks, int flags) {
+
+ time_t now ;
+ int flush = flags & FLUSH_ALL ;
+ int commit_now = flags & COMMIT_NOW ;
+ int wait_on_commit = flags & WAIT ;
+ struct reiserfs_journal_list *jl;
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+
+ BUG_ON (!th->t_trans_id);
+
+ if (th->t_trans_id != journal->j_trans_id) {
+ reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
+ th->t_trans_id, journal->j_trans_id);
+ }
+
+ journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged) ; /* give back the part of this handle's allocation that was never logged */
+ if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */
+ atomic_dec(&(journal->j_wcount)) ;
+ }
+
+ /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released
+ ** will be dealt with by next transaction that actually writes something, but should be taken
+ ** care of in this trans
+ */
+ if (journal->j_len == 0) {
+ BUG();
+ }
+ /* if wcount > 0, and we are called with flush or commit_now,
+ ** we wait on j_join_wait. We will wake up when the last writer has
+ ** finished the transaction, and started it on its way to the disk.
+ ** Then, we flush the commit or journal list, and just return 0
+ ** because the rest of journal end was already done for this transaction.
+ */
+ if (atomic_read(&(journal->j_wcount)) > 0) {
+ if (flush || commit_now) {
+ unsigned trans_id ;
+
+ jl = journal->j_current_jl;
+ trans_id = jl->j_trans_id;
+ if (wait_on_commit)
+ jl->j_state |= LIST_COMMIT_PENDING;
+ atomic_set(&(journal->j_jlock), 1) ;
+ if (flush) {
+ journal->j_next_full_flush = 1 ;
+ }
+ unlock_journal(p_s_sb) ;
+
+ /* sleep while the current transaction is still j_jlocked */
+ while(journal->j_trans_id == trans_id) {
+ if (atomic_read(&journal->j_jlock)) {
+ queue_log_writer(p_s_sb);
+ } else {
+ lock_journal(p_s_sb);
+ if (journal->j_trans_id == trans_id) { /* still the same transaction: set j_jlock again under the journal lock */
+ atomic_set(&(journal->j_jlock), 1) ;
+ }
+ unlock_journal(p_s_sb);
+ }
+ }
+ if (journal->j_trans_id == trans_id) {
+ BUG();
+ }
+ if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
+ wait_on_commit)
+ {
+ flush_commit_list(p_s_sb, jl, 1) ;
+ }
+ return 0 ;
+ }
+ unlock_journal(p_s_sb) ;
+ return 0 ;
+ }
+
+ /* deal with old transactions where we are the last writers */
+ now = get_seconds();
+ if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
+ commit_now = 1 ;
+ journal->j_next_async_flush = 1 ;
+ }
+ /* don't batch when someone is waiting on j_join_wait */
+ /* don't batch when syncing the commit or flushing the whole trans */
+ if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) && !flush && !commit_now &&
+ (journal->j_len < journal->j_max_batch) &&
+ journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) {
+ journal->j_bcount++ ; /* batching: leave the transaction open and count the batched end */
+ unlock_journal(p_s_sb) ;
+ return 0 ;
+ }
+
+ if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
+ reiserfs_panic(p_s_sb, "journal-003: journal_end: j_start (%ld) is too high\n", journal->j_start) ;
+ }
+ return 1 ;
+}
+
+/*
+** Does all the work that makes deleting blocks safe.
+** when deleting a block marked journal new (buffer_journal_new), just remove it from the current
+** transaction, clean its buffer_head and move on.
+**
+** otherwise:
+** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes
+** before this transaction has finished.
+**
+** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with
+** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash,
+** the block can't be reallocated yet.
+**
+** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
+**
+** always returns 0.
+*/
+int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, b_blocknr_t blocknr) {
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+ struct reiserfs_journal_cnode *cn = NULL ;
+ struct buffer_head *bh = NULL ;
+ struct reiserfs_list_bitmap *jb = NULL ;
+ int cleaned = 0 ;
+ BUG_ON (!th->t_trans_id);
+
+ cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr);
+ if (cn && cn->bh) {
+ bh = cn->bh ;
+ get_bh(bh) ; /* extra reference held across this function; dropped by the put_bh at the end */
+ }
+ /* if it is journal new, we just remove it from this transaction */
+ if (bh && buffer_journal_new(bh)) {
+ clear_buffer_journal_new (bh);
+ clear_prepared_bits(bh) ;
+ reiserfs_clean_and_file_buffer(bh) ;
+ cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
+ } else {
+ /* set the bit for this block in the journal bitmap for this transaction */
+ jb = journal->j_current_jl->j_list_bitmap;
+ if (!jb) {
+ reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
+ }
+ set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
+
+ /* Note, the entire while loop is not allowed to schedule. */
+
+ if (bh) {
+ clear_prepared_bits(bh) ;
+ reiserfs_clean_and_file_buffer(bh) ;
+ }
+ cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
+
+ /* find all older transactions with this block, make sure they don't try to write it out */
+ cn = get_journal_hash_dev(p_s_sb,journal->j_list_hash_table, blocknr) ;
+ while (cn) {
+ if (p_s_sb == cn->sb && blocknr == cn->blocknr) {
+ set_bit(BLOCK_FREED, &cn->state) ;
+ if (cn->bh) {
+ if (!cleaned) {
+ /* remove_from_transaction will brelse the buffer if it was
+ ** in the current trans
+ */
+ clear_buffer_journal_dirty (cn->bh);
+ clear_buffer_dirty(cn->bh);
+ clear_buffer_journal_test(cn->bh);
+ cleaned = 1 ; /* the cleaning and put_bh happen at most once across all matching cnodes */
+ put_bh(cn->bh) ;
+ if (atomic_read(&(cn->bh->b_count)) < 0) {
+ reiserfs_warning (p_s_sb, "journal-2138: cn->bh->b_count < 0");
+ }
+ }
+ if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */
+ atomic_dec(&(cn->jlist->j_nonzerolen)) ;
+ }
+ cn->bh = NULL ;
+ }
+ }
+ cn = cn->hnext ;
+ }
+ }
+
+ if (bh) {
+ put_bh(bh) ; /* get_hash grabs the buffer (the get_bh call at the top of this function) */
+ if (atomic_read(&(bh->b_count)) < 0) {
+ reiserfs_warning (p_s_sb, "journal-2165: bh->b_count < 0");
+ }
+ }
+ return 0 ;
+}
+
+void reiserfs_update_inode_transaction(struct inode *inode) { /* records the journal's current list and transaction id in the inode's reiserfs-private data */
+ struct reiserfs_journal *journal = SB_JOURNAL (inode->i_sb);
+ REISERFS_I(inode)->i_jl = journal->j_current_jl;
+ REISERFS_I(inode)->i_trans_id = journal->j_trans_id ;
+}
+
+/*
+ * returns -1 on error, 0 if no commits/barriers were done and 1
+ * if a transaction was actually committed and the barrier was done
+ *
+ * NOTE(review): the error values actually returned below are whatever
+ * journal_begin(), journal_end_sync() or journal->j_errno hold, not a
+ * literal -1 -- confirm what callers expect.
+ *
+ * id is the transaction id to commit and jl the journal list recorded
+ * for it; jl is replaced by j_current_jl when id is the running
+ * transaction.
+ */
+static int __commit_trans_jl(struct inode *inode, unsigned long id,
+ struct reiserfs_journal_list *jl)
+{
+ struct reiserfs_transaction_handle th ;
+ struct super_block *sb = inode->i_sb ;
+ struct reiserfs_journal *journal = SB_JOURNAL (sb);
+ int ret = 0;
+
+ /* is it from the current transaction, or from an unknown transaction? */
+ if (id == journal->j_trans_id) {
+ jl = journal->j_current_jl;
+ /* try to let other writers come in and grow this transaction */
+ let_transaction_grow(sb, id);
+ if (journal->j_trans_id != id) { /* the transaction ended while we waited: only its commit needs flushing */
+ goto flush_commit_only;
+ }
+
+ ret = journal_begin(&th, sb, 1) ;
+ if (ret)
+ return ret;
+
+ /* someone might have ended this transaction while we joined */
+ if (journal->j_trans_id != id) {
+ reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
+ journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
+ ret = journal_end(&th, sb, 1) ;
+ goto flush_commit_only;
+ }
+
+ ret = journal_end_sync(&th, sb, 1) ;
+ if (!ret)
+ ret = 1;
+
+ } else {
+ /* this gets tricky, we have to make sure the journal list in
+ * the inode still exists. We know the list is still around
+ * if we've got a larger transaction id than the oldest list
+ */
+flush_commit_only:
+ if (journal_list_still_alive(inode->i_sb, id)) {
+ /*
+ * we only set ret to 1 when we know for sure
+ * the barrier hasn't been started yet on the commit
+ * block.
+ */
+ if (atomic_read(&jl->j_commit_left) > 1)
+ ret = 1;
+ flush_commit_list(sb, jl, 1) ;
+ if (journal->j_errno)
+ ret = journal->j_errno;
+ }
+ }
+ /* otherwise the list is gone, and long since committed */
+ return ret;
+}
+
+int reiserfs_commit_for_inode(struct inode *inode) { /* commits the transaction recorded in the inode; returns __commit_trans_jl()'s result */
+ unsigned long id = REISERFS_I(inode)->i_trans_id;
+ struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
+
+ /* for the whole inode, assume unset id means it was
+ * changed in the current transaction. More conservative
+ */
+ if (!id || !jl) {
+ reiserfs_update_inode_transaction(inode) ;
+ id = REISERFS_I(inode)->i_trans_id;
+ /* jl will be updated in __commit_trans_jl, which switches to
+ * j_current_jl when id is the running transaction */
+ }
+
+ return __commit_trans_jl(inode, id, jl);
+}
+
+void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb,
+ struct buffer_head *bh) { /* undoes reiserfs_prepare_for_journal on bh: clears the prepared bit and re-dirties the buffer if allowed */
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+ PROC_INFO_INC( p_s_sb, journal.restore_prepared );
+ if (!bh) {
+ return ;
+ }
+ if (test_clear_buffer_journal_restore_dirty (bh) &&
+ buffer_journal_dirty(bh)) { /* restore_dirty was set when the prepare step cleared the dirty bit */
+ struct reiserfs_journal_cnode *cn;
+ cn = get_journal_hash_dev(p_s_sb,
+ journal->j_list_hash_table,
+ bh->b_blocknr);
+ if (cn && can_dirty(cn)) { /* only re-dirty when can_dirty() finds no conflicting cnode for this block */
+ set_buffer_journal_test (bh);
+ mark_buffer_dirty(bh);
+ }
+ }
+ clear_buffer_journal_prepared (bh);
+}
+
+extern struct tree_balance *cur_tb ;
+/*
+** before we can change a metadata block, we have to make sure it won't
+** be written to disk while we are altering it. So, we must:
+** clean it
+** wait on it.
+**
+** returns 0 without touching the buffer if it is already locked and wait is 0,
+** otherwise returns 1 after marking the buffer journal prepared.
+*/
+int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
+ struct buffer_head *bh, int wait) {
+ PROC_INFO_INC( p_s_sb, journal.prepare );
+
+ if (test_set_buffer_locked(bh)) {
+ if (!wait)
+ return 0;
+ lock_buffer(bh);
+ }
+ set_buffer_journal_prepared (bh);
+ if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { /* remember the cleared dirty bit so reiserfs_restore_prepared_buffer can put it back */
+ clear_buffer_journal_test (bh);
+ set_buffer_journal_restore_dirty (bh);
+ }
+ unlock_buffer(bh);
+ return 1;
+}
+
+static void flush_old_journal_lists(struct super_block *s) { /* flushes journal lists from the head of j_journal_list while they are older than 4 * JOURNAL_MAX_TRANS_AGE */
+ struct reiserfs_journal *journal = SB_JOURNAL (s);
+ struct reiserfs_journal_list *jl;
+ struct list_head *entry;
+ time_t now = get_seconds();
+
+ while(!list_empty(&journal->j_journal_list)) {
+ entry = journal->j_journal_list.next;
+ jl = JOURNAL_LIST_ENTRY(entry);
+ /* this check should always be run, to send old lists to disk */
+ if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
+ flush_used_journal_lists(s, jl);
+ } else { /* head of the list is recent enough: stop */
+ break;
+ }
+ }
+}
+
+/*
+** long and ugly. If flush, will not return until all commit
+** blocks and all real buffers in the trans are on disk.
+** If WAIT is set in flags (called no_async in the original comment), won't
+** return until all commit blocks are on disk.
+**
+** keep reading, there are comments as you go along
+**
+** If the journal is aborted, we just clean up. Things like flushing
+** journal lists, etc just won't happen.
+**
+** returns journal->j_errno. On return *th has been zeroed, except that
+** th->t_super is set back to p_s_sb.
+*/
+static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks,
+ int flags) {
+ struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+ struct reiserfs_journal_cnode *cn, *next, *jl_cn;
+ struct reiserfs_journal_cnode *last_cn = NULL;
+ struct reiserfs_journal_desc *desc ;
+ struct reiserfs_journal_commit *commit ;
+ struct buffer_head *c_bh ; /* commit bh */
+ struct buffer_head *d_bh ; /* desc bh */
+ int cur_write_start = 0 ; /* start index of current log write */
+ int old_start ;
+ int i ;
+ int flush = flags & FLUSH_ALL ;
+ int wait_on_commit = flags & WAIT ;
+ struct reiserfs_journal_list *jl, *temp_jl;
+ struct list_head *entry, *safe;
+ unsigned long jindex;
+ unsigned long commit_trans_id;
+ int trans_half;
+
+ BUG_ON (th->t_refcount > 1);
+ BUG_ON (!th->t_trans_id);
+
+ current->journal_info = th->t_handle_save; /* restore whatever handle the task had before this one */
+ reiserfs_check_lock_depth(p_s_sb, "journal end");
+ if (journal->j_len == 0) { /* nothing logged: log the super block buffer so j_len is not 0 below */
+ reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+ journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+ }
+
+ lock_journal(p_s_sb) ;
+ if (journal->j_next_full_flush) {
+ flags |= FLUSH_ALL ;
+ flush = 1 ;
+ }
+ if (journal->j_next_async_flush) {
+ flags |= COMMIT_NOW | WAIT;
+ wait_on_commit = 1;
+ }
+
+ /* the journal was locked above; check_journal_end unlocks it if it does not return 1
+ ** it tells us if we should continue with the journal_end, or just return
+ */
+ if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
+ p_s_sb->s_dirt = 1;
+ wake_queued_writers(p_s_sb);
+ reiserfs_async_progress_wait(p_s_sb);
+ goto out ;
+ }
+
+ /* check_journal_end might set these, check again */
+ if (journal->j_next_full_flush) {
+ flush = 1 ;
+ }
+
+ /*
+ ** j must wait means we have to flush the log blocks, and the real blocks for
+ ** this transaction
+ */
+ if (journal->j_must_wait > 0) {
+ flush = 1 ;
+ }
+
+#ifdef REISERFS_PREALLOCATE
+ /* quota ops might need to nest, setup the journal_info pointer for them */
+ current->journal_info = th ;
+ reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
+ * the transaction */
+ current->journal_info = th->t_handle_save ;
+#endif
+
+ /* setup description block */
+ d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start) ;
+ set_buffer_uptodate(d_bh);
+ desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ;
+ memset(d_bh->b_data, 0, d_bh->b_size) ;
+ memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ;
+ set_desc_trans_id(desc, journal->j_trans_id) ;
+
+ /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */
+ c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+ ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
+ commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
+ memset(c_bh->b_data, 0, c_bh->b_size) ;
+ set_commit_trans_id(commit, journal->j_trans_id) ;
+ set_buffer_uptodate(c_bh) ;
+
+ /* init this journal list */
+ jl = journal->j_current_jl;
+
+ /* we lock the commit before doing anything because
+ * we want to make sure nobody tries to run flush_commit_list until
+ * the new transaction is fully setup, and we've already flushed the
+ * ordered bh list
+ */
+ down(&jl->j_commit_lock);
+
+ /* save the transaction id in case we need to commit it later */
+ commit_trans_id = jl->j_trans_id;
+
+ atomic_set(&jl->j_older_commits_done, 0) ;
+ jl->j_trans_id = journal->j_trans_id ;
+ jl->j_timestamp = journal->j_trans_start_time ;
+ jl->j_commit_bh = c_bh ;
+ jl->j_start = journal->j_start ;
+ jl->j_len = journal->j_len ;
+ atomic_set(&jl->j_nonzerolen, journal->j_len) ;
+ atomic_set(&jl->j_commit_left, journal->j_len + 2);
+ jl->j_realblock = NULL ;
+
+ /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
+ ** for each real block, add it to the journal list hash,
+ ** copy into real block index array in the commit or desc block
+ */
+ trans_half = journal_trans_half(p_s_sb->s_blocksize);
+ for (i = 0, cn = journal->j_first ; cn ; cn = cn->next, i++) {
+ if (buffer_journaled (cn->bh)) {
+ jl_cn = get_cnode(p_s_sb) ;
+ if (!jl_cn) {
+ reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
+ }
+ if (i == 0) {
+ jl->j_realblock = jl_cn ;
+ }
+ jl_cn->prev = last_cn ;
+ jl_cn->next = NULL ;
+ if (last_cn) {
+ last_cn->next = jl_cn ;
+ }
+ last_cn = jl_cn ;
+ /* make sure the block we are trying to log is not a block
+ of journal or reserved area */
+
+ if (is_block_in_log_or_reserved_area(p_s_sb, cn->bh->b_blocknr)) {
+ reiserfs_panic(p_s_sb, "journal-2332: Trying to log block %lu, which is a log block\n", cn->bh->b_blocknr) ;
+ }
+ jl_cn->blocknr = cn->bh->b_blocknr ;
+ jl_cn->state = 0 ;
+ jl_cn->sb = p_s_sb;
+ jl_cn->bh = cn->bh ;
+ jl_cn->jlist = jl;
+ insert_journal_hash(journal->j_list_hash_table, jl_cn) ;
+ if (i < trans_half) { /* the first trans_half block numbers go in the desc block, the rest in the commit block */
+ desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
+ } else {
+ commit->j_realblock[i - trans_half] = cpu_to_le32(cn->bh->b_blocknr) ;
+ }
+ } else {
+ i-- ; /* cancels the loop's i++ so skipped buffers do not consume an index */
+ }
+ }
+ set_desc_trans_len(desc, journal->j_len) ;
+ set_desc_mount_id(desc, journal->j_mount_id) ;
+ set_desc_trans_id(desc, journal->j_trans_id) ;
+ set_commit_trans_len(commit, journal->j_len);
+
+ /* special check in case all buffers in the journal were marked for not logging */
+ if (journal->j_len == 0) {
+ BUG();
+ }
+
+ /* we're about to dirty all the log blocks, mark the description block
+ * dirty now too. Don't mark the commit block dirty until all the
+ * others are on disk
+ */
+ mark_buffer_dirty(d_bh);
+
+ /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
+ cur_write_start = journal->j_start ;
+ cn = journal->j_first ;
+ jindex = 1 ; /* start at one so we don't get the desc again */
+ while(cn) {
+ clear_buffer_journal_new (cn->bh);
+ /* copy all the real blocks into log area. dirty log blocks */
+ if (buffer_journaled (cn->bh)) {
+ struct buffer_head *tmp_bh ;
+ char *addr;
+ struct page *page;
+ tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+ ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
+ set_buffer_uptodate(tmp_bh);
+ page = cn->bh->b_page;
+ addr = kmap(page);
+ memcpy(tmp_bh->b_data, addr + offset_in_page(cn->bh->b_data),
+ cn->bh->b_size);
+ kunmap(page);
+ mark_buffer_dirty(tmp_bh);
+ jindex++ ;
+ set_buffer_journal_dirty (cn->bh);
+ clear_buffer_journaled (cn->bh);
+ } else {
+ /* JDirty cleared sometime during transaction. don't log this one */
+ reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!") ;
+ brelse(cn->bh) ;
+ }
+ next = cn->next ; /* read the link before cn is freed */
+ free_cnode(p_s_sb, cn) ;
+ cn = next ;
+ cond_resched();
+ }
+
+ /* we are done with both the c_bh and d_bh, but
+ ** c_bh must be written after all other commit blocks,
+ ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
+ */
+
+ journal->j_current_jl = alloc_journal_list(p_s_sb);
+
+ /* now it is safe to insert this transaction on the main list */
+ list_add_tail(&jl->j_list, &journal->j_journal_list);
+ list_add_tail(&jl->j_working_list, &journal->j_working_list);
+ journal->j_num_work_lists++;
+
+ /* reset journal values for the next transaction */
+ old_start = journal->j_start ;
+ journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb);
+ atomic_set(&(journal->j_wcount), 0) ;
+ journal->j_bcount = 0 ;
+ journal->j_last = NULL ;
+ journal->j_first = NULL ;
+ journal->j_len = 0 ;
+ journal->j_trans_start_time = 0 ;
+ journal->j_trans_id++ ;
+ journal->j_current_jl->j_trans_id = journal->j_trans_id;
+ journal->j_must_wait = 0 ;
+ journal->j_len_alloc = 0 ;
+ journal->j_next_full_flush = 0 ;
+ journal->j_next_async_flush = 0 ;
+ init_journal_hash(p_s_sb) ;
+
+ // make sure reiserfs_add_jh sees the new current_jl before we
+ // write out the tails
+ smp_mb();
+
+ /* tail conversion targets have to hit the disk before we end the
+ * transaction. Otherwise a later transaction might repack the tail
+ * before this transaction commits, leaving the data block unflushed and
+ * clean, if we crash before the later transaction commits, the data block
+ * is lost.
+ */
+ if (!list_empty(&jl->j_tail_bh_list)) {
+ unlock_kernel();
+ write_ordered_buffers(&journal->j_dirty_buffers_lock,
+ journal, jl, &jl->j_tail_bh_list);
+ lock_kernel();
+ }
+ if (!list_empty(&jl->j_tail_bh_list))
+ BUG();
+ up(&jl->j_commit_lock);
+
+ /* honor the flush wishes from the caller, simple commits can
+ ** be done outside the journal lock, they are done below
+ **
+ ** if we don't flush the commit list right now, we put it into
+ ** the work queue so the people waiting on the async progress work
+ ** queue don't wait for this proc to flush journal lists and such.
+ */
+ if (flush) {
+ flush_commit_list(p_s_sb, jl, 1) ;
+ flush_journal_list(p_s_sb, jl, 1) ;
+ } else if (!(jl->j_state & LIST_COMMIT_PENDING))
+ queue_delayed_work(commit_wq, &journal->j_work, HZ/10);
+
+
+ /* if the next transaction has any chance of wrapping, flush
+ ** transactions that might get overwritten. If any journal lists are very
+ ** old flush them as well.
+ */
+first_jl:
+ list_for_each_safe(entry, safe, &journal->j_journal_list) {
+ temp_jl = JOURNAL_LIST_ENTRY(entry);
+ if (journal->j_start <= temp_jl->j_start) {
+ if ((journal->j_start + journal->j_trans_max + 1) >=
+ temp_jl->j_start)
+ {
+ flush_used_journal_lists(p_s_sb, temp_jl);
+ goto first_jl; /* restart the scan from the head of the list after flushing */
+ } else if ((journal->j_start +
+ journal->j_trans_max + 1) <
+ SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+ {
+ /* if we don't cross into the next transaction and we don't
+ * wrap, there is no way we can overlap any later transactions
+ * break now
+ */
+ break;
+ }
+ } else if ((journal->j_start +
+ journal->j_trans_max + 1) >
+ SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+ {
+ if (((journal->j_start + journal->j_trans_max + 1) %
+ SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
+ {
+ flush_used_journal_lists(p_s_sb, temp_jl);
+ goto first_jl;
+ } else {
+ /* we don't overlap anything from our start to the end of the
+ * log, and our wrapped portion doesn't overlap anything at
+ * the start of the log. We can break
+ */
+ break;
+ }
+ }
+ }
+ flush_old_journal_lists(p_s_sb);
+
+ journal->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, journal->j_current_jl) ;
+
+ if (!(journal->j_current_jl->j_list_bitmap)) {
+ reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
+ }
+
+ atomic_set(&(journal->j_jlock), 0) ;
+ unlock_journal(p_s_sb) ;
+ /* wake up anybody waiting to join. */
+ clear_bit(J_WRITERS_QUEUED, &journal->j_state);
+ wake_up(&(journal->j_join_wait)) ;
+
+ if (!flush && wait_on_commit &&
+ journal_list_still_alive(p_s_sb, commit_trans_id)) {
+ flush_commit_list(p_s_sb, jl, 1) ;
+ }
+out:
+ reiserfs_check_lock_depth(p_s_sb, "journal end2");
+
+ memset (th, 0, sizeof (*th));
+ /* Re-set th->t_super, so we can properly keep track of how many
+ * persistent transactions there are. We need to do this so if this
+ * call is part of a failed restart_transaction, we can free it later */
+ th->t_super = p_s_sb;
+
+ return journal->j_errno;
+}
+
+static void /* marks the journal J_ABORTED and the super block MS_RDONLY; does nothing if already aborted */
+__reiserfs_journal_abort_hard (struct super_block *sb)
+{
+ struct reiserfs_journal *journal = SB_JOURNAL (sb);
+ if (test_bit (J_ABORTED, &journal->j_state))
+ return;
+
+ printk (KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n",
+ reiserfs_bdevname (sb));
+
+ sb->s_flags |= MS_RDONLY;
+ set_bit (J_ABORTED, &journal->j_state);
+
+#ifdef CONFIG_REISERFS_CHECK
+ dump_stack();
+#endif
+}
+
+static void /* records errno in j_errno (unless one is already recorded) and then aborts the journal */
+__reiserfs_journal_abort_soft (struct super_block *sb, int errno)
+{
+ struct reiserfs_journal *journal = SB_JOURNAL (sb);
+ if (test_bit (J_ABORTED, &journal->j_state))
+ return;
+
+ if (!journal->j_errno) /* keep the first error that was recorded */
+ journal->j_errno = errno;
+
+ __reiserfs_journal_abort_hard (sb);
+}
+
+void /* non-static entry point: forwards to __reiserfs_journal_abort_soft() */
+reiserfs_journal_abort (struct super_block *sb, int errno)
+{
+ return __reiserfs_journal_abort_soft (sb, errno);
+}
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
new file mode 100644
index 00000000000..2406608fc5c
--- /dev/null
+++ b/fs/reiserfs/lbalance.c
@@ -0,0 +1,1222 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/buffer_head.h>
+
+/* these are used in do_balan.c */
+
+/* leaf_move_items
+ leaf_shift_left
+ leaf_shift_right
+ leaf_delete_items
+ leaf_insert_into_buf
+ leaf_paste_in_buffer
+ leaf_cut_from_buffer
+ leaf_paste_entries
+ */
+
+
+/* Copy copy_count directory entries, beginning with entry number 'from', out of
+ directory item number item_num of the 'source' buffer into the buffer
+ described by dest_bi.
+
+ A new directory item with entry count 0 is inserted into dest (at position 0
+ for LAST_TO_FIRST, after the last item otherwise) when
+ - dest holds no items, or
+ - last_first is FIRST_TO_LAST and the key offset of the source item is
+ DOT_OFFSET, or
+ - last_first is LAST_TO_FIRST and comp_short_le_keys returns non-zero for
+ the source item key and the key of the first item of dest.
+ Otherwise room for the entries is pasted into the boundary item of dest
+ (its last item for FIRST_TO_LAST, its first item for LAST_TO_FIRST).
+ In both cases the entry heads and the records are then written with
+ leaf_paste_entries: behind the entries already present for FIRST_TO_LAST,
+ in front of them (position 0) for LAST_TO_FIRST. */
+static void leaf_copy_dir_entries (struct buffer_info * dest_bi, struct buffer_head * source,
+ int last_first, int item_num, int from, int copy_count)
+{
+ struct buffer_head * dest = dest_bi->bi_bh;
+ int item_num_in_dest; /* either the number of target item,
+ or if we must create a new item,
+ the number of the item we will
+ create it next to */
+ struct item_head * ih;
+ struct reiserfs_de_head * deh;
+ int copy_records_len; /* length of all records in item to be copied */
+ char * records;
+
+ ih = B_N_PITEM_HEAD (source, item_num);
+
+ RFALSE( !is_direntry_le_ih (ih), "vs-10000: item must be directory item");
+
+ /* total length of the records to be copied, and the address of the
+ first byte of the last of them (entry from + copy_count - 1) */
+ deh = B_I_DEH (source, ih);
+ if (copy_count) {
+ copy_records_len = (from ? deh_location( &(deh[from - 1]) ) :
+ ih_item_len(ih)) - deh_location( &(deh[from + copy_count - 1]));
+ records = source->b_data + ih_location(ih) +
+ deh_location( &(deh[from + copy_count - 1]));
+ } else {
+ copy_records_len = 0;
+ records = NULL;
+ }
+
+ /* when copy last to first, dest buffer can contain 0 items; -1 means
+ "dest is empty" (FIRST_TO_LAST yields -1 as well, via B_NR_ITEMS - 1) */
+ item_num_in_dest = (last_first == LAST_TO_FIRST) ? (( B_NR_ITEMS(dest) ) ? 0 : -1) : (B_NR_ITEMS(dest) - 1);
+
+ /* if there are no items in dest or the first/last item in dest is not item of the same directory */
+ if ( (item_num_in_dest == - 1) ||
+ (last_first == FIRST_TO_LAST && le_ih_k_offset (ih) == DOT_OFFSET) ||
+ (last_first == LAST_TO_FIRST && comp_short_le_keys/*COMP_SHORT_KEYS*/ (&ih->ih_key, B_N_PKEY (dest, item_num_in_dest)))) {
+ /* create new item in dest */
+ struct item_head new_ih;
+
+ /* form item header */
+ memcpy (&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
+ put_ih_version( &new_ih, KEY_FORMAT_3_5 );
+ /* calculate item len: one entry head per entry plus the records */
+ put_ih_item_len( &new_ih, DEH_SIZE * copy_count + copy_records_len );
+ put_ih_entry_count( &new_ih, 0 );
+
+ if (last_first == LAST_TO_FIRST) {
+ /* the key offset of the new item is the offset of the first copied entry */
+ if (from < I_ENTRY_COUNT(ih)) {
+ set_le_ih_k_offset( &new_ih, deh_offset( &(deh[from]) ) );
+ /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE);*/
+ } else {
+ /* no entries will be copied to this item in this function */
+ set_le_ih_k_offset (&new_ih, U32_MAX);
+ /* this item is not yet valid; its key type is set to TYPE_DIRENTRY just below so that it is still treated as a directory item */
+ }
+ set_le_key_k_type (KEY_FORMAT_3_5, &(new_ih.ih_key), TYPE_DIRENTRY);
+ }
+
+ /* insert item into dest buffer; the NULL body makes leaf_insert_into_buf zero-fill it */
+ leaf_insert_into_buf (dest_bi, (last_first == LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), &new_ih, NULL, 0);
+ } else {
+ /* prepare space for entries in the existing boundary item */
+ leaf_paste_in_buffer (dest_bi, (last_first==FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0, MAX_US_INT,
+ DEH_SIZE * copy_count + copy_records_len, records, 0
+ );
+ }
+
+ item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest)-1) : 0;
+
+ leaf_paste_entries (dest_bi->bi_bh, item_num_in_dest,
+ (last_first == FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD (dest, item_num_in_dest)) : 0,
+ copy_count, deh + from, records,
+ DEH_SIZE * copy_count + copy_records_len
+ );
+}
+
+
+/* Merge a boundary item of SOURCE into the neighbouring boundary item of DEST.
+
+ last_first == FIRST_TO_LAST: the first item of SOURCE (or part of it) is
+ appended to the last item of DEST.
+ last_first == LAST_TO_FIRST: the last item of SOURCE (or part of it) is
+ put in front of the first item of DEST, and the key of that DEST item is
+ adjusted accordingly.
+
+ bytes_or_entries is the number of entries (directory item) or of bytes
+ (any other item) to copy; -1 means the whole item.
+
+ Nothing is copied (see the return 0 below) when DEST is empty or when
+ op_is_left_mergeable rejects the key that is examined: the key of the
+ first item of SOURCE for FIRST_TO_LAST, the key of the first item of DEST
+ for LAST_TO_FIRST. */
+/* returns 1 if the boundary item was merged, else 0 */
+static int leaf_copy_boundary_item (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
+ int bytes_or_entries)
+{
+ struct buffer_head * dest = dest_bi->bi_bh;
+ int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */
+ struct item_head * ih; /* boundary item of SOURCE */
+ struct item_head * dih; /* boundary item of DEST */
+
+ dest_nr_item = B_NR_ITEMS(dest);
+
+ if ( last_first == FIRST_TO_LAST ) {
+ /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects
+ or of different types ) then there is no need to treat this item differently from the other items
+ that we copy, so we return */
+ ih = B_N_PITEM_HEAD (src, 0);
+ dih = B_N_PITEM_HEAD (dest, dest_nr_item - 1); /* only used once dest_nr_item != 0 is known */
+ if (!dest_nr_item || (!op_is_left_mergeable (&(ih->ih_key), src->b_size)))
+ /* there is nothing to merge */
+ return 0;
+
+ RFALSE( ! ih_item_len(ih), "vs-10010: item can not have empty length");
+
+ if ( is_direntry_le_ih (ih) ) {
+ if ( bytes_or_entries == -1 )
+ /* copy all entries to dest */
+ bytes_or_entries = ih_entry_count(ih);
+ leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, 0, 0, bytes_or_entries);
+ return 1;
+ }
+
+ /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST
+ part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header
+ */
+ if ( bytes_or_entries == -1 )
+ bytes_or_entries = ih_item_len(ih);
+
+#ifdef CONFIG_REISERFS_CHECK
+ else {
+ if (bytes_or_entries == ih_item_len(ih) && is_indirect_le_ih(ih))
+ if (get_ih_free_space (ih))
+ reiserfs_panic (NULL, "vs-10020: leaf_copy_boundary_item: "
+ "last unformatted node must be filled entirely (%h)",
+ ih);
+ }
+#endif
+
+ /* merge first item (or its part) of src buffer with the last
+ item of dest buffer. Both are of the same file */
+ leaf_paste_in_buffer (dest_bi,
+ dest_nr_item - 1, ih_item_len(dih), bytes_or_entries, B_I_PITEM(src,ih), 0
+ );
+
+ if (is_indirect_le_ih (dih)) {
+ /* the assertion is about dih, so dih is the item to print */
+ RFALSE( get_ih_free_space (dih),
+ "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
+ dih);
+ /* the whole source item was appended: dest inherits its free space */
+ if (bytes_or_entries == ih_item_len(ih))
+ set_ih_free_space (dih, get_ih_free_space (ih));
+ }
+
+ return 1;
+ }
+
+
+ /* copy boundary item to right (last_first == LAST_TO_FIRST) */
+
+ /* ( DEST is empty or last item of SOURCE and first item of DEST
+ are the items of different object or of different types )
+ */
+ src_nr_item = B_NR_ITEMS (src);
+ ih = B_N_PITEM_HEAD (src, src_nr_item - 1);
+ dih = B_N_PITEM_HEAD (dest, 0);
+
+ if (!dest_nr_item || !op_is_left_mergeable (&(dih->ih_key), src->b_size))
+ return 0;
+
+ if ( is_direntry_le_ih (ih)) {
+ if ( bytes_or_entries == -1 )
+ /* bytes_or_entries = entries number in last item body of SOURCE */
+ bytes_or_entries = ih_entry_count(ih);
+
+ /* copy the last bytes_or_entries entries of the item */
+ leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, src_nr_item - 1, ih_entry_count(ih) - bytes_or_entries, bytes_or_entries);
+ return 1;
+ }
+
+ /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST;
+ part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; change first item key of the DEST;
+ don't create new item header
+ */
+
+ RFALSE( is_indirect_le_ih(ih) && get_ih_free_space (ih),
+ "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
+ ih);
+
+ if ( bytes_or_entries == -1 ) {
+ /* bytes_or_entries = length of last item body of SOURCE */
+ bytes_or_entries = ih_item_len(ih);
+
+ RFALSE( le_ih_k_offset (dih) !=
+ le_ih_k_offset (ih) + op_bytes_number (ih, src->b_size),
+ "vs-10050: items %h and %h do not match", ih, dih);
+
+ /* change first item key of the DEST */
+ set_le_ih_k_offset (dih, le_ih_k_offset (ih));
+
+ /* item becomes non-mergeable */
+ /* or mergeable if left item was */
+ set_le_ih_k_type (dih, le_ih_k_type (ih));
+ } else {
+ /* merge to right only part of item */
+ RFALSE( ih_item_len(ih) <= bytes_or_entries,
+ "vs-10060: no so much bytes %lu (needed %lu)",
+ ( unsigned long )ih_item_len(ih), ( unsigned long )bytes_or_entries);
+
+ /* change first item key of the DEST: move its offset back by the amount of data put in front */
+ if ( is_direct_le_ih (dih) ) {
+ RFALSE( le_ih_k_offset (dih) <= (unsigned long)bytes_or_entries,
+ "vs-10070: dih %h, bytes_or_entries(%d)", dih, bytes_or_entries);
+ set_le_ih_k_offset (dih, le_ih_k_offset (dih) - bytes_or_entries);
+ } else {
+ RFALSE( le_ih_k_offset (dih) <=
+ (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
+ "vs-10080: dih %h, bytes_or_entries(%d)",
+ dih, (bytes_or_entries/UNFM_P_SIZE)*dest->b_size);
+ set_le_ih_k_offset (dih, le_ih_k_offset (dih) - ((bytes_or_entries / UNFM_P_SIZE) * dest->b_size));
+ }
+ }
+
+ /* paste the last bytes_or_entries bytes of the source item at the head of the first item of DEST */
+ leaf_paste_in_buffer (dest_bi, 0, 0, bytes_or_entries, B_I_PITEM(src,ih) + ih_item_len(ih) - bytes_or_entries, 0);
+ return 1;
+}
+
+
+/* copy cpy_num whole items (headers and bodies) from buffer src to buffer dest
+ * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest
+ * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest
+ *
+ * The item count and free space in the block head of dest are updated and
+ * dest is marked dirty.  When dest_bi->bi_parent is set, the dc_size of the
+ * disk_child at dest_bi->bi_position grows by the copied amount and the
+ * parent is marked dirty too.  Nothing is done when cpy_num is 0.
+ */
+static void leaf_copy_items_entirely (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
+ int first, int cpy_num)
+{
+ struct buffer_head * dest;
+ int nr, free_space;
+ int dest_before;
+ int last_loc, last_inserted_loc, location;
+ int i, j;
+ struct block_head * blkh;
+ struct item_head * ih;
+
+ RFALSE( last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
+ "vs-10090: bad last_first parameter %d", last_first);
+ RFALSE( B_NR_ITEMS (src) - first < cpy_num,
+ "vs-10100: too few items in source %d, required %d from %d",
+ B_NR_ITEMS(src), cpy_num, first);
+ RFALSE( cpy_num < 0, "vs-10110: can not copy negative amount of items");
+ RFALSE( ! dest_bi, "vs-10120: destination buffer info is not defined");
+
+ dest = dest_bi->bi_bh;
+
+ RFALSE( ! dest, "vs-10130: destination buffer is not defined");
+
+ if (cpy_num == 0)
+ return;
+
+ blkh = B_BLK_HEAD(dest);
+ nr = blkh_nr_item( blkh );
+ free_space = blkh_free_space(blkh);
+
+ /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */
+ dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
+
+ /* location of head of first new item */
+ ih = B_N_PITEM_HEAD (dest, dest_before);
+
+ RFALSE( blkh_free_space(blkh) < cpy_num * IH_SIZE,
+ "vs-10140: not enough free space for headers %d (needed %d)",
+ B_FREE_SPACE (dest), cpy_num * IH_SIZE);
+
+ /* prepare space for headers */
+ memmove (ih + cpy_num, ih, (nr-dest_before) * IH_SIZE);
+
+ /* copy item headers */
+ memcpy (ih, B_N_PITEM_HEAD (src, first), cpy_num * IH_SIZE);
+
+ free_space -= (IH_SIZE * cpy_num);
+ set_blkh_free_space( blkh, free_space );
+
+ /* location of unmovable item; j keeps it, 'location' walks down from it
+ while the locations of the new and of the shifted items are rewritten */
+ j = location = (dest_before == 0) ? dest->b_size : ih_location(ih-1);
+ for (i = dest_before; i < nr + cpy_num; i ++) {
+ location -= ih_item_len( ih + i - dest_before );
+ put_ih_location( ih + i - dest_before, location );
+ }
+
+ /* prepare space for items */
+ last_loc = ih_location( &(ih[nr+cpy_num-1-dest_before]) );
+ last_inserted_loc = ih_location( &(ih[cpy_num-1]) );
+
+ /* check free space; j - last_inserted_loc is the total body size of the copied items */
+ RFALSE( free_space < j - last_inserted_loc,
+ "vs-10150: not enough free space for items %d (needed %d)",
+ free_space, j - last_inserted_loc);
+
+ memmove (dest->b_data + last_loc,
+ dest->b_data + last_loc + j - last_inserted_loc,
+ last_inserted_loc - last_loc);
+
+ /* copy items */
+ memcpy (dest->b_data + last_inserted_loc, B_N_PITEM(src,(first + cpy_num - 1)),
+ j - last_inserted_loc);
+
+ /* sizes, item number */
+ set_blkh_nr_item( blkh, nr + cpy_num );
+ set_blkh_free_space( blkh, free_space - (j - last_inserted_loc) );
+
+ do_balance_mark_leaf_dirty (dest_bi->tb, dest, 0);
+
+ if (dest_bi->bi_parent) {
+ struct disk_child *t_dc;
+ t_dc = B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position);
+ RFALSE( dc_block_number(t_dc) != dest->b_blocknr,
+ "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
+ ( long unsigned ) dest->b_blocknr,
+ ( long unsigned ) dc_block_number(t_dc));
+ put_dc_size( t_dc, dc_size(t_dc) + (j - last_inserted_loc + IH_SIZE * cpy_num ) );
+
+ do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent, 0);
+ }
+}
+
+
+/* This function splits the (liquid) item into two items (useful when
+ shifting part of an item into another node.)
+
+ Part of item number item_num of src is copied into dest; src itself is not
+ written here.  cpy_bytes is a number of entries for a directory item and a
+ number of bytes for any other item; it must not be -1.
+
+ FIRST_TO_LAST: the leading part of the item goes to the end of dest.
+ LAST_TO_FIRST: the trailing part of the item goes to the beginning of dest.
+
+ Directory items are handed to leaf_copy_dir_entries.  For other items a new
+ item header n_ih is built and inserted with leaf_insert_into_buf. */
+static void leaf_item_bottle (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
+ int item_num, int cpy_bytes)
+{
+ struct buffer_head * dest = dest_bi->bi_bh;
+ struct item_head * ih;
+
+ RFALSE( cpy_bytes == -1, "vs-10170: bytes == - 1 means: do not split item");
+
+ if ( last_first == FIRST_TO_LAST ) {
+ /* directory item: copy its first cpy_bytes entries */
+ if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(src,item_num)))
+ leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes);
+ else {
+ struct item_head n_ih;
+
+ /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST
+ part defined by 'cpy_bytes'; create new item header as a copy of the
+ old one with the length set to cpy_bytes;
+ n_ih = new item_header;
+ */
+ memcpy (&n_ih, ih, IH_SIZE);
+ put_ih_item_len( &n_ih, cpy_bytes );
+ if (is_indirect_le_ih (ih)) {
+ RFALSE( cpy_bytes == ih_item_len(ih) && get_ih_free_space(ih),
+ "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
+ ( long unsigned ) get_ih_free_space (ih));
+ set_ih_free_space (&n_ih, 0);
+ }
+
+ RFALSE( op_is_left_mergeable (&(ih->ih_key), src->b_size),
+ "vs-10190: bad mergeability of item %h", ih);
+ n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
+ leaf_insert_into_buf (dest_bi, B_NR_ITEMS(dest), &n_ih, B_N_PITEM (src, item_num), 0);
+ }
+ } else {
+ /* directory item: copy its last cpy_bytes entries */
+ if (is_direntry_le_ih(ih = B_N_PITEM_HEAD (src, item_num)))
+ leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, cpy_bytes);
+ else {
+ struct item_head n_ih;
+
+ /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST
+ part defined by 'cpy_bytes'; create new item header: only the short key
+ is copied, offset, type, free space and length are set below;
+ n_ih = new item_header;
+ */
+ memcpy (&n_ih, ih, SHORT_KEY_SIZE);
+
+ n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
+
+ if (is_direct_le_ih (ih)) {
+ set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + ih_item_len(ih) - cpy_bytes);
+ set_le_ih_k_type (&n_ih, TYPE_DIRECT);
+ set_ih_free_space (&n_ih, MAX_US_INT);
+ } else {
+ /* indirect item: the offset advances by dest->b_size for each
+ UNFM_P_SIZE bytes that stay behind in the source item */
+ RFALSE( !cpy_bytes && get_ih_free_space (ih),
+ "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
+ set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + (ih_item_len(ih) - cpy_bytes) / UNFM_P_SIZE * dest->b_size);
+ set_le_ih_k_type (&n_ih, TYPE_INDIRECT);
+ set_ih_free_space (&n_ih, get_ih_free_space (ih));
+ }
+
+ /* set item length */
+ put_ih_item_len( &n_ih, cpy_bytes );
+
+ n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
+
+ leaf_insert_into_buf (dest_bi, 0, &n_ih, B_N_PITEM(src,item_num) + ih_item_len(ih) - cpy_bytes, 0);
+ }
+ }
+}
+
+
+/* Copy cpy_num items from SOURCE to the buffer described by dest_bi.
+ * FIRST_TO_LAST takes them from the head of SOURCE and adds them to the tail
+ * of DEST, any other last_first takes them from the tail of SOURCE and adds
+ * them to the head of DEST.
+ *
+ * cpy_bytes == -1: all cpy_num items are copied whole.
+ * cpy_bytes != -1: cpy_num-1 items are copied whole and, of the remaining
+ * item, cpy_bytes bytes (regular item) or cpy_bytes directory entries
+ * (directory item) are copied.
+ *
+ * The boundary item is first offered to leaf_copy_boundary_item so that it
+ * can be merged into the neighbouring item of DEST.  The value returned is
+ * what that call returned, or 0 when cpy_num is 0.
+ */
+static int leaf_copy_items (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, int cpy_num,
+			    int cpy_bytes)
+{
+	struct buffer_head * dest;
+	int boundary_bytes, merged, start, src_total;
+
+	dest = dest_bi->bi_bh;
+	RFALSE( !dest || !src, "vs-10210: !dest || !src");
+	RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
+		"vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
+	RFALSE( B_NR_ITEMS(src) < cpy_num,
+		"vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), cpy_num);
+	RFALSE( cpy_num < 0,"vs-10240: cpy_num < 0 (%d)", cpy_num);
+
+	if ( !cpy_num )
+		return 0;
+
+	/* the boundary item is the partial one only when it is the sole item */
+	boundary_bytes = (cpy_num == 1) ? cpy_bytes : -1;
+
+	if ( last_first != FIRST_TO_LAST ) {
+		/* copy items to right */
+		src_total = B_NR_ITEMS (src);
+
+		/* try to merge the last item of SOURCE into the first item of DEST */
+		merged = leaf_copy_boundary_item (dest_bi, src, LAST_TO_FIRST, boundary_bytes);
+
+		cpy_num -= merged;
+		if ( !cpy_num )
+			return merged;
+
+		/* first of the items that are still to be copied */
+		start = src_total - cpy_num - merged;
+		if ( cpy_bytes != -1 ) {
+			/* whole items behind 'start' first, then the tail part of item 'start' */
+			leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, start+1, cpy_num-1);
+			leaf_item_bottle (dest_bi, src, LAST_TO_FIRST, start, cpy_bytes);
+		} else {
+			leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, start, cpy_num);
+		}
+		return merged;
+	}
+
+	/* copy items to left */
+
+	/* try to merge the first item of SOURCE into the last item of DEST */
+	merged = leaf_copy_boundary_item (dest_bi, src, FIRST_TO_LAST, boundary_bytes);
+
+	cpy_num -= merged;
+	if ( !cpy_num )
+		return merged;
+
+	/* skip the item that has been merged already */
+	start = merged;
+	if ( cpy_bytes != -1 ) {
+		/* whole items first, then the head part of the item that follows them */
+		leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, start, cpy_num-1);
+		leaf_item_bottle (dest_bi, src, FIRST_TO_LAST, cpy_num+start-1, cpy_bytes);
+	} else {
+		leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, start, cpy_num);
+	}
+	return merged;
+}
+
+
+/* There are several kinds of copying: from S[0] to L[0], from S[0] to R[0],
+ * from R[0] to L[0], from L[0] to R[0] and from S[0] to a new node.  For the
+ * kind selected by shift_mode this fills in buffer, parent and position in
+ * the parent of both *src_bi and *dest_bi, and stores the copy direction in
+ * *first_last.  An unknown shift_mode ends in reiserfs_panic.
+ */
+static void leaf_define_dest_src_infos (int shift_mode, struct tree_balance * tb, struct buffer_info * dest_bi,
+					struct buffer_info * src_bi, int * first_last,
+					struct buffer_head * Snew)
+{
+	memset (dest_bi, 0, sizeof (struct buffer_info));
+	memset (src_bi, 0, sizeof (struct buffer_info));
+
+	/* source buffer, its parent and its position */
+	switch (shift_mode) {
+	case LEAF_FROM_S_TO_L:		/* it is used in leaf_shift_left */
+	case LEAF_FROM_S_TO_R:		/* it is used in leaf_shift_right */
+	case LEAF_FROM_S_TO_SNEW:
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path);
+		src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
+		src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0);
+		break;
+
+	case LEAF_FROM_R_TO_L:		/* it is used in balance_leaf_when_delete */
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->R[0];
+		src_bi->bi_parent = tb->FR[0];
+		src_bi->bi_position = get_right_neighbor_position (tb, 0);
+		break;
+
+	case LEAF_FROM_L_TO_R:		/* it is used in balance_leaf_when_delete */
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->L[0];
+		src_bi->bi_parent = tb->FL[0];
+		src_bi->bi_position = get_left_neighbor_position (tb, 0);
+		break;
+
+	default:
+		reiserfs_panic (NULL, "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", shift_mode);
+	}
+
+	/* destination buffer, its parent, its position and the direction */
+	switch (shift_mode) {
+	case LEAF_FROM_S_TO_L:
+	case LEAF_FROM_R_TO_L:
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[0];
+		dest_bi->bi_parent = tb->FL[0];
+		dest_bi->bi_position = get_left_neighbor_position (tb, 0);
+		*first_last = FIRST_TO_LAST;
+		break;
+
+	case LEAF_FROM_S_TO_R:
+	case LEAF_FROM_L_TO_R:
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[0];
+		dest_bi->bi_parent = tb->FR[0];
+		dest_bi->bi_position = get_right_neighbor_position (tb, 0);
+		*first_last = LAST_TO_FIRST;
+		break;
+
+	case LEAF_FROM_S_TO_SNEW:
+		/* the new node has no parent yet */
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = Snew;
+		dest_bi->bi_parent = NULL;
+		dest_bi->bi_position = 0;
+		*first_last = LAST_TO_FIRST;
+		break;
+	}
+
+	RFALSE( src_bi->bi_bh == 0 || dest_bi->bi_bh == 0,
+		"vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
+		shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
+}
+
+
+
+
+/* Move mov_num items to a neighbor (or to Snew): they are copied to the
+ * destination selected by shift_mode and then deleted from the source.
+ * When mov_bytes != -1 only mov_bytes of the boundary item are moved.
+ * Returns what leaf_copy_items returned.
+ */
+int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew)
+{
+	struct buffer_info dest_bi, src_bi;
+	int first_last;
+	int copied;
+	int del_from;
+
+	leaf_define_dest_src_infos (shift_mode, tb, &dest_bi, &src_bi, &first_last, Snew);
+
+	copied = leaf_copy_items (&dest_bi, src_bi.bi_bh, first_last, mov_num, mov_bytes);
+
+	/* the copied items sit at the head (FIRST_TO_LAST) or at the tail of the source */
+	if (first_last == FIRST_TO_LAST)
+		del_from = 0;
+	else
+		del_from = B_NR_ITEMS(src_bi.bi_bh) - mov_num;
+
+	leaf_delete_items (&src_bi, first_last, del_from, mov_num, mov_bytes);
+
+	return copied;
+}
+
+
+/* Shift shift_num items (and only shift_bytes of the last shifted item if
+ * shift_bytes != -1) from S[0] to L[0], then replace the delimiting key.
+ * Returns the value of leaf_move_items.
+ */
+int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes)
+{
+	struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path);
+	int moved;
+
+	/* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */
+	moved = leaf_move_items (LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
+
+	/* nothing was requested, the delimiting key stays as it is */
+	if ( !shift_num )
+		return moved;
+
+	if (B_NR_ITEMS (S0) != 0) {
+		/* replace lkey in CFL[0] by 0-th key from S[0]; */
+		replace_key (tb, tb->CFL[0], tb->lkey[0], S0, 0);
+
+		RFALSE( (shift_bytes != -1 &&
+			 !(is_direntry_le_ih (B_N_PITEM_HEAD (S0, 0))
+			   && !I_ENTRY_COUNT (B_N_PITEM_HEAD (S0, 0)))) &&
+			(!op_is_left_mergeable (B_N_PKEY (S0, 0), S0->b_size)),
+			"vs-10280: item must be mergeable");
+
+		return moved;
+	}
+
+	/* number of items in S[0] == 0 */
+	RFALSE( shift_bytes != -1,
+		"vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
+		shift_bytes);
+#ifdef CONFIG_REISERFS_CHECK
+	if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
+		print_cur_tb ("vs-10275");
+		reiserfs_panic (tb->tb_sb, "vs-10275: leaf_shift_left: balance condition corrupted (%c)", tb->tb_mode);
+	}
+#endif
+
+	if (PATH_H_POSITION (tb->tb_path, 1) == 0)
+		replace_key (tb, tb->CFL[0], tb->lkey[0], PATH_H_PPARENT (tb->tb_path, 0), 0);
+
+	return moved;
+}
+
+
+
+
+
+/* CLEANING STOPPED HERE */
+
+
+
+
+/* Shift shift_num items (only shift_bytes of the boundary item when
+ * shift_bytes != -1) from S[0] to the right neighbor R[0], then replace the
+ * delimiting key.  Returns the value of leaf_move_items.
+ */
+int leaf_shift_right (struct tree_balance * tb, int shift_num, int shift_bytes)
+{
+	int moved;
+
+	/* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */
+	moved = leaf_move_items (LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
+
+	/* replace rkey in CFR[0] by the 0-th key from R[0] */
+	if (shift_num != 0)
+		replace_key (tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+	return moved;
+}
+
+
+
+static void leaf_delete_items_entirely (struct buffer_info * bi,
+ int first, int del_num);
+/* Delete del_num items from the buffer described by cur_bi.
+
+ If del_bytes == -1, starting from position 'first' delete del_num items in whole.
+ If not:
+ If last_first == FIRST_TO_LAST. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of
+ the first item of the buffer. Part defined by del_bytes. Don't delete first item header
+ Otherwise. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of
+ the last item . Part defined by del_bytes. Don't delete last item header.
+
+ For a directory item del_bytes counts entries, for other items bytes.
+ When every item of the buffer is deleted in whole the node is simply made
+ empty.  Nothing is done when del_num is 0.
+*/
+void leaf_delete_items (struct buffer_info * cur_bi, int last_first,
+ int first, int del_num, int del_bytes)
+{
+ struct buffer_head * bh = cur_bi->bi_bh;
+ int item_amount;
+
+ /* bh has to be checked before B_NR_ITEMS dereferences it */
+ RFALSE( !bh, "10155: bh is not defined");
+ item_amount = B_NR_ITEMS (bh);
+
+ RFALSE( del_num < 0, "10160: del_num can not be < 0. del_num==%d", del_num);
+ RFALSE( first < 0 || first + del_num > item_amount,
+ "10165: invalid number of first item to be deleted (%d) or "
+ "no so much items (%d) to delete (only %d)",
+ first, first + del_num, item_amount);
+
+ if ( del_num == 0 )
+ return;
+
+ if ( first == 0 && del_num == item_amount && del_bytes == -1 ) {
+ make_empty_node (cur_bi);
+ do_balance_mark_leaf_dirty (cur_bi->tb, bh, 0);
+ return;
+ }
+
+ if ( del_bytes == -1 )
+ /* delete del_num items beginning from item in position first */
+ leaf_delete_items_entirely (cur_bi, first, del_num);
+ else {
+ if ( last_first == FIRST_TO_LAST ) {
+ /* delete del_num-1 items beginning from item in position first */
+ leaf_delete_items_entirely (cur_bi, first, del_num-1);
+
+ /* delete the part of the first item of the bh
+ do not delete item header
+ NOTE(review): item 0 is cut, not item 'first'; the caller visible
+ here (leaf_move_items) passes first == 0 for FIRST_TO_LAST
+ */
+ leaf_cut_from_buffer (cur_bi, 0, 0, del_bytes);
+ } else {
+ struct item_head * ih;
+ int len;
+
+ /* delete del_num-1 items beginning from item in position first+1 */
+ leaf_delete_items_entirely (cur_bi, first+1, del_num-1);
+
+ if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh)-1))) /* the last item is directory */
+ /* len = numbers of directory entries in this item */
+ len = ih_entry_count(ih);
+ else
+ /* len = body len of item */
+ len = ih_item_len(ih);
+
+ /* delete the part of the last item of the bh
+ do not delete item header
+ */
+ leaf_cut_from_buffer (cur_bi, B_NR_ITEMS(bh)-1, len - del_bytes, del_bytes);
+ }
+ }
+}
+
+
+/* insert item into the leaf node in position before
+
+ bi - buffer info of the leaf; bi->bi_bh is the leaf itself
+ before - number of the item the new item is inserted in front of
+ inserted_item_ih - header of the new item; its item length gives the size
+ of the body
+ inserted_item_body - data of the body, or NULL to get a zero-filled body
+ zeros_number - number of leading bytes of the body that are zeroed; only
+ the remaining (item length - zeros_number) bytes are taken from
+ inserted_item_body
+
+ The bodies and headers of the items before..nr-1 are moved to make room,
+ the locations of the new and of the moved items are rewritten, item count
+ and free space of the block head are updated and the leaf is marked dirty.
+ When bi->bi_parent is set, the dc_size of the disk_child at bi->bi_position
+ grows by IH_SIZE plus the item length and the parent is marked dirty. */
+void leaf_insert_into_buf (struct buffer_info * bi, int before,
+ struct item_head * inserted_item_ih,
+ const char * inserted_item_body,
+ int zeros_number)
+{
+ struct buffer_head * bh = bi->bi_bh;
+ int nr, free_space;
+ struct block_head * blkh;
+ struct item_head * ih;
+ int i;
+ int last_loc, unmoved_loc;
+ char * to;
+
+
+ blkh = B_BLK_HEAD(bh);
+ nr = blkh_nr_item(blkh);
+ free_space = blkh_free_space( blkh );
+
+ /* check free space */
+ RFALSE( free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
+ "vs-10170: not enough free space in block %z, new item %h",
+ bh, inserted_item_ih);
+ RFALSE( zeros_number > ih_item_len(inserted_item_ih),
+ "vs-10172: zero number == %d, item length == %d",
+ zeros_number, ih_item_len(inserted_item_ih));
+
+
+ /* get the item the new item must be inserted before */
+ ih = B_N_PITEM_HEAD (bh, before);
+
+ /* prepare space for the body of new item: last_loc is the location of the
+ last item body, unmoved_loc the location of the body of item before-1;
+ both are the block size when there is no such item */
+ last_loc = nr ? ih_location( &(ih[nr - before - 1]) ) : bh->b_size;
+ unmoved_loc = before ? ih_location( ih-1 ) : bh->b_size;
+
+
+ memmove (bh->b_data + last_loc - ih_item_len(inserted_item_ih),
+ bh->b_data + last_loc, unmoved_loc - last_loc);
+
+ to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
+ memset (to, 0, zeros_number);
+ to += zeros_number;
+
+ /* copy body to prepared space, or zero it when no body was given */
+ if (inserted_item_body)
+ memmove (to, inserted_item_body, ih_item_len(inserted_item_ih) - zeros_number);
+ else
+ memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
+
+ /* insert item header */
+ memmove (ih + 1, ih, IH_SIZE * (nr - before));
+ memmove (ih, inserted_item_ih, IH_SIZE);
+
+ /* change locations of the new item and of all items behind it */
+ for (i = before; i < nr + 1; i ++)
+ {
+ unmoved_loc -= ih_item_len( &(ih[i-before]));
+ put_ih_location( &(ih[i-before]), unmoved_loc );
+ }
+
+ /* sizes, free space, item number */
+ set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 );
+ set_blkh_free_space( blkh,
+ free_space - (IH_SIZE + ih_item_len(inserted_item_ih ) ) );
+ do_balance_mark_leaf_dirty (bi->tb, bh, 1);
+
+ if (bi->bi_parent) {
+ struct disk_child *t_dc;
+ t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
+ put_dc_size( t_dc, dc_size(t_dc) + (IH_SIZE + ih_item_len(inserted_item_ih)));
+ do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
+ }
+}
+
+
+/* paste paste_size bytes to affected_item_num-th item.
+ When item is a directory, this only prepare space for new entries
+
+ The bodies of the items affected_item_num..nr-1 are moved by paste_size
+ and their locations are decreased by paste_size, which leaves a gap of
+ paste_size bytes at the end of the affected item.
+
+ body == NULL: the gap is filled with zeros.
+ body != NULL and the item is not a directory item:
+ pos_in_item == 0 - the old body is moved behind the gap and the new data
+ goes to the head of the item;
+ pos_in_item != 0 - the new data goes into the gap at the end of the item,
+ whatever the exact value of pos_in_item is.
+ In both cases the first zeros_number bytes of the new data are zeros and
+ the remaining (paste_size - zeros_number) bytes come from body.
+ body != NULL and the item is a directory item: no data is written here.
+
+ Item length, free space of the block and, when bi->bi_parent is set, the
+ dc_size of the disk_child at bi->bi_position are adjusted by paste_size;
+ the leaf and the parent are marked dirty. */
+void leaf_paste_in_buffer (struct buffer_info * bi, int affected_item_num,
+ int pos_in_item, int paste_size,
+ const char * body,
+ int zeros_number)
+{
+ struct buffer_head * bh = bi->bi_bh;
+ int nr, free_space;
+ struct block_head * blkh;
+ struct item_head * ih;
+ int i;
+ int last_loc, unmoved_loc;
+
+ blkh = B_BLK_HEAD(bh);
+ nr = blkh_nr_item(blkh);
+ free_space = blkh_free_space(blkh);
+
+
+ /* check free space */
+ RFALSE( free_space < paste_size,
+ "vs-10175: not enough free space: needed %d, available %d",
+ paste_size, free_space);
+
+#ifdef CONFIG_REISERFS_CHECK
+ if (zeros_number > paste_size) {
+ print_cur_tb ("10177");
+ reiserfs_panic ( NULL, "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d",
+ zeros_number, paste_size);
+ }
+#endif /* CONFIG_REISERFS_CHECK */
+
+
+ /* item to be appended */
+ ih = B_N_PITEM_HEAD(bh, affected_item_num);
+
+ last_loc = ih_location( &(ih[nr - affected_item_num - 1]) ); /* location of the last item body */
+ unmoved_loc = affected_item_num ? ih_location( ih-1 ) : bh->b_size; /* end of the affected item */
+
+ /* prepare space */
+ memmove (bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
+ unmoved_loc - last_loc);
+
+
+ /* change locations */
+ for (i = affected_item_num; i < nr; i ++)
+ put_ih_location( &(ih[i-affected_item_num]),
+ ih_location( &(ih[i-affected_item_num])) - paste_size );
+
+ if ( body ) {
+ if (!is_direntry_le_ih (ih)) {
+ if (!pos_in_item) {
+ /* shift data to right */
+ memmove (bh->b_data + ih_location(ih) + paste_size,
+ bh->b_data + ih_location(ih), ih_item_len(ih));
+ /* paste data in the head of item */
+ memset (bh->b_data + ih_location(ih), 0, zeros_number);
+ memcpy (bh->b_data + ih_location(ih) + zeros_number, body, paste_size - zeros_number);
+ } else {
+ memset (bh->b_data + unmoved_loc - paste_size, 0, zeros_number);
+ memcpy (bh->b_data + unmoved_loc - paste_size + zeros_number, body, paste_size - zeros_number);
+ }
+ }
+ }
+ else
+ memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
+
+ put_ih_item_len( ih, ih_item_len(ih) + paste_size );
+
+ /* change free space */
+ set_blkh_free_space( blkh, free_space - paste_size );
+
+ do_balance_mark_leaf_dirty (bi->tb, bh, 0);
+
+ if (bi->bi_parent) {
+ struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
+ put_dc_size( t_dc, dc_size(t_dc) + paste_size );
+ do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
+ }
+}
+
+
+/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
+ does not have free space, so it moves DEHs and remaining records as
+ necessary. Return value is size of removed part of directory item
+ in bytes.
+
+ bh - buffer that holds the item
+ ih - header of the directory item; only its entry count is updated
+ here, the item length is left untouched
+ from - number of the first entry to remove
+ del_count - number of entries to remove; 0 returns 0 without any change
+
+ The returned size is DEH_SIZE * del_count plus the length of the removed
+ records. */
+static int leaf_cut_entries (
+ struct buffer_head * bh,
+ struct item_head * ih,
+ int from,
+ int del_count
+ )
+{
+ char * item;
+ struct reiserfs_de_head * deh;
+ int prev_record_offset; /* offset of record, that is (from-1)th */
+ char * prev_record; /* address of that record inside the buffer */
+ int cut_records_len; /* length of all removed records */
+ int i;
+
+
+ /* make sure, that item is directory and there are enough entries to
+ remove */
+ RFALSE( !is_direntry_le_ih (ih), "10180: item is not directory item");
+ RFALSE( I_ENTRY_COUNT(ih) < from + del_count,
+ "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d",
+ I_ENTRY_COUNT(ih), from, del_count);
+
+ if (del_count == 0)
+ return 0;
+
+ /* first byte of item */
+ item = bh->b_data + ih_location(ih);
+
+ /* entry head array */
+ deh = B_I_DEH (bh, ih);
+
+ /* first byte of remaining entries, those are BEFORE cut entries
+ (prev_record) and length of all removed records (cut_records_len);
+ when from == 0 the item length stands in for the missing (from-1)th
+ record */
+ prev_record_offset = (from ? deh_location( &(deh[from - 1])) : ih_item_len(ih));
+ cut_records_len = prev_record_offset/*from_record*/ -
+ deh_location( &(deh[from + del_count - 1]));
+ prev_record = item + prev_record_offset;
+
+
+ /* adjust locations of remaining entries: those behind the cut only lose
+ the removed entry heads */
+ for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i --)
+ put_deh_location( &(deh[i]),
+ deh_location( &deh[i] ) - (DEH_SIZE * del_count ) );
+
+ /* those in front of the cut lose the removed entry heads and records */
+ for (i = 0; i < from; i ++)
+ put_deh_location( &(deh[i]),
+ deh_location( &deh[i] ) - (DEH_SIZE * del_count + cut_records_len) );
+
+ put_ih_entry_count( ih, ih_entry_count(ih) - del_count );
+
+ /* shift entry head array and entries those are AFTER removed entries */
+ memmove ((char *)(deh + from),
+ deh + from + del_count,
+ prev_record - cut_records_len - (char *)(deh + from + del_count));
+
+ /* shift records, those are BEFORE removed entries */
+ memmove (prev_record - cut_records_len - DEH_SIZE * del_count,
+ prev_record, item + ih_item_len(ih) - prev_record);
+
+ return DEH_SIZE * del_count + cut_records_len;
+}
+
+
+/* Cut a part out of item number cut_item_num of the leaf bi->bi_bh and
+   give the freed bytes back to the node's free space.
+
+   when cut item is part of regular file
+   pos_in_item - first byte that must be cut
+   cut_size - number of bytes to be cut beginning from pos_in_item
+
+   when cut item is part of directory
+   pos_in_item - number of first deleted entry
+   cut_size - count of deleted entries
+
+   For a directory item cut_size is replaced below by the number of bytes
+   that leaf_cut_entries() removed, so from then on it is always a byte
+   count.  Besides the item itself the function updates the locations of
+   this and all following item heads, the block head free space, and - if
+   bi->bi_parent is set - the child size recorded in the parent.  Both
+   buffers are marked dirty through the do_balance_mark_*_dirty helpers.
+ */
+void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num,
+ int pos_in_item, int cut_size)
+{
+ int nr;
+ struct buffer_head * bh = bi->bi_bh;
+ struct block_head * blkh;
+ struct item_head * ih;
+ int last_loc, unmoved_loc;
+ int i;
+
+ blkh = B_BLK_HEAD(bh);
+ nr = blkh_nr_item(blkh);
+
+ /* item head of truncated item */
+ ih = B_N_PITEM_HEAD (bh, cut_item_num);
+
+ if (is_direntry_le_ih (ih)) {
+ /* cut the entries first; cut_size becomes the number of bytes removed */
+ cut_size = leaf_cut_entries (bh, ih, pos_in_item, cut_size);
+ if (pos_in_item == 0) {
+ /* change key */
+ RFALSE( cut_item_num,
+ "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", cut_item_num);
+ /* change item key by key of first entry in the item */
+ set_le_ih_k_offset (ih, deh_offset(B_I_DEH (bh, ih)));
+ /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE);*/
+ }
+ } else {
+ /* item is direct or indirect */
+ RFALSE( is_statdata_le_ih (ih), "10195: item is stat data");
+ /* a cut that does not start at byte 0 must reach the end of the item */
+ RFALSE( pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
+ "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
+ ( long unsigned ) pos_in_item, ( long unsigned ) cut_size,
+ ( long unsigned ) ih_item_len (ih));
+
+ /* shift item body to left if cut is from the head of item */
+ if (pos_in_item == 0) {
+ memmove( bh->b_data + ih_location(ih),
+ bh->b_data + ih_location(ih) + cut_size,
+ ih_item_len(ih) - cut_size);
+
+ /* change key of item: a direct item advances by the bytes cut, an
+ indirect item by bh->b_size for every UNFM_P_SIZE bytes cut */
+ if (is_direct_le_ih (ih))
+ set_le_ih_k_offset (ih, le_ih_k_offset (ih) + cut_size);
+ else {
+ set_le_ih_k_offset (ih, le_ih_k_offset (ih) + (cut_size / UNFM_P_SIZE) * bh->b_size);
+ RFALSE( ih_item_len(ih) == cut_size && get_ih_free_space (ih),
+ "10205: invalid ih_free_space (%h)", ih);
+ }
+ }
+ }
+
+
+ /* location of the last item */
+ last_loc = ih_location( &(ih[nr - cut_item_num - 1]) );
+
+ /* location of the item, which is remaining at the same place */
+ unmoved_loc = cut_item_num ? ih_location(ih-1) : bh->b_size;
+
+
+ /* shift: the bodies of the cut item and of all items behind it move
+ cut_size bytes towards the end of the block */
+ memmove (bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
+ unmoved_loc - last_loc - cut_size);
+
+ /* change item length */
+ put_ih_item_len( ih, ih_item_len(ih) - cut_size );
+
+ /* an indirect item that was cut somewhere other than its head gets its
+ free space field reset to 0 */
+ if (is_indirect_le_ih (ih)) {
+ if (pos_in_item)
+ set_ih_free_space (ih, 0);
+ }
+
+ /* change locations: ih points at item cut_item_num, so ih[i-cut_item_num]
+ is the head of item i */
+ for (i = cut_item_num; i < nr; i ++)
+ put_ih_location( &(ih[i-cut_item_num]), ih_location( &ih[i-cut_item_num]) + cut_size );
+
+ /* size, free space */
+ set_blkh_free_space( blkh, blkh_free_space(blkh) + cut_size );
+
+ do_balance_mark_leaf_dirty (bi->tb, bh, 0);
+
+ /* keep the size the parent records for this child in step */
+ if (bi->bi_parent) {
+ struct disk_child *t_dc;
+ t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
+ put_dc_size( t_dc, dc_size(t_dc) - cut_size );
+ do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
+ }
+}
+
+
+/* Delete del_num whole items (heads and bodies) from the leaf bi->bi_bh,
+   starting from the first'th item.  Does nothing when del_num is 0.  When
+   every item of the node is deleted the node is reset through
+   make_empty_node(); otherwise the surviving bodies and item heads are
+   moved over the gap, and the item count, the free space and - if
+   bi->bi_parent is set - the child size recorded in the parent are
+   updated. */
+static void leaf_delete_items_entirely (struct buffer_info * bi,
+ int first, int del_num)
+{
+ struct buffer_head * bh = bi->bi_bh;
+ int nr;
+ int i, j;
+ int last_loc, last_removed_loc;
+ struct block_head * blkh;
+ struct item_head * ih;
+
+ RFALSE( bh == NULL, "10210: buffer is 0");
+ RFALSE( del_num < 0, "10215: del_num less than 0 (%d)", del_num);
+
+ if (del_num == 0)
+ return;
+
+ blkh = B_BLK_HEAD(bh);
+ nr = blkh_nr_item(blkh);
+
+ RFALSE( first < 0 || first + del_num > nr,
+ "10220: first=%d, number=%d, there is %d items", first, del_num, nr);
+
+ if (first == 0 && del_num == nr) {
+ /* this does not work */
+ /* NOTE(review): the remark above is the original author's; what exactly
+ "does not work" cannot be told from this function - confirm */
+ make_empty_node (bi);
+
+ do_balance_mark_leaf_dirty (bi->tb, bh, 0);
+ return;
+ }
+
+ ih = B_N_PITEM_HEAD (bh, first);
+
+ /* location of unmovable item: the body of item (first - 1), or the end
+ of the block when the deletion starts at item 0 */
+ j = (first == 0) ? bh->b_size : ih_location(ih-1);
+
+ /* delete items */
+ /* ih points at item `first', so ih[nr-1-first] is the last item of the
+ node and ih[del_num-1] is the last item being removed */
+ last_loc = ih_location( &(ih[nr-1-first]) );
+ last_removed_loc = ih_location( &(ih[del_num-1]) );
+
+ /* the bodies of the items behind the removed ones move up by the size
+ of the removed bodies, which is j - last_removed_loc */
+ memmove (bh->b_data + last_loc + j - last_removed_loc,
+ bh->b_data + last_loc, last_removed_loc - last_loc);
+
+ /* delete item headers */
+ memmove (ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
+
+ /* change item location */
+ for (i = first; i < nr - del_num; i ++)
+ put_ih_location( &(ih[i-first]), ih_location( &(ih[i-first]) ) + (j - last_removed_loc) );
+
+ /* sizes, item number */
+ set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num );
+ set_blkh_free_space( blkh, blkh_free_space(blkh) + (j - last_removed_loc + IH_SIZE * del_num) );
+
+ do_balance_mark_leaf_dirty (bi->tb, bh, 0);
+
+ /* the parent's record of this child shrinks by the removed bodies and heads */
+ if (bi->bi_parent) {
+ struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
+ put_dc_size( t_dc, dc_size(t_dc) -
+ (j - last_removed_loc + IH_SIZE * del_num));
+ do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
+ }
+}
+
+
+
+
+
+/* Paste new_entry_count entries (entry heads new_dehs, record bytes
+   records) into the directory item number item_num of buffer bh, in
+   front of the entry that currently has index `before'.
+
+   paste_size is the total number of bytes added: DEH_SIZE *
+   new_entry_count for the heads plus the length of the records.  The code
+   treats ih_item_len(ih) - paste_size as the end of the existing data,
+   i.e. it expects the item length to have been grown by paste_size
+   already.
+
+   The locations stored in new_dehs are used relatively: each one is
+   rebased so that the record of the last new entry starts right behind the
+   space opened for the new entry heads at the insertion point.
+
+   Does nothing when new_entry_count is 0.  When before is 0 the item key
+   offset is replaced by the offset of the first new entry. */
+void leaf_paste_entries (
+ struct buffer_head * bh,
+ int item_num,
+ int before,
+ int new_entry_count,
+ struct reiserfs_de_head * new_dehs,
+ const char * records,
+ int paste_size
+ )
+{
+ struct item_head * ih;
+ char * item;
+ struct reiserfs_de_head * deh;
+ char * insert_point;
+ int i, old_entry_num;
+
+ if (new_entry_count == 0)
+ return;
+
+ ih = B_N_PITEM_HEAD(bh, item_num);
+
+ /* make sure, that item is directory, and there are enough records in it */
+ RFALSE( !is_direntry_le_ih (ih), "10225: item is not directory item");
+ RFALSE( I_ENTRY_COUNT (ih) < before,
+ "10230: there are no entry we paste entries before. entry_count = %d, before = %d",
+ I_ENTRY_COUNT (ih), before);
+
+
+ /* first byte of dest item */
+ item = bh->b_data + ih_location(ih);
+
+ /* entry head array */
+ deh = B_I_DEH (bh, ih);
+
+ /* new records will be pasted at this point: the start of the record of
+ entry (before - 1), or the end of the existing data when pasting in
+ front of entry 0 */
+ insert_point = item + (before ? deh_location( &(deh[before - 1])) : (ih_item_len(ih) - paste_size));
+
+ /* adjust locations of records that will be AFTER new records: they only
+ slide by the size of the new entry heads */
+ for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i --)
+ put_deh_location( &(deh[i]),
+ deh_location(&(deh[i])) + (DEH_SIZE * new_entry_count ));
+
+ /* adjust locations of records that will be BEFORE new records: they
+ slide by the whole pasted size */
+ for (i = 0; i < before; i ++)
+ put_deh_location( &(deh[i]), deh_location(&(deh[i])) + paste_size );
+
+ /* old_entry_num is assigned here but not read again in this function */
+ old_entry_num = I_ENTRY_COUNT(ih);
+ put_ih_entry_count( ih, ih_entry_count(ih) + new_entry_count );
+
+ /* prepare space for pasted records */
+ memmove (insert_point + paste_size, insert_point, item + (ih_item_len(ih) - paste_size) - insert_point);
+
+ /* copy new records; the first DEH_SIZE * new_entry_count bytes at
+ insert_point are left for the move of the entry heads below */
+ memcpy (insert_point + DEH_SIZE * new_entry_count, records,
+ paste_size - DEH_SIZE * new_entry_count);
+
+ /* prepare space for new entry heads */
+ deh += before;
+ memmove ((char *)(deh + new_entry_count), deh, insert_point - (char *)deh);
+
+ /* copy new entry heads */
+ /* the cast pair on the next line leaves deh unchanged */
+ deh = (struct reiserfs_de_head *)((char *)deh);
+ memcpy (deh, new_dehs, DEH_SIZE * new_entry_count);
+
+ /* set locations of new records: rebase the locations taken from
+ new_dehs so that the last new record starts at
+ insert_point + DEH_SIZE * new_entry_count */
+ for (i = 0; i < new_entry_count; i ++)
+ {
+ put_deh_location( &(deh[i]),
+ deh_location( &(deh[i] )) +
+ (- deh_location( &(new_dehs[new_entry_count - 1])) +
+ insert_point + DEH_SIZE * new_entry_count - item));
+ }
+
+
+ /* change item key if necessary (when we paste before 0-th entry) */
+ if (!before)
+ {
+ set_le_ih_k_offset (ih, deh_offset(new_dehs));
+/* memcpy (&ih->ih_key.k_offset,
+ &new_dehs->deh_offset, SHORT_KEY_SIZE);*/
+ }
+
+#ifdef CONFIG_REISERFS_CHECK
+ {
+ int prev, next;
+ /* check record locations: they must strictly decrease as the entry
+ index grows, otherwise a warning is printed */
+ deh = B_I_DEH (bh, ih);
+ for (i = 0; i < I_ENTRY_COUNT(ih); i ++) {
+ next = (i < I_ENTRY_COUNT(ih) - 1) ? deh_location( &(deh[i + 1])) : 0;
+ prev = (i != 0) ? deh_location( &(deh[i - 1]) ) : 0;
+
+ if (prev && prev <= deh_location( &(deh[i])))
+ reiserfs_warning (NULL, "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)",
+ ih, deh + i - 1, i, deh + i);
+ if (next && next >= deh_location( &(deh[i])))
+ reiserfs_warning (NULL, "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)",
+ ih, i, deh + i, deh + i + 1);
+ }
+ }
+#endif
+
+}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
new file mode 100644
index 00000000000..80e92d9b81c
--- /dev/null
+++ b/fs/reiserfs/namei.c
@@ -0,0 +1,1491 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ *
+ * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility
+ *
+ * Trivial Changes:
+ * Rights granted to Hans Reiser to redistribute under other terms providing
+ * he accepts all liability including but not limited to patent, fitness
+ * for purpose, and direct or indirect claims arising from failure to perform.
+ *
+ * NO WARRANTY
+ */
+
+#include <linux/config.h>
+#include <linux/time.h>
+#include <linux/bitops.h>
+#include <linux/reiserfs_fs.h>
+#include <linux/reiserfs_acl.h>
+#include <linux/reiserfs_xattr.h>
+#include <linux/smp_lock.h>
+#include <linux/quotaops.h>
+
+/* Link count helpers for directory inodes.  Both leave an i_nlink of
+   exactly 1 untouched; the increment also resets the count to 1 once it
+   reaches REISERFS_LINK_MAX, after which it therefore stays at 1.
+   NOTE(review): presumably 1 doubles as a "count overflowed, no longer
+   tracked" marker - confirm.
+   Both expand to a bare `if' statement (no do { } while (0) wrapper) and
+   use the argument unparenthesized, so pass a plain identifier and take
+   care when using them next to an if/else. */
+#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
+#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--;
+
+/* A directory item starts with an array of entry heads.  Binary search
+   that array for an entry whose offset equals OFF.
+
+   On a hit de->de_entry_num is the index of the entry and NAME_FOUND is
+   returned; this only means that the third key component matched, the
+   name itself has not been compared.  On a miss de->de_entry_num is the
+   lower bound the search ended on and NAME_NOT_FOUND is returned. */
+static int bin_search_in_dir_item (struct reiserfs_dir_entry * de, loff_t off)
+{
+	struct reiserfs_de_head * heads = de->de_deh;
+	int low = 0;
+	int high = I_ENTRY_COUNT (de->de_ih) - 1;
+
+	while (low <= high) {
+		int mid = (high + low) / 2;
+
+		if (off < deh_offset (heads + mid)) {
+			high = mid - 1;
+		} else if (off > deh_offset (heads + mid)) {
+			low = mid + 1;
+		} else {
+			/* offsets are equal */
+			de->de_entry_num = mid;
+			return NAME_FOUND;
+		}
+	}
+
+	de->de_entry_num = low;
+	return NAME_NOT_FOUND;
+}
+
+
+/* Make DE describe the item PATH ends at: the buffer, the item head, the
+   start of the entry head array of that item, and the item position
+   within the node. */
+static inline void set_de_item_location (struct reiserfs_dir_entry * de, struct path * path)
+{
+	struct buffer_head * last_bh = get_last_bh (path);
+	struct item_head * last_ih = get_ih (path);
+
+	de->de_bh = last_bh;
+	de->de_ih = last_ih;
+	de->de_deh = B_I_DEH (last_bh, last_ih);
+	de->de_item_num = PATH_LAST_POSITION (path);
+}
+
+
+/* Fill in de_entrylen, de_namelen and de_name for entry number
+   de->de_entry_num.  de_bh, de_ih and de_deh (pointing to the first
+   element of the entry head array) have to be set already.  BUG()s when
+   the entry number is not below the entry count of the item. */
+inline void set_de_name_and_namelen (struct reiserfs_dir_entry * de)
+{
+	struct reiserfs_de_head * head;
+
+	if (de->de_entry_num >= ih_entry_count (de->de_ih))
+		BUG ();
+
+	head = de->de_deh + de->de_entry_num;
+
+	de->de_entrylen = entry_length (de->de_bh, de->de_ih, de->de_entry_num);
+	de->de_namelen = de->de_entrylen - (de_with_sd (head) ? SD_SIZE : 0);
+	de->de_name = B_I_PITEM (de->de_bh, de->de_ih) + deh_location(head);
+
+	/* the stored name ends in a 0 byte: take the real length from strlen */
+	if (de->de_name[de->de_namelen - 1] == 0)
+		de->de_namelen = strlen (de->de_name);
+}
+
+
+/* Copy the dir id and object id stored in entry number de->de_entry_num
+   (what the entry points to) into DE.  BUG()s when the entry number is
+   not below the entry count of the item. */
+static inline void set_de_object_key (struct reiserfs_dir_entry * de)
+{
+	struct reiserfs_de_head * head;
+
+	if (de->de_entry_num >= ih_entry_count (de->de_ih))
+		BUG ();
+
+	head = &(de->de_deh[de->de_entry_num]);
+	de->de_dir_id = deh_dir_id (head);
+	de->de_objectid = deh_objectid (head);
+}
+
+
+/* Store the key of entry number de->de_entry_num in de->de_entry_key:
+   dir id and object id come from the key of the item, the offset from
+   the entry head, the type is TYPE_DIRENTRY and the version is
+   KEY_FORMAT_3_5.  BUG()s when the entry number is not below the entry
+   count of the item. */
+static inline void store_de_entry_key (struct reiserfs_dir_entry * de)
+{
+	struct cpu_key * found_key = &(de->de_entry_key);
+	struct reiserfs_de_head * head;
+
+	if (de->de_entry_num >= ih_entry_count (de->de_ih))
+		BUG ();
+
+	head = de->de_deh + de->de_entry_num;
+
+	found_key->version = KEY_FORMAT_3_5;
+	found_key->on_disk_key.k_dir_id = le32_to_cpu (de->de_ih->ih_key.k_dir_id);
+	found_key->on_disk_key.k_objectid = le32_to_cpu (de->de_ih->ih_key.k_objectid);
+	set_cpu_key_k_offset (found_key, deh_offset (head));
+	set_cpu_key_k_type (found_key, TYPE_DIRENTRY);
+}
+
+
+/* We assign a key to each directory item, and place multiple entries
+in a single directory item.  A directory item has a key equal to the
+key of the first directory entry in it.
+
+This function first calls search_item.  If no item with exactly this
+key is found, it steps back to the preceding item position and looks
+for the entry inside that directory item.  PATH is filled in, and both
+path->pos_in_item and de->de_entry_num receive the entry position
+within the item.
+
+Returns IO_ERROR, or what bin_search_in_dir_item returned (NAME_FOUND /
+NAME_NOT_FOUND).  The name and object key fields of DE are only filled
+in when the result is not NAME_NOT_FOUND.
+
+*/
+
+/* The function is NOT SCHEDULE-SAFE! */
+int search_by_entry_key (struct super_block * sb, const struct cpu_key * key,
+			 struct path * path, struct reiserfs_dir_entry * de)
+{
+	int result;
+
+	result = search_item (sb, key, path);
+
+	if (result == IO_ERROR)
+		return result;
+
+	if (result == ITEM_NOT_FOUND) {
+		if (!PATH_LAST_POSITION (path)) {
+			reiserfs_warning (sb, "vs-7000: search_by_entry_key: search_by_key returned item position == 0");
+			pathrelse(path) ;
+			return IO_ERROR ;
+		}
+		/* the entry, if it exists, lives in the preceding item */
+		PATH_LAST_POSITION (path) --;
+	} else if (result != ITEM_FOUND) {
+		pathrelse (path);
+		reiserfs_warning (sb, "vs-7002: search_by_entry_key: no path to here");
+		return IO_ERROR;
+	}
+
+	set_de_item_location (de, path);
+
+#ifdef CONFIG_REISERFS_CHECK
+	if (!is_direntry_le_ih (de->de_ih) ||
+	    COMP_SHORT_KEYS (&(de->de_ih->ih_key), key)) {
+		print_block (de->de_bh, 0, -1, -1);
+		reiserfs_panic (sb, "vs-7005: search_by_entry_key: found item %h is not directory item or "
+				"does not belong to the same directory as key %K", de->de_ih, key);
+	}
+#endif /* CONFIG_REISERFS_CHECK */
+
+	/* binary search in the directory item by the third component of
+	   the key; this sets de->de_entry_num */
+	result = bin_search_in_dir_item (de, cpu_key_k_offset (key));
+	path->pos_in_item = de->de_entry_num;
+	if (result == NAME_NOT_FOUND)
+		return result;
+
+	// ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set
+	set_de_name_and_namelen (de);
+	set_de_object_key (de);
+	return result;
+}
+
+
+
+/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
+
+/* The third component is hashed, and you can choose from more than
+   one hash function.  Per directory hashes are not yet implemented
+   but are thought about.  This function should be moved to hashes.c
+   Jedi, please do so. -Hans
+
+   Returns the third key component (entry offset) for NAME: DOT_OFFSET
+   for an empty name and for ".", DOT_DOT_OFFSET for "..", otherwise the
+   hash value of the name (128 if that value is 0) plus
+   MAX_GENERATION_NUMBER. */
+
+static __u32 get_third_component (struct super_block * s,
+				  const char * name, int len)
+{
+	__u32 hash;
+
+	switch (len) {
+	case 0:
+		return DOT_OFFSET;
+	case 1:
+		if (name[0] == '.')
+			return DOT_OFFSET;
+		break;
+	case 2:
+		if (name[0] == '.' && name[1] == '.')
+			return DOT_DOT_OFFSET;
+		break;
+	default:
+		break;
+	}
+
+	hash = REISERFS_SB(s)->s_hash_function (name, len);
+
+	// take bits from 7-th to 30-th including both bounds
+	hash = GET_HASH_VALUE(hash);
+
+	// needed to have no names before "." and ".." those have hash
+	// value == 0 and generation counters 1 and 2 accordingly
+	if (hash == 0)
+		hash = 128;
+
+	return hash + MAX_GENERATION_NUMBER;
+}
+
+
+/* Compare the name DE currently describes (de_name / de_namelen) with
+   NAME.  Returns NAME_NOT_FOUND when length or bytes differ, otherwise
+   NAME_FOUND or NAME_FOUND_INVISIBLE depending on the visibility of the
+   entry. */
+static int reiserfs_match (struct reiserfs_dir_entry * de,
+			   const char * name, int namelen)
+{
+	if (namelen != de->de_namelen)
+		return NAME_NOT_FOUND;
+
+	if (memcmp(de->de_name, name, de->de_namelen))
+		return NAME_NOT_FOUND;
+
+	if (de_visible (de->de_deh + de->de_entry_num))
+		return NAME_FOUND;
+
+	return NAME_FOUND_INVISIBLE;
+}
+
+
+/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
+
+/* used when hash collisions exist
+
+   Walk the entries of the item backwards, starting at de->de_entry_num
+   (or one entry before it when that position is past the end of the item
+   or has a different hash value), and compare NAME with every entry whose
+   hash value equals that of KEY.  Every generation number seen on the
+   way is recorded in de->de_gen_number_bit_string when that is set.
+
+   Returns NAME_FOUND / NAME_FOUND_INVISIBLE on a match (the object key
+   and entry key of DE are filled in then), NAME_NOT_FOUND when the name
+   cannot be in this directory, or GOTO_PREVIOUS_ITEM when the search has
+   to continue in the preceding item. */
+
+static int linear_search_in_dir_item (struct cpu_key * key, struct reiserfs_dir_entry * de,
+				      const char * name, int namelen)
+{
+	const loff_t wanted_hash = GET_HASH_VALUE (cpu_key_k_offset (key));
+	struct reiserfs_de_head * head;
+	int pos = de->de_entry_num;
+	int match;
+
+	if (pos == I_ENTRY_COUNT (de->de_ih) ||
+	    GET_HASH_VALUE (deh_offset (de->de_deh + pos)) != wanted_hash)
+		pos --;
+
+	RFALSE( de->de_deh != B_I_DEH (de->de_bh, de->de_ih),
+		"vs-7010: array of entry headers not found");
+
+	while (pos >= 0) {
+		head = de->de_deh + pos;
+
+		// hash value does not match, no need to check whole name
+		if (GET_HASH_VALUE (deh_offset (head)) != wanted_hash)
+			return NAME_NOT_FOUND;
+
+		/* mark, that this generation number is used */
+		if (de->de_gen_number_bit_string)
+			set_bit (GET_GENERATION_NUMBER (deh_offset (head)), (unsigned long *)de->de_gen_number_bit_string);
+
+		// calculate pointer to name and namelen
+		de->de_entry_num = pos;
+		set_de_name_and_namelen (de);
+
+		match = reiserfs_match (de, name, namelen);
+		if (match != NAME_NOT_FOUND) {
+			// de's de_name, de_namelen, de_recordlen are set. Fill the rest:
+
+			// key of pointed object
+			set_de_object_key (de);
+
+			store_de_entry_key (de);
+
+			// match can be NAME_FOUND or NAME_FOUND_INVISIBLE
+			return match;
+		}
+
+		pos --;
+	}
+
+	/* we have reached left most entry in the node. In common we
+	   have to go to the left neighbor, but if generation counter
+	   is 0 already, we know for sure, that there is no name with
+	   the same hash value */
+	// FIXME: this work correctly only because hash value can not
+	// be 0. Btw, in case of Yura's hash it is probably possible,
+	// so, this is a bug
+	if (GET_GENERATION_NUMBER (le_ih_k_offset (de->de_ih)) == 0)
+		return NAME_NOT_FOUND;
+
+	RFALSE( de->de_item_num,
+		"vs-7015: two diritems of the same directory in one node?");
+
+	return GOTO_PREVIOUS_ITEM;
+}
+
+
+// Look NAME up in directory DIR.
+// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND and,
+// when search_by_entry_key fails, IO_ERROR.
+// path_to_entry->pos_in_item is set to the entry position unless
+// IO_ERROR is returned or the name is too long for this block size.
+static int reiserfs_find_entry (struct inode * dir, const char * name, int namelen,
+				struct path * path_to_entry, struct reiserfs_dir_entry * de)
+{
+	struct cpu_key search_key;
+	int result;
+
+	if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize))
+		return NAME_NOT_FOUND;
+
+	/* we will search for this key in the tree */
+	make_cpu_key (&search_key, dir,
+		      get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3);
+
+	for (;;) {
+		result = search_by_entry_key (dir->i_sb, &search_key, path_to_entry, de);
+		if (result == IO_ERROR) {
+			reiserfs_warning (dir->i_sb, "zam-7001: io error in %s",
+					  __FUNCTION__);
+			return IO_ERROR;
+		}
+
+		/* compare names for all entries having given hash value */
+		result = linear_search_in_dir_item (&search_key, de, name, namelen);
+		if (result == GOTO_PREVIOUS_ITEM) {
+			/* there is left neighboring item of this directory and
+			   given entry can be there */
+			set_cpu_key_k_offset (&search_key, le_ih_k_offset (de->de_ih) - 1);
+			pathrelse (path_to_entry);
+			continue;
+		}
+
+		/* there is no need to scan directory anymore. Given entry
+		   found or does not exist */
+		path_to_entry->pos_in_item = de->de_entry_num;
+		return result;
+	}
+}
+
+
+/* Look the name of DENTRY up in directory DIR.
+   Returns ERR_PTR(-ENAMETOOLONG) for a name longer than the block size
+   allows, ERR_PTR(-EACCES) when the entry is the xattr private root or
+   its inode cannot be read, ERR_PTR(-EIO) on a search error, the result
+   of d_splice_alias() when an inode was found, and NULL after adding a
+   negative dentry otherwise. */
+static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+{
+	struct super_block * sb = dir->i_sb;
+	struct inode * found = NULL;
+	struct reiserfs_dir_entry de;
+	int result;
+	INITIALIZE_PATH (path_to_entry);
+
+	if (dentry->d_name.len > REISERFS_MAX_NAME (sb->s_blocksize))
+		return ERR_PTR(-ENAMETOOLONG);
+
+	reiserfs_write_lock(sb);
+	de.de_gen_number_bit_string = NULL;
+	result = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de);
+	pathrelse (&path_to_entry);
+
+	if (result == NAME_FOUND) {
+		/* Hide the .reiserfs_priv directory */
+		if (reiserfs_xattrs (sb) &&
+		    !old_format_only(sb) &&
+		    REISERFS_SB(sb)->priv_root &&
+		    REISERFS_SB(sb)->priv_root->d_inode &&
+		    de.de_objectid == le32_to_cpu (INODE_PKEY(REISERFS_SB(sb)->priv_root->d_inode)->k_objectid)) {
+			reiserfs_write_unlock (sb);
+			return ERR_PTR (-EACCES);
+		}
+
+		found = reiserfs_iget (sb, (struct cpu_key *)&(de.de_dir_id));
+		if (!found || IS_ERR(found)) {
+			reiserfs_write_unlock(sb);
+			return ERR_PTR(-EACCES);
+		}
+
+		/* Propagate the priv_object flag so we know we're in the priv tree */
+		if (is_reiserfs_priv_object (dir))
+			reiserfs_mark_inode_private (found);
+	}
+
+	reiserfs_write_unlock(sb);
+
+	if (result == IO_ERROR)
+		return ERR_PTR(-EIO);
+
+	if (!found) {
+		/* nothing by that name: add a negative dentry */
+		d_add(dentry, NULL);
+		return NULL;
+	}
+
+	return d_splice_alias(found, dentry);
+}
+
+
+/*
+** looks up the dentry of the parent directory for child by searching
+** the child directory for "..".
+** taken from ext2_get_parent
+**
+** Returns ERR_PTR(-ENOENT) when the child has no links left or ".." is
+** not found, ERR_PTR(-EACCES) when the parent inode cannot be read and
+** ERR_PTR(-ENOMEM) when no dentry can be allocated for it.
+*/
+struct dentry *reiserfs_get_parent(struct dentry *child)
+{
+	struct inode * dir = child->d_inode;
+	struct inode * parent_inode = NULL;
+	struct dentry * parent;
+	struct reiserfs_dir_entry de;
+	int result;
+	INITIALIZE_PATH (path_to_entry);
+
+	if (dir->i_nlink == 0)
+		return ERR_PTR(-ENOENT);
+
+	de.de_gen_number_bit_string = NULL;
+
+	reiserfs_write_lock(dir->i_sb);
+	result = reiserfs_find_entry (dir, "..", 2, &path_to_entry, &de);
+	pathrelse (&path_to_entry);
+	if (result == NAME_FOUND)
+		parent_inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
+	reiserfs_write_unlock(dir->i_sb);
+
+	if (result != NAME_FOUND)
+		return ERR_PTR(-ENOENT);
+
+	if (!parent_inode || IS_ERR(parent_inode))
+		return ERR_PTR(-EACCES);
+
+	parent = d_alloc_anon(parent_inode);
+	if (parent)
+		return parent;
+
+	/* no dentry: give the inode reference back */
+	iput(parent_inode);
+	return ERR_PTR(-ENOMEM);
+}
+
+
+/* Add entry NAME (NAMELEN bytes), pointing to INODE, to the directory DIR
+   (entry can be hidden: VISIBLE == 0).
+
+insert definition of when hidden directories are used here -Hans
+
+   The entry is composed in a temporary buffer, a free generation number
+   is picked among the entries with the same hash value, and the entry is
+   pasted into the tree.  On success dir->i_size, i_mtime and i_ctime are
+   updated.
+
+   Returns 0 on success or a negative errno:
+     -EINVAL        NAMELEN is 0
+     -ENAMETOOLONG  name does not fit for this block size
+     -ENOMEM        no memory for the temporary buffer
+     -EIO           the directory search failed
+     -EEXIST        the search did not end with NAME_NOT_FOUND
+     -EBUSY         no free generation number, or the composed key is
+                    unexpectedly present already
+     anything reiserfs_paste_into_item returns.
+
+   Does not mark dir inode dirty, do it after successful call to it */
+
+static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct inode * dir,
+			       const char * name, int namelen, struct inode * inode,
+			       int visible)
+{
+	struct cpu_key entry_key;
+	struct reiserfs_de_head * deh;
+	INITIALIZE_PATH (path);
+	struct reiserfs_dir_entry de;
+	/* One bit per generation number, 0 .. MAX_GENERATION_NUMBER.  The
+	   bitmap is handed to set_bit() and find_first_zero_bit(), which work
+	   on unsigned long words, so it is declared as such to get the
+	   alignment those accesses need. */
+	unsigned long bit_string [(MAX_GENERATION_NUMBER + 8 * sizeof (unsigned long)) / (8 * sizeof (unsigned long))];
+	int gen_number;
+	char small_buf[32+DEH_SIZE] ; /* 48 bytes now and we avoid kmalloc
+					 if we create file with short name */
+	char * buffer;
+	int buflen, paste_size;
+	int retval;
+
+	BUG_ON (!th->t_trans_id);
+
+	/* cannot allow items to be added into a busy deleted directory */
+	/* NOTE(review): the check below only rejects an empty name - confirm
+	   that the remark above belongs here */
+	if (!namelen)
+		return -EINVAL;
+
+	if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize))
+		return -ENAMETOOLONG;
+
+	/* each entry has unique key. compose it */
+	make_cpu_key (&entry_key, dir,
+		      get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3);
+
+	/* get memory for composing the entry */
+	buflen = DEH_SIZE + ROUND_UP (namelen);
+	if (buflen > sizeof (small_buf)) {
+		buffer = reiserfs_kmalloc (buflen, GFP_NOFS, dir->i_sb);
+		if (!buffer)
+			return -ENOMEM;
+	} else
+		buffer = small_buf;
+
+	/* a directory with old (V1) stat data gets the name unpadded */
+	paste_size = (get_inode_sd_version (dir) == STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
+
+	/* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */
+	deh = (struct reiserfs_de_head *)buffer;
+	deh->deh_location = 0; /* JDM Endian safe if 0 */
+	put_deh_offset( deh, cpu_key_k_offset( &entry_key ) );
+	deh->deh_state = 0; /* JDM Endian safe if 0 */
+	/* put key (ino analog) to de */
+	deh->deh_dir_id = INODE_PKEY (inode)->k_dir_id; /* safe: k_dir_id is le */
+	deh->deh_objectid = INODE_PKEY (inode)->k_objectid; /* safe: k_objectid is le */
+
+	/* copy name */
+	memcpy ((char *)(deh + 1), name, namelen);
+	/* padd by 0s to the 4 byte boundary */
+	padd_item ((char *)(deh + 1), ROUND_UP (namelen), namelen);
+
+	/* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */
+	mark_de_without_sd (deh);
+	if (visible)
+		mark_de_visible (deh);
+	else
+		mark_de_hidden (deh);
+
+	/* find the proper place for the new entry; the search records the
+	   generation numbers in use for this hash value in bit_string */
+	memset (bit_string, 0, sizeof (bit_string));
+	de.de_gen_number_bit_string = (char *)bit_string;
+	retval = reiserfs_find_entry (dir, name, namelen, &path, &de);
+	if( retval != NAME_NOT_FOUND ) {
+		if (buffer != small_buf)
+			reiserfs_kfree (buffer, buflen, dir->i_sb);
+		pathrelse (&path);
+
+		if ( retval == IO_ERROR ) {
+			return -EIO;
+		}
+
+		if (retval != NAME_FOUND) {
+			reiserfs_warning (dir->i_sb, "zam-7002:%s: \"reiserfs_find_entry\" "
+					  "has returned unexpected value (%d)",
+					  __FUNCTION__, retval);
+		}
+
+		return -EEXIST;
+	}
+
+	gen_number = find_first_zero_bit (bit_string, MAX_GENERATION_NUMBER + 1);
+	if (gen_number > MAX_GENERATION_NUMBER) {
+		/* there is no free generation number */
+		reiserfs_warning (dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up");
+		if (buffer != small_buf)
+			reiserfs_kfree (buffer, buflen, dir->i_sb);
+		pathrelse (&path);
+		return -EBUSY;
+	}
+	/* adjust offset of directory entry */
+	put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
+	set_cpu_key_k_offset (&entry_key, deh_offset(deh));
+
+	/* update max-hash-collisions counter in reiserfs_sb_info */
+	PROC_INFO_MAX( th -> t_super, max_hash_collisions, gen_number );
+
+	if (gen_number != 0) {	/* we need to re-search for the insertion point */
+		if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) {
+			reiserfs_warning (dir->i_sb, "vs-7032: reiserfs_add_entry: "
+					  "entry with this key (%K) already exists",
+					  &entry_key);
+
+			if (buffer != small_buf)
+				reiserfs_kfree (buffer, buflen, dir->i_sb);
+			pathrelse (&path);
+			return -EBUSY;
+		}
+	}
+
+	/* perform the insertion of the entry that we have prepared */
+	retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size);
+	if (buffer != small_buf)
+		reiserfs_kfree (buffer, buflen, dir->i_sb);
+	if (retval) {
+		reiserfs_check_path(&path) ;
+		return retval;
+	}
+
+	dir->i_size += paste_size;
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	if (!S_ISDIR (inode->i_mode) && visible)
+		// reiserfs_mkdir or reiserfs_rename will do that by itself
+		reiserfs_update_sd (th, dir);
+
+	reiserfs_check_path(&path) ;
+	return 0;
+}
+
+/* quota utility function, call if you've had to abort after calling
+** new_inode_init, and have not called reiserfs_new_inode yet.
+** This should only be called on inodes that do not have stat data
+** inserted into the tree yet.
+**
+** Drops the quota references, marks the inode bad, sets S_NOQUOTA on
+** it and then releases it with iput().  Always returns 0.
+*/
+static int drop_new_inode(struct inode *inode) {
+ DQUOT_DROP(inode);
+ make_bad_inode(inode) ;
+ inode->i_flags |= S_NOQUOTA;
+ iput(inode) ;
+ return 0 ;
+}
+
+/* utility function that does setup for reiserfs_new_inode.
+** DQUOT_INIT needs lots of credits so it's better to have it
+** outside of a transaction, so we had to pull some bits of
+** reiserfs_new_inode out into this func.
+*/
+static int new_inode_init(struct inode *inode, struct inode *dir, int mode) {
+
+ /* the quota init calls have to know who to charge the quota to, so
+ ** we have to set uid and gid here
+ */
+ inode->i_uid = current->fsuid;
+ inode->i_mode = mode;
+
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ inode->i_mode |= S_ISGID;
+ } else {
<