]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - fs/reiserfs/stree.c
Merge branch 'for-4.8/core' of git://git.kernel.dk/linux-block
[karo-tx-linux.git] / fs / reiserfs / stree.c
1 /*
2  *  Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 /*
6  *  Written by Anatoly P. Pinchuk pap@namesys.botik.ru
7  *  Programm System Institute
8  *  Pereslavl-Zalessky Russia
9  */
10
11 #include <linux/time.h>
12 #include <linux/string.h>
13 #include <linux/pagemap.h>
14 #include "reiserfs.h"
15 #include <linux/buffer_head.h>
16 #include <linux/quotaops.h>
17
18 /* Does the buffer contain a disk block which is in the tree. */
19 inline int B_IS_IN_TREE(const struct buffer_head *bh)
20 {
21
22         RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
23                "PAP-1010: block (%b) has too big level (%z)", bh, bh);
24
25         return (B_LEVEL(bh) != FREE_LEVEL);
26 }
27
28 /* to get item head in le form */
29 inline void copy_item_head(struct item_head *to,
30                            const struct item_head *from)
31 {
32         memcpy(to, from, IH_SIZE);
33 }
34
35 /*
36  * k1 is pointer to on-disk structure which is stored in little-endian
37  * form. k2 is pointer to cpu variable. For key of items of the same
38  * object this returns 0.
39  * Returns: -1 if key1 < key2
40  * 0 if key1 == key2
41  * 1 if key1 > key2
42  */
43 inline int comp_short_keys(const struct reiserfs_key *le_key,
44                            const struct cpu_key *cpu_key)
45 {
46         __u32 n;
47         n = le32_to_cpu(le_key->k_dir_id);
48         if (n < cpu_key->on_disk_key.k_dir_id)
49                 return -1;
50         if (n > cpu_key->on_disk_key.k_dir_id)
51                 return 1;
52         n = le32_to_cpu(le_key->k_objectid);
53         if (n < cpu_key->on_disk_key.k_objectid)
54                 return -1;
55         if (n > cpu_key->on_disk_key.k_objectid)
56                 return 1;
57         return 0;
58 }
59
60 /*
61  * k1 is pointer to on-disk structure which is stored in little-endian
62  * form. k2 is pointer to cpu variable.
63  * Compare keys using all 4 key fields.
64  * Returns: -1 if key1 < key2 0
65  * if key1 = key2 1 if key1 > key2
66  */
67 static inline int comp_keys(const struct reiserfs_key *le_key,
68                             const struct cpu_key *cpu_key)
69 {
70         int retval;
71
72         retval = comp_short_keys(le_key, cpu_key);
73         if (retval)
74                 return retval;
75         if (le_key_k_offset(le_key_version(le_key), le_key) <
76             cpu_key_k_offset(cpu_key))
77                 return -1;
78         if (le_key_k_offset(le_key_version(le_key), le_key) >
79             cpu_key_k_offset(cpu_key))
80                 return 1;
81
82         if (cpu_key->key_length == 3)
83                 return 0;
84
85         /* this part is needed only when tail conversion is in progress */
86         if (le_key_k_type(le_key_version(le_key), le_key) <
87             cpu_key_k_type(cpu_key))
88                 return -1;
89
90         if (le_key_k_type(le_key_version(le_key), le_key) >
91             cpu_key_k_type(cpu_key))
92                 return 1;
93
94         return 0;
95 }
96
97 inline int comp_short_le_keys(const struct reiserfs_key *key1,
98                               const struct reiserfs_key *key2)
99 {
100         __u32 *k1_u32, *k2_u32;
101         int key_length = REISERFS_SHORT_KEY_LEN;
102
103         k1_u32 = (__u32 *) key1;
104         k2_u32 = (__u32 *) key2;
105         for (; key_length--; ++k1_u32, ++k2_u32) {
106                 if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
107                         return -1;
108                 if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
109                         return 1;
110         }
111         return 0;
112 }
113
114 inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
115 {
116         int version;
117         to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
118         to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
119
120         /* find out version of the key */
121         version = le_key_version(from);
122         to->version = version;
123         to->on_disk_key.k_offset = le_key_k_offset(version, from);
124         to->on_disk_key.k_type = le_key_k_type(version, from);
125 }
126
127 /*
128  * this does not say which one is bigger, it only returns 1 if keys
129  * are not equal, 0 otherwise
130  */
131 inline int comp_le_keys(const struct reiserfs_key *k1,
132                         const struct reiserfs_key *k2)
133 {
134         return memcmp(k1, k2, sizeof(struct reiserfs_key));
135 }
136
137 /**************************************************************************
138  *  Binary search toolkit function                                        *
139  *  Search for an item in the array by the item key                       *
140  *  Returns:    1 if found,  0 if not found;                              *
141  *        *pos = number of the searched element if found, else the        *
142  *        number of the first element that is larger than key.            *
143  **************************************************************************/
144 /*
145  * For those not familiar with binary search: lbound is the leftmost item
146  * that it could be, rbound the rightmost item that it could be.  We examine
147  * the item halfway between lbound and rbound, and that tells us either
148  * that we can increase lbound, or decrease rbound, or that we have found it,
149  * or if lbound <= rbound that there are no possible items, and we have not
150  * found it. With each examination we cut the number of possible items it
151  * could be by one more than half rounded down, or we find it.
152  */
153 static inline int bin_search(const void *key,   /* Key to search for. */
154                              const void *base,  /* First item in the array. */
155                              int num,   /* Number of items in the array. */
156                              /*
157                               * Item size in the array.  searched. Lest the
158                               * reader be confused, note that this is crafted
159                               * as a general function, and when it is applied
160                               * specifically to the array of item headers in a
161                               * node, width is actually the item header size
162                               * not the item size.
163                               */
164                              int width,
165                              int *pos /* Number of the searched for element. */
166     )
167 {
168         int rbound, lbound, j;
169
170         for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
171              lbound <= rbound; j = (rbound + lbound) / 2)
172                 switch (comp_keys
173                         ((struct reiserfs_key *)((char *)base + j * width),
174                          (struct cpu_key *)key)) {
175                 case -1:
176                         lbound = j + 1;
177                         continue;
178                 case 1:
179                         rbound = j - 1;
180                         continue;
181                 case 0:
182                         *pos = j;
183                         return ITEM_FOUND;      /* Key found in the array.  */
184                 }
185
186         /*
187          * bin_search did not find given key, it returns position of key,
188          * that is minimal and greater than the given one.
189          */
190         *pos = lbound;
191         return ITEM_NOT_FOUND;
192 }
193
194
195 /* Minimal possible key. It is never in the tree. */
196 const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
197
198 /* Maximal possible key. It is never in the tree. */
199 static const struct reiserfs_key MAX_KEY = {
200         cpu_to_le32(0xffffffff),
201         cpu_to_le32(0xffffffff),
202         {{cpu_to_le32(0xffffffff),
203           cpu_to_le32(0xffffffff)},}
204 };
205
206 /*
207  * Get delimiting key of the buffer by looking for it in the buffers in the
208  * path, starting from the bottom of the path, and going upwards.  We must
209  * check the path's validity at each step.  If the key is not in the path,
210  * there is no delimiting key in the tree (buffer is first or last buffer
211  * in tree), and in this case we return a special key, either MIN_KEY or
212  * MAX_KEY.
213  */
214 static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
215                                                   const struct super_block *sb)
216 {
217         int position, path_offset = chk_path->path_length;
218         struct buffer_head *parent;
219
220         RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
221                "PAP-5010: invalid offset in the path");
222
223         /* While not higher in path than first element. */
224         while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
225
226                 RFALSE(!buffer_uptodate
227                        (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
228                        "PAP-5020: parent is not uptodate");
229
230                 /* Parent at the path is not in the tree now. */
231                 if (!B_IS_IN_TREE
232                     (parent =
233                      PATH_OFFSET_PBUFFER(chk_path, path_offset)))
234                         return &MAX_KEY;
235                 /* Check whether position in the parent is correct. */
236                 if ((position =
237                      PATH_OFFSET_POSITION(chk_path,
238                                           path_offset)) >
239                     B_NR_ITEMS(parent))
240                         return &MAX_KEY;
241                 /* Check whether parent at the path really points to the child. */
242                 if (B_N_CHILD_NUM(parent, position) !=
243                     PATH_OFFSET_PBUFFER(chk_path,
244                                         path_offset + 1)->b_blocknr)
245                         return &MAX_KEY;
246                 /*
247                  * Return delimiting key if position in the parent
248                  * is not equal to zero.
249                  */
250                 if (position)
251                         return internal_key(parent, position - 1);
252         }
253         /* Return MIN_KEY if we are in the root of the buffer tree. */
254         if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
255             b_blocknr == SB_ROOT_BLOCK(sb))
256                 return &MIN_KEY;
257         return &MAX_KEY;
258 }
259
260 /* Get delimiting key of the buffer at the path and its right neighbor. */
261 inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
262                                            const struct super_block *sb)
263 {
264         int position, path_offset = chk_path->path_length;
265         struct buffer_head *parent;
266
267         RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
268                "PAP-5030: invalid offset in the path");
269
270         while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
271
272                 RFALSE(!buffer_uptodate
273                        (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
274                        "PAP-5040: parent is not uptodate");
275
276                 /* Parent at the path is not in the tree now. */
277                 if (!B_IS_IN_TREE
278                     (parent =
279                      PATH_OFFSET_PBUFFER(chk_path, path_offset)))
280                         return &MIN_KEY;
281                 /* Check whether position in the parent is correct. */
282                 if ((position =
283                      PATH_OFFSET_POSITION(chk_path,
284                                           path_offset)) >
285                     B_NR_ITEMS(parent))
286                         return &MIN_KEY;
287                 /*
288                  * Check whether parent at the path really points
289                  * to the child.
290                  */
291                 if (B_N_CHILD_NUM(parent, position) !=
292                     PATH_OFFSET_PBUFFER(chk_path,
293                                         path_offset + 1)->b_blocknr)
294                         return &MIN_KEY;
295
296                 /*
297                  * Return delimiting key if position in the parent
298                  * is not the last one.
299                  */
300                 if (position != B_NR_ITEMS(parent))
301                         return internal_key(parent, position);
302         }
303
304         /* Return MAX_KEY if we are in the root of the buffer tree. */
305         if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
306             b_blocknr == SB_ROOT_BLOCK(sb))
307                 return &MAX_KEY;
308         return &MIN_KEY;
309 }
310
311 /*
312  * Check whether a key is contained in the tree rooted from a buffer at a path.
313  * This works by looking at the left and right delimiting keys for the buffer
314  * in the last path_element in the path.  These delimiting keys are stored
315  * at least one level above that buffer in the tree. If the buffer is the
316  * first or last node in the tree order then one of the delimiting keys may
317  * be absent, and in this case get_lkey and get_rkey return a special key
318  * which is MIN_KEY or MAX_KEY.
319  */
320 static inline int key_in_buffer(
321                                 /* Path which should be checked. */
322                                 struct treepath *chk_path,
323                                 /* Key which should be checked. */
324                                 const struct cpu_key *key,
325                                 struct super_block *sb
326     )
327 {
328
329         RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
330                || chk_path->path_length > MAX_HEIGHT,
331                "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
332                key, chk_path->path_length);
333         RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
334                "PAP-5060: device must not be NODEV");
335
336         if (comp_keys(get_lkey(chk_path, sb), key) == 1)
337                 /* left delimiting key is bigger, that the key we look for */
338                 return 0;
339         /*  if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
340         if (comp_keys(get_rkey(chk_path, sb), key) != 1)
341                 /* key must be less than right delimitiing key */
342                 return 0;
343         return 1;
344 }
345
346 int reiserfs_check_path(struct treepath *p)
347 {
348         RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
349                "path not properly relsed");
350         return 0;
351 }
352
353 /*
354  * Drop the reference to each buffer in a path and restore
355  * dirty bits clean when preparing the buffer for the log.
356  * This version should only be called from fix_nodes()
357  */
358 void pathrelse_and_restore(struct super_block *sb,
359                            struct treepath *search_path)
360 {
361         int path_offset = search_path->path_length;
362
363         RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
364                "clm-4000: invalid path offset");
365
366         while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
367                 struct buffer_head *bh;
368                 bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
369                 reiserfs_restore_prepared_buffer(sb, bh);
370                 brelse(bh);
371         }
372         search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
373 }
374
375 /* Drop the reference to each buffer in a path */
376 void pathrelse(struct treepath *search_path)
377 {
378         int path_offset = search_path->path_length;
379
380         RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
381                "PAP-5090: invalid path offset");
382
383         while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
384                 brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
385
386         search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
387 }
388
389 static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
390 {
391         struct block_head *blkh;
392         struct item_head *ih;
393         int used_space;
394         int prev_location;
395         int i;
396         int nr;
397
398         blkh = (struct block_head *)buf;
399         if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
400                 reiserfs_warning(NULL, "reiserfs-5080",
401                                  "this should be caught earlier");
402                 return 0;
403         }
404
405         nr = blkh_nr_item(blkh);
406         if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
407                 /* item number is too big or too small */
408                 reiserfs_warning(NULL, "reiserfs-5081",
409                                  "nr_item seems wrong: %z", bh);
410                 return 0;
411         }
412         ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
413         used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
414
415         /* free space does not match to calculated amount of use space */
416         if (used_space != blocksize - blkh_free_space(blkh)) {
417                 reiserfs_warning(NULL, "reiserfs-5082",
418                                  "free space seems wrong: %z", bh);
419                 return 0;
420         }
421         /*
422          * FIXME: it is_leaf will hit performance too much - we may have
423          * return 1 here
424          */
425
426         /* check tables of item heads */
427         ih = (struct item_head *)(buf + BLKH_SIZE);
428         prev_location = blocksize;
429         for (i = 0; i < nr; i++, ih++) {
430                 if (le_ih_k_type(ih) == TYPE_ANY) {
431                         reiserfs_warning(NULL, "reiserfs-5083",
432                                          "wrong item type for item %h",
433                                          ih);
434                         return 0;
435                 }
436                 if (ih_location(ih) >= blocksize
437                     || ih_location(ih) < IH_SIZE * nr) {
438                         reiserfs_warning(NULL, "reiserfs-5084",
439                                          "item location seems wrong: %h",
440                                          ih);
441                         return 0;
442                 }
443                 if (ih_item_len(ih) < 1
444                     || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
445                         reiserfs_warning(NULL, "reiserfs-5085",
446                                          "item length seems wrong: %h",
447                                          ih);
448                         return 0;
449                 }
450                 if (prev_location - ih_location(ih) != ih_item_len(ih)) {
451                         reiserfs_warning(NULL, "reiserfs-5086",
452                                          "item location seems wrong "
453                                          "(second one): %h", ih);
454                         return 0;
455                 }
456                 prev_location = ih_location(ih);
457         }
458
459         /* one may imagine many more checks */
460         return 1;
461 }
462
463 /* returns 1 if buf looks like an internal node, 0 otherwise */
464 static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
465 {
466         struct block_head *blkh;
467         int nr;
468         int used_space;
469
470         blkh = (struct block_head *)buf;
471         nr = blkh_level(blkh);
472         if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
473                 /* this level is not possible for internal nodes */
474                 reiserfs_warning(NULL, "reiserfs-5087",
475                                  "this should be caught earlier");
476                 return 0;
477         }
478
479         nr = blkh_nr_item(blkh);
480         /* for internal which is not root we might check min number of keys */
481         if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
482                 reiserfs_warning(NULL, "reiserfs-5088",
483                                  "number of key seems wrong: %z", bh);
484                 return 0;
485         }
486
487         used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
488         if (used_space != blocksize - blkh_free_space(blkh)) {
489                 reiserfs_warning(NULL, "reiserfs-5089",
490                                  "free space seems wrong: %z", bh);
491                 return 0;
492         }
493
494         /* one may imagine many more checks */
495         return 1;
496 }
497
498 /*
499  * make sure that bh contains formatted node of reiserfs tree of
500  * 'level'-th level
501  */
502 static int is_tree_node(struct buffer_head *bh, int level)
503 {
504         if (B_LEVEL(bh) != level) {
505                 reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
506                                  "not match to the expected one %d",
507                                  B_LEVEL(bh), level);
508                 return 0;
509         }
510         if (level == DISK_LEAF_NODE_LEVEL)
511                 return is_leaf(bh->b_data, bh->b_size, bh);
512
513         return is_internal(bh->b_data, bh->b_size, bh);
514 }
515
516 #define SEARCH_BY_KEY_READA 16
517
518 /*
519  * The function is NOT SCHEDULE-SAFE!
520  * It might unlock the write lock if we needed to wait for a block
521  * to be read. Note that in this case it won't recover the lock to avoid
522  * high contention resulting from too much lock requests, especially
523  * the caller (search_by_key) will perform other schedule-unsafe
524  * operations just after calling this function.
525  *
526  * @return depth of lock to be restored after read completes
527  */
528 static int search_by_key_reada(struct super_block *s,
529                                 struct buffer_head **bh,
530                                 b_blocknr_t *b, int num)
531 {
532         int i, j;
533         int depth = -1;
534
535         for (i = 0; i < num; i++) {
536                 bh[i] = sb_getblk(s, b[i]);
537         }
538         /*
539          * We are going to read some blocks on which we
540          * have a reference. It's safe, though we might be
541          * reading blocks concurrently changed if we release
542          * the lock. But it's still fine because we check later
543          * if the tree changed
544          */
545         for (j = 0; j < i; j++) {
546                 /*
547                  * note, this needs attention if we are getting rid of the BKL
548                  * you have to make sure the prepared bit isn't set on this
549                  * buffer
550                  */
551                 if (!buffer_uptodate(bh[j])) {
552                         if (depth == -1)
553                                 depth = reiserfs_write_unlock_nested(s);
554                         ll_rw_block(REQ_OP_READ, READA, 1, bh + j);
555                 }
556                 brelse(bh[j]);
557         }
558         return depth;
559 }
560
561 /*
562  * This function fills up the path from the root to the leaf as it
563  * descends the tree looking for the key.  It uses reiserfs_bread to
564  * try to find buffers in the cache given their block number.  If it
565  * does not find them in the cache it reads them from disk.  For each
566  * node search_by_key finds using reiserfs_bread it then uses
567  * bin_search to look through that node.  bin_search will find the
568  * position of the block_number of the next node if it is looking
569  * through an internal node.  If it is looking through a leaf node
570  * bin_search will find the position of the item which has key either
571  * equal to given key, or which is the maximal key less than the given
572  * key.  search_by_key returns a path that must be checked for the
573  * correctness of the top of the path but need not be checked for the
574  * correctness of the bottom of the path
575  */
576 /*
577  * search_by_key - search for key (and item) in stree
578  * @sb: superblock
579  * @key: pointer to key to search for
580  * @search_path: Allocated and initialized struct treepath; Returned filled
581  *               on success.
582  * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to
583  *              stop at leaf level.
584  *
585  * The function is NOT SCHEDULE-SAFE!
586  */
587 int search_by_key(struct super_block *sb, const struct cpu_key *key,
588                   struct treepath *search_path, int stop_level)
589 {
590         b_blocknr_t block_number;
591         int expected_level;
592         struct buffer_head *bh;
593         struct path_element *last_element;
594         int node_level, retval;
595         int right_neighbor_of_leaf_node;
596         int fs_gen;
597         struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
598         b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
599         int reada_count = 0;
600
601 #ifdef CONFIG_REISERFS_CHECK
602         int repeat_counter = 0;
603 #endif
604
605         PROC_INFO_INC(sb, search_by_key);
606
607         /*
608          * As we add each node to a path we increase its count.  This means
609          * that we must be careful to release all nodes in a path before we
610          * either discard the path struct or re-use the path struct, as we
611          * do here.
612          */
613
614         pathrelse(search_path);
615
616         right_neighbor_of_leaf_node = 0;
617
618         /*
619          * With each iteration of this loop we search through the items in the
620          * current node, and calculate the next current node(next path element)
621          * for the next iteration of this loop..
622          */
623         block_number = SB_ROOT_BLOCK(sb);
624         expected_level = -1;
625         while (1) {
626
627 #ifdef CONFIG_REISERFS_CHECK
628                 if (!(++repeat_counter % 50000))
629                         reiserfs_warning(sb, "PAP-5100",
630                                          "%s: there were %d iterations of "
631                                          "while loop looking for key %K",
632                                          current->comm, repeat_counter,
633                                          key);
634 #endif
635
636                 /* prep path to have another element added to it. */
637                 last_element =
638                     PATH_OFFSET_PELEMENT(search_path,
639                                          ++search_path->path_length);
640                 fs_gen = get_generation(sb);
641
642                 /*
643                  * Read the next tree node, and set the last element
644                  * in the path to have a pointer to it.
645                  */
646                 if ((bh = last_element->pe_buffer =
647                      sb_getblk(sb, block_number))) {
648
649                         /*
650                          * We'll need to drop the lock if we encounter any
651                          * buffers that need to be read. If all of them are
652                          * already up to date, we don't need to drop the lock.
653                          */
654                         int depth = -1;
655
656                         if (!buffer_uptodate(bh) && reada_count > 1)
657                                 depth = search_by_key_reada(sb, reada_bh,
658                                                     reada_blocks, reada_count);
659
660                         if (!buffer_uptodate(bh) && depth == -1)
661                                 depth = reiserfs_write_unlock_nested(sb);
662
663                         ll_rw_block(REQ_OP_READ, 0, 1, &bh);
664                         wait_on_buffer(bh);
665
666                         if (depth != -1)
667                                 reiserfs_write_lock_nested(sb, depth);
668                         if (!buffer_uptodate(bh))
669                                 goto io_error;
670                 } else {
671 io_error:
672                         search_path->path_length--;
673                         pathrelse(search_path);
674                         return IO_ERROR;
675                 }
676                 reada_count = 0;
677                 if (expected_level == -1)
678                         expected_level = SB_TREE_HEIGHT(sb);
679                 expected_level--;
680
681                 /*
682                  * It is possible that schedule occurred. We must check
683                  * whether the key to search is still in the tree rooted
684                  * from the current buffer. If not then repeat search
685                  * from the root.
686                  */
687                 if (fs_changed(fs_gen, sb) &&
688                     (!B_IS_IN_TREE(bh) ||
689                      B_LEVEL(bh) != expected_level ||
690                      !key_in_buffer(search_path, key, sb))) {
691                         PROC_INFO_INC(sb, search_by_key_fs_changed);
692                         PROC_INFO_INC(sb, search_by_key_restarted);
693                         PROC_INFO_INC(sb,
694                                       sbk_restarted[expected_level - 1]);
695                         pathrelse(search_path);
696
697                         /*
698                          * Get the root block number so that we can
699                          * repeat the search starting from the root.
700                          */
701                         block_number = SB_ROOT_BLOCK(sb);
702                         expected_level = -1;
703                         right_neighbor_of_leaf_node = 0;
704
705                         /* repeat search from the root */
706                         continue;
707                 }
708
709                 /*
710                  * only check that the key is in the buffer if key is not
711                  * equal to the MAX_KEY. Latter case is only possible in
712                  * "finish_unfinished()" processing during mount.
713                  */
714                 RFALSE(comp_keys(&MAX_KEY, key) &&
715                        !key_in_buffer(search_path, key, sb),
716                        "PAP-5130: key is not in the buffer");
717 #ifdef CONFIG_REISERFS_CHECK
718                 if (REISERFS_SB(sb)->cur_tb) {
719                         print_cur_tb("5140");
720                         reiserfs_panic(sb, "PAP-5140",
721                                        "schedule occurred in do_balance!");
722                 }
723 #endif
724
725                 /*
726                  * make sure, that the node contents look like a node of
727                  * certain level
728                  */
729                 if (!is_tree_node(bh, expected_level)) {
730                         reiserfs_error(sb, "vs-5150",
731                                        "invalid format found in block %ld. "
732                                        "Fsck?", bh->b_blocknr);
733                         pathrelse(search_path);
734                         return IO_ERROR;
735                 }
736
737                 /* ok, we have acquired next formatted node in the tree */
738                 node_level = B_LEVEL(bh);
739
740                 PROC_INFO_BH_STAT(sb, bh, node_level - 1);
741
742                 RFALSE(node_level < stop_level,
743                        "vs-5152: tree level (%d) is less than stop level (%d)",
744                        node_level, stop_level);
745
746                 retval = bin_search(key, item_head(bh, 0),
747                                       B_NR_ITEMS(bh),
748                                       (node_level ==
749                                        DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
750                                       KEY_SIZE,
751                                       &last_element->pe_position);
752                 if (node_level == stop_level) {
753                         return retval;
754                 }
755
756                 /* we are not in the stop level */
757                 /*
758                  * item has been found, so we choose the pointer which
759                  * is to the right of the found one
760                  */
761                 if (retval == ITEM_FOUND)
762                         last_element->pe_position++;
763
764                 /*
765                  * if item was not found we choose the position which is to
766                  * the left of the found item. This requires no code,
767                  * bin_search did it already.
768                  */
769
770                 /*
771                  * So we have chosen a position in the current node which is
772                  * an internal node.  Now we calculate child block number by
773                  * position in the node.
774                  */
775                 block_number =
776                     B_N_CHILD_NUM(bh, last_element->pe_position);
777
778                 /*
779                  * if we are going to read leaf nodes, try for read
780                  * ahead as well
781                  */
782                 if ((search_path->reada & PATH_READA) &&
783                     node_level == DISK_LEAF_NODE_LEVEL + 1) {
784                         int pos = last_element->pe_position;
785                         int limit = B_NR_ITEMS(bh);
786                         struct reiserfs_key *le_key;
787
788                         if (search_path->reada & PATH_READA_BACK)
789                                 limit = 0;
790                         while (reada_count < SEARCH_BY_KEY_READA) {
791                                 if (pos == limit)
792                                         break;
793                                 reada_blocks[reada_count++] =
794                                     B_N_CHILD_NUM(bh, pos);
795                                 if (search_path->reada & PATH_READA_BACK)
796                                         pos--;
797                                 else
798                                         pos++;
799
800                                 /*
801                                  * check to make sure we're in the same object
802                                  */
803                                 le_key = internal_key(bh, pos);
804                                 if (le32_to_cpu(le_key->k_objectid) !=
805                                     key->on_disk_key.k_objectid) {
806                                         break;
807                                 }
808                         }
809                 }
810         }
811 }
812
813 /*
814  * Form the path to an item and position in this item which contains
815  * file byte defined by key. If there is no such item
816  * corresponding to the key, we point the path to the item with
817  * maximal key less than key, and *pos_in_item is set to one
818  * past the last entry/byte in the item.  If searching for entry in a
819  * directory item, and it is not found, *pos_in_item is set to one
820  * entry more than the entry with maximal key which is less than the
821  * sought key.
822  *
823  * Note that if there is no entry in this same node which is one more,
824  * then we point to an imaginary entry.  for direct items, the
825  * position is in units of bytes, for indirect items the position is
826  * in units of blocknr entries, for directory items the position is in
827  * units of directory entries.
828  */
829 /* The function is NOT SCHEDULE-SAFE! */
830 int search_for_position_by_key(struct super_block *sb,
831                                /* Key to search (cpu variable) */
832                                const struct cpu_key *p_cpu_key,
833                                /* Filled up by this function. */
834                                struct treepath *search_path)
835 {
836         struct item_head *p_le_ih;      /* pointer to on-disk structure */
837         int blk_size;
838         loff_t item_offset, offset;
839         struct reiserfs_dir_entry de;
840         int retval;
841
842         /* If searching for directory entry. */
843         if (is_direntry_cpu_key(p_cpu_key))
844                 return search_by_entry_key(sb, p_cpu_key, search_path,
845                                            &de);
846
847         /* If not searching for directory entry. */
848
849         /* If item is found. */
850         retval = search_item(sb, p_cpu_key, search_path);
851         if (retval == IO_ERROR)
852                 return retval;
853         if (retval == ITEM_FOUND) {
854
855                 RFALSE(!ih_item_len
856                        (item_head
857                         (PATH_PLAST_BUFFER(search_path),
858                          PATH_LAST_POSITION(search_path))),
859                        "PAP-5165: item length equals zero");
860
861                 pos_in_item(search_path) = 0;
862                 return POSITION_FOUND;
863         }
864
865         RFALSE(!PATH_LAST_POSITION(search_path),
866                "PAP-5170: position equals zero");
867
868         /* Item is not found. Set path to the previous item. */
869         p_le_ih =
870             item_head(PATH_PLAST_BUFFER(search_path),
871                            --PATH_LAST_POSITION(search_path));
872         blk_size = sb->s_blocksize;
873
874         if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
875                 return FILE_NOT_FOUND;
876
877         /* FIXME: quite ugly this far */
878
879         item_offset = le_ih_k_offset(p_le_ih);
880         offset = cpu_key_k_offset(p_cpu_key);
881
882         /* Needed byte is contained in the item pointed to by the path. */
883         if (item_offset <= offset &&
884             item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
885                 pos_in_item(search_path) = offset - item_offset;
886                 if (is_indirect_le_ih(p_le_ih)) {
887                         pos_in_item(search_path) /= blk_size;
888                 }
889                 return POSITION_FOUND;
890         }
891
892         /*
893          * Needed byte is not contained in the item pointed to by the
894          * path. Set pos_in_item out of the item.
895          */
896         if (is_indirect_le_ih(p_le_ih))
897                 pos_in_item(search_path) =
898                     ih_item_len(p_le_ih) / UNFM_P_SIZE;
899         else
900                 pos_in_item(search_path) = ih_item_len(p_le_ih);
901
902         return POSITION_NOT_FOUND;
903 }
904
905 /* Compare given item and item pointed to by the path. */
906 int comp_items(const struct item_head *stored_ih, const struct treepath *path)
907 {
908         struct buffer_head *bh = PATH_PLAST_BUFFER(path);
909         struct item_head *ih;
910
911         /* Last buffer at the path is not in the tree. */
912         if (!B_IS_IN_TREE(bh))
913                 return 1;
914
915         /* Last path position is invalid. */
916         if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
917                 return 1;
918
919         /* we need only to know, whether it is the same item */
920         ih = tp_item_head(path);
921         return memcmp(stored_ih, ih, IH_SIZE);
922 }
923
924 /* unformatted nodes are not logged anymore, ever.  This is safe now */
925 #define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1)
926
927 /* block can not be forgotten as it is in I/O or held by someone */
928 #define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
929
930 /* prepare for delete or cut of direct item */
931 static inline int prepare_for_direct_item(struct treepath *path,
932                                           struct item_head *le_ih,
933                                           struct inode *inode,
934                                           loff_t new_file_length, int *cut_size)
935 {
936         loff_t round_len;
937
938         if (new_file_length == max_reiserfs_offset(inode)) {
939                 /* item has to be deleted */
940                 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
941                 return M_DELETE;
942         }
943         /* new file gets truncated */
944         if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
945                 round_len = ROUND_UP(new_file_length);
946                 /* this was new_file_length < le_ih ... */
947                 if (round_len < le_ih_k_offset(le_ih)) {
948                         *cut_size = -(IH_SIZE + ih_item_len(le_ih));
949                         return M_DELETE;        /* Delete this item. */
950                 }
951                 /* Calculate first position and size for cutting from item. */
952                 pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
953                 *cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
954
955                 return M_CUT;   /* Cut from this item. */
956         }
957
958         /* old file: items may have any length */
959
960         if (new_file_length < le_ih_k_offset(le_ih)) {
961                 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
962                 return M_DELETE;        /* Delete this item. */
963         }
964
965         /* Calculate first position and size for cutting from item. */
966         *cut_size = -(ih_item_len(le_ih) -
967                       (pos_in_item(path) =
968                        new_file_length + 1 - le_ih_k_offset(le_ih)));
969         return M_CUT;           /* Cut from this item. */
970 }
971
972 static inline int prepare_for_direntry_item(struct treepath *path,
973                                             struct item_head *le_ih,
974                                             struct inode *inode,
975                                             loff_t new_file_length,
976                                             int *cut_size)
977 {
978         if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
979             new_file_length == max_reiserfs_offset(inode)) {
980                 RFALSE(ih_entry_count(le_ih) != 2,
981                        "PAP-5220: incorrect empty directory item (%h)", le_ih);
982                 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
983                 /* Delete the directory item containing "." and ".." entry. */
984                 return M_DELETE;
985         }
986
987         if (ih_entry_count(le_ih) == 1) {
988                 /*
989                  * Delete the directory item such as there is one record only
990                  * in this item
991                  */
992                 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
993                 return M_DELETE;
994         }
995
996         /* Cut one record from the directory item. */
997         *cut_size =
998             -(DEH_SIZE +
999               entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
1000         return M_CUT;
1001 }
1002
1003 #define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
1004
1005 /*
1006  * If the path points to a directory or direct item, calculate mode
1007  * and the size cut, for balance.
1008  * If the path points to an indirect item, remove some number of its
1009  * unformatted nodes.
1010  * In case of file truncate calculate whether this item must be
1011  * deleted/truncated or last unformatted node of this item will be
1012  * converted to a direct item.
1013  * This function returns a determination of what balance mode the
1014  * calling function should employ.
1015  */
1016 static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
1017                                       struct inode *inode,
1018                                       struct treepath *path,
1019                                       const struct cpu_key *item_key,
1020                                       /*
1021                                        * Number of unformatted nodes
1022                                        * which were removed from end
1023                                        * of the file.
1024                                        */
1025                                       int *removed,
1026                                       int *cut_size,
1027                                       /* MAX_KEY_OFFSET in case of delete. */
1028                                       unsigned long long new_file_length
1029     )
1030 {
1031         struct super_block *sb = inode->i_sb;
1032         struct item_head *p_le_ih = tp_item_head(path);
1033         struct buffer_head *bh = PATH_PLAST_BUFFER(path);
1034
1035         BUG_ON(!th->t_trans_id);
1036
1037         /* Stat_data item. */
1038         if (is_statdata_le_ih(p_le_ih)) {
1039
1040                 RFALSE(new_file_length != max_reiserfs_offset(inode),
1041                        "PAP-5210: mode must be M_DELETE");
1042
1043                 *cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
1044                 return M_DELETE;
1045         }
1046
1047         /* Directory item. */
1048         if (is_direntry_le_ih(p_le_ih))
1049                 return prepare_for_direntry_item(path, p_le_ih, inode,
1050                                                  new_file_length,
1051                                                  cut_size);
1052
1053         /* Direct item. */
1054         if (is_direct_le_ih(p_le_ih))
1055                 return prepare_for_direct_item(path, p_le_ih, inode,
1056                                                new_file_length, cut_size);
1057
1058         /* Case of an indirect item. */
1059         {
1060             int blk_size = sb->s_blocksize;
1061             struct item_head s_ih;
1062             int need_re_search;
1063             int delete = 0;
1064             int result = M_CUT;
1065             int pos = 0;
1066
1067             if ( new_file_length == max_reiserfs_offset (inode) ) {
1068                 /*
1069                  * prepare_for_delete_or_cut() is called by
1070                  * reiserfs_delete_item()
1071                  */
1072                 new_file_length = 0;
1073                 delete = 1;
1074             }
1075
1076             do {
1077                 need_re_search = 0;
1078                 *cut_size = 0;
1079                 bh = PATH_PLAST_BUFFER(path);
1080                 copy_item_head(&s_ih, tp_item_head(path));
1081                 pos = I_UNFM_NUM(&s_ih);
1082
1083                 while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
1084                     __le32 *unfm;
1085                     __u32 block;
1086
1087                     /*
1088                      * Each unformatted block deletion may involve
1089                      * one additional bitmap block into the transaction,
1090                      * thereby the initial journal space reservation
1091                      * might not be enough.
1092                      */
1093                     if (!delete && (*cut_size) != 0 &&
1094                         reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
1095                         break;
1096
1097                     unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
1098                     block = get_block_num(unfm, 0);
1099
1100                     if (block != 0) {
1101                         reiserfs_prepare_for_journal(sb, bh, 1);
1102                         put_block_num(unfm, 0, 0);
1103                         journal_mark_dirty(th, bh);
1104                         reiserfs_free_block(th, inode, block, 1);
1105                     }
1106
1107                     reiserfs_cond_resched(sb);
1108
1109                     if (item_moved (&s_ih, path))  {
1110                         need_re_search = 1;
1111                         break;
1112                     }
1113
1114                     pos --;
1115                     (*removed)++;
1116                     (*cut_size) -= UNFM_P_SIZE;
1117
1118                     if (pos == 0) {
1119                         (*cut_size) -= IH_SIZE;
1120                         result = M_DELETE;
1121                         break;
1122                     }
1123                 }
1124                 /*
1125                  * a trick.  If the buffer has been logged, this will
1126                  * do nothing.  If we've broken the loop without logging
1127                  * it, it will restore the buffer
1128                  */
1129                 reiserfs_restore_prepared_buffer(sb, bh);
1130             } while (need_re_search &&
1131                      search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
1132             pos_in_item(path) = pos * UNFM_P_SIZE;
1133
1134             if (*cut_size == 0) {
1135                 /*
1136                  * Nothing was cut. maybe convert last unformatted node to the
1137                  * direct item?
1138                  */
1139                 result = M_CONVERT;
1140             }
1141             return result;
1142         }
1143 }
1144
1145 /* Calculate number of bytes which will be deleted or cut during balance */
1146 static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
1147 {
1148         int del_size;
1149         struct item_head *p_le_ih = tp_item_head(tb->tb_path);
1150
1151         if (is_statdata_le_ih(p_le_ih))
1152                 return 0;
1153
1154         del_size =
1155             (mode ==
1156              M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
1157         if (is_direntry_le_ih(p_le_ih)) {
1158                 /*
1159                  * return EMPTY_DIR_SIZE; We delete emty directories only.
1160                  * we can't use EMPTY_DIR_SIZE, as old format dirs have a
1161                  * different empty size.  ick. FIXME, is this right?
1162                  */
1163                 return del_size;
1164         }
1165
1166         if (is_indirect_le_ih(p_le_ih))
1167                 del_size = (del_size / UNFM_P_SIZE) *
1168                                 (PATH_PLAST_BUFFER(tb->tb_path)->b_size);
1169         return del_size;
1170 }
1171
1172 static void init_tb_struct(struct reiserfs_transaction_handle *th,
1173                            struct tree_balance *tb,
1174                            struct super_block *sb,
1175                            struct treepath *path, int size)
1176 {
1177
1178         BUG_ON(!th->t_trans_id);
1179
1180         memset(tb, '\0', sizeof(struct tree_balance));
1181         tb->transaction_handle = th;
1182         tb->tb_sb = sb;
1183         tb->tb_path = path;
1184         PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
1185         PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
1186         tb->insert_size[0] = size;
1187 }
1188
1189 void padd_item(char *item, int total_length, int length)
1190 {
1191         int i;
1192
1193         for (i = total_length; i > length;)
1194                 item[--i] = 0;
1195 }
1196
1197 #ifdef REISERQUOTA_DEBUG
1198 char key2type(struct reiserfs_key *ih)
1199 {
1200         if (is_direntry_le_key(2, ih))
1201                 return 'd';
1202         if (is_direct_le_key(2, ih))
1203                 return 'D';
1204         if (is_indirect_le_key(2, ih))
1205                 return 'i';
1206         if (is_statdata_le_key(2, ih))
1207                 return 's';
1208         return 'u';
1209 }
1210
1211 char head2type(struct item_head *ih)
1212 {
1213         if (is_direntry_le_ih(ih))
1214                 return 'd';
1215         if (is_direct_le_ih(ih))
1216                 return 'D';
1217         if (is_indirect_le_ih(ih))
1218                 return 'i';
1219         if (is_statdata_le_ih(ih))
1220                 return 's';
1221         return 'u';
1222 }
1223 #endif
1224
1225 /*
1226  * Delete object item.
1227  * th       - active transaction handle
1228  * path     - path to the deleted item
1229  * item_key - key to search for the deleted item
1230  * indode   - used for updating i_blocks and quotas
1231  * un_bh    - NULL or unformatted node pointer
1232  */
1233 int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1234                          struct treepath *path, const struct cpu_key *item_key,
1235                          struct inode *inode, struct buffer_head *un_bh)
1236 {
1237         struct super_block *sb = inode->i_sb;
1238         struct tree_balance s_del_balance;
1239         struct item_head s_ih;
1240         struct item_head *q_ih;
1241         int quota_cut_bytes;
1242         int ret_value, del_size, removed;
1243         int depth;
1244
1245 #ifdef CONFIG_REISERFS_CHECK
1246         char mode;
1247         int iter = 0;
1248 #endif
1249
1250         BUG_ON(!th->t_trans_id);
1251
1252         init_tb_struct(th, &s_del_balance, sb, path,
1253                        0 /*size is unknown */ );
1254
1255         while (1) {
1256                 removed = 0;
1257
1258 #ifdef CONFIG_REISERFS_CHECK
1259                 iter++;
1260                 mode =
1261 #endif
1262                     prepare_for_delete_or_cut(th, inode, path,
1263                                               item_key, &removed,
1264                                               &del_size,
1265                                               max_reiserfs_offset(inode));
1266
1267                 RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
1268
1269                 copy_item_head(&s_ih, tp_item_head(path));
1270                 s_del_balance.insert_size[0] = del_size;
1271
1272                 ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
1273                 if (ret_value != REPEAT_SEARCH)
1274                         break;
1275
1276                 PROC_INFO_INC(sb, delete_item_restarted);
1277
1278                 /* file system changed, repeat search */
1279                 ret_value =
1280                     search_for_position_by_key(sb, item_key, path);
1281                 if (ret_value == IO_ERROR)
1282                         break;
1283                 if (ret_value == FILE_NOT_FOUND) {
1284                         reiserfs_warning(sb, "vs-5340",
1285                                          "no items of the file %K found",
1286                                          item_key);
1287                         break;
1288                 }
1289         }                       /* while (1) */
1290
1291         if (ret_value != CARRY_ON) {
1292                 unfix_nodes(&s_del_balance);
1293                 return 0;
1294         }
1295
1296         /* reiserfs_delete_item returns item length when success */
1297         ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
1298         q_ih = tp_item_head(path);
1299         quota_cut_bytes = ih_item_len(q_ih);
1300
1301         /*
1302          * hack so the quota code doesn't have to guess if the file has a
1303          * tail.  On tail insert, we allocate quota for 1 unformatted node.
1304          * We test the offset because the tail might have been
1305          * split into multiple items, and we only want to decrement for
1306          * the unfm node once
1307          */
1308         if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
1309                 if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
1310                         quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
1311                 } else {
1312                         quota_cut_bytes = 0;
1313                 }
1314         }
1315
1316         if (un_bh) {
1317                 int off;
1318                 char *data;
1319
1320                 /*
1321                  * We are in direct2indirect conversion, so move tail contents
1322                  * to the unformatted node
1323                  */
1324                 /*
1325                  * note, we do the copy before preparing the buffer because we
1326                  * don't care about the contents of the unformatted node yet.
1327                  * the only thing we really care about is the direct item's
1328                  * data is in the unformatted node.
1329                  *
1330                  * Otherwise, we would have to call
1331                  * reiserfs_prepare_for_journal on the unformatted node,
1332                  * which might schedule, meaning we'd have to loop all the
1333                  * way back up to the start of the while loop.
1334                  *
1335                  * The unformatted node must be dirtied later on.  We can't be
1336                  * sure here if the entire tail has been deleted yet.
1337                  *
1338                  * un_bh is from the page cache (all unformatted nodes are
1339                  * from the page cache) and might be a highmem page.  So, we
1340                  * can't use un_bh->b_data.
1341                  * -clm
1342                  */
1343
1344                 data = kmap_atomic(un_bh->b_page);
1345                 off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
1346                 memcpy(data + off,
1347                        ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
1348                        ret_value);
1349                 kunmap_atomic(data);
1350         }
1351
1352         /* Perform balancing after all resources have been collected at once. */
1353         do_balance(&s_del_balance, NULL, NULL, M_DELETE);
1354
1355 #ifdef REISERQUOTA_DEBUG
1356         reiserfs_debug(sb, REISERFS_DEBUG_CODE,
1357                        "reiserquota delete_item(): freeing %u, id=%u type=%c",
1358                        quota_cut_bytes, inode->i_uid, head2type(&s_ih));
1359 #endif
1360         depth = reiserfs_write_unlock_nested(inode->i_sb);
1361         dquot_free_space_nodirty(inode, quota_cut_bytes);
1362         reiserfs_write_lock_nested(inode->i_sb, depth);
1363
1364         /* Return deleted body length */
1365         return ret_value;
1366 }
1367
1368 /*
1369  * Summary Of Mechanisms For Handling Collisions Between Processes:
1370  *
1371  *  deletion of the body of the object is performed by iput(), with the
1372  *  result that if multiple processes are operating on a file, the
1373  *  deletion of the body of the file is deferred until the last process
1374  *  that has an open inode performs its iput().
1375  *
1376  *  writes and truncates are protected from collisions by use of
1377  *  semaphores.
1378  *
1379  *  creates, linking, and mknod are protected from collisions with other
1380  *  processes by making the reiserfs_add_entry() the last step in the
1381  *  creation, and then rolling back all changes if there was a collision.
1382  *  - Hans
1383 */
1384
1385 /* this deletes item which never gets split */
1386 void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1387                                 struct inode *inode, struct reiserfs_key *key)
1388 {
1389         struct super_block *sb = th->t_super;
1390         struct tree_balance tb;
1391         INITIALIZE_PATH(path);
1392         int item_len = 0;
1393         int tb_init = 0;
1394         struct cpu_key cpu_key;
1395         int retval;
1396         int quota_cut_bytes = 0;
1397
1398         BUG_ON(!th->t_trans_id);
1399
1400         le_key2cpu_key(&cpu_key, key);
1401
1402         while (1) {
1403                 retval = search_item(th->t_super, &cpu_key, &path);
1404                 if (retval == IO_ERROR) {
1405                         reiserfs_error(th->t_super, "vs-5350",
1406                                        "i/o failure occurred trying "
1407                                        "to delete %K", &cpu_key);
1408                         break;
1409                 }
1410                 if (retval != ITEM_FOUND) {
1411                         pathrelse(&path);
1412                         /*
1413                          * No need for a warning, if there is just no free
1414                          * space to insert '..' item into the
1415                          * newly-created subdir
1416                          */
1417                         if (!
1418                             ((unsigned long long)
1419                              GET_HASH_VALUE(le_key_k_offset
1420                                             (le_key_version(key), key)) == 0
1421                              && (unsigned long long)
1422                              GET_GENERATION_NUMBER(le_key_k_offset
1423                                                    (le_key_version(key),
1424                                                     key)) == 1))
1425                                 reiserfs_warning(th->t_super, "vs-5355",
1426                                                  "%k not found", key);
1427                         break;
1428                 }
1429                 if (!tb_init) {
1430                         tb_init = 1;
1431                         item_len = ih_item_len(tp_item_head(&path));
1432                         init_tb_struct(th, &tb, th->t_super, &path,
1433                                        -(IH_SIZE + item_len));
1434                 }
1435                 quota_cut_bytes = ih_item_len(tp_item_head(&path));
1436
1437                 retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
1438                 if (retval == REPEAT_SEARCH) {
1439                         PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
1440                         continue;
1441                 }
1442
1443                 if (retval == CARRY_ON) {
1444                         do_balance(&tb, NULL, NULL, M_DELETE);
1445                         /*
1446                          * Should we count quota for item? (we don't
1447                          * count quotas for save-links)
1448                          */
1449                         if (inode) {
1450                                 int depth;
1451 #ifdef REISERQUOTA_DEBUG
1452                                 reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
1453                                                "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
1454                                                quota_cut_bytes, inode->i_uid,
1455                                                key2type(key));
1456 #endif
1457                                 depth = reiserfs_write_unlock_nested(sb);
1458                                 dquot_free_space_nodirty(inode,
1459                                                          quota_cut_bytes);
1460                                 reiserfs_write_lock_nested(sb, depth);
1461                         }
1462                         break;
1463                 }
1464
1465                 /* IO_ERROR, NO_DISK_SPACE, etc */
1466                 reiserfs_warning(th->t_super, "vs-5360",
1467                                  "could not delete %K due to fix_nodes failure",
1468                                  &cpu_key);
1469                 unfix_nodes(&tb);
1470                 break;
1471         }
1472
1473         reiserfs_check_path(&path);
1474 }
1475
1476 int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
1477                            struct inode *inode)
1478 {
1479         int err;
1480         inode->i_size = 0;
1481         BUG_ON(!th->t_trans_id);
1482
1483         /* for directory this deletes item containing "." and ".." */
1484         err =
1485             reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
1486         if (err)
1487                 return err;
1488
1489 #if defined( USE_INODE_GENERATION_COUNTER )
1490         if (!old_format_only(th->t_super)) {
1491                 __le32 *inode_generation;
1492
1493                 inode_generation =
1494                     &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
1495                 le32_add_cpu(inode_generation, 1);
1496         }
1497 /* USE_INODE_GENERATION_COUNTER */
1498 #endif
1499         reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
1500
1501         return err;
1502 }
1503
1504 static void unmap_buffers(struct page *page, loff_t pos)
1505 {
1506         struct buffer_head *bh;
1507         struct buffer_head *head;
1508         struct buffer_head *next;
1509         unsigned long tail_index;
1510         unsigned long cur_index;
1511
1512         if (page) {
1513                 if (page_has_buffers(page)) {
1514                         tail_index = pos & (PAGE_SIZE - 1);
1515                         cur_index = 0;
1516                         head = page_buffers(page);
1517                         bh = head;
1518                         do {
1519                                 next = bh->b_this_page;
1520
1521                                 /*
1522                                  * we want to unmap the buffers that contain
1523                                  * the tail, and all the buffers after it
1524                                  * (since the tail must be at the end of the
1525                                  * file).  We don't want to unmap file data
1526                                  * before the tail, since it might be dirty
1527                                  * and waiting to reach disk
1528                                  */
1529                                 cur_index += bh->b_size;
1530                                 if (cur_index > tail_index) {
1531                                         reiserfs_unmap_buffer(bh);
1532                                 }
1533                                 bh = next;
1534                         } while (bh != head);
1535                 }
1536         }
1537 }
1538
1539 static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
1540                                     struct inode *inode,
1541                                     struct page *page,
1542                                     struct treepath *path,
1543                                     const struct cpu_key *item_key,
1544                                     loff_t new_file_size, char *mode)
1545 {
1546         struct super_block *sb = inode->i_sb;
1547         int block_size = sb->s_blocksize;
1548         int cut_bytes;
1549         BUG_ON(!th->t_trans_id);
1550         BUG_ON(new_file_size != inode->i_size);
1551
1552         /*
1553          * the page being sent in could be NULL if there was an i/o error
1554          * reading in the last block.  The user will hit problems trying to
1555          * read the file, but for now we just skip the indirect2direct
1556          */
1557         if (atomic_read(&inode->i_count) > 1 ||
1558             !tail_has_to_be_packed(inode) ||
1559             !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
1560                 /* leave tail in an unformatted node */
1561                 *mode = M_SKIP_BALANCING;
1562                 cut_bytes =
1563                     block_size - (new_file_size & (block_size - 1));
1564                 pathrelse(path);
1565                 return cut_bytes;
1566         }
1567
1568         /* Perform the conversion to a direct_item. */
1569         return indirect2direct(th, inode, page, path, item_key,
1570                                new_file_size, mode);
1571 }
1572
1573 /*
1574  * we did indirect_to_direct conversion. And we have inserted direct
1575  * item successesfully, but there were no disk space to cut unfm
1576  * pointer being converted. Therefore we have to delete inserted
1577  * direct item(s)
1578  */
1579 static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
1580                                          struct inode *inode, struct treepath *path)
1581 {
1582         struct cpu_key tail_key;
1583         int tail_len;
1584         int removed;
1585         BUG_ON(!th->t_trans_id);
1586
1587         make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
1588         tail_key.key_length = 4;
1589
1590         tail_len =
1591             (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
1592         while (tail_len) {
1593                 /* look for the last byte of the tail */
1594                 if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
1595                     POSITION_NOT_FOUND)
1596                         reiserfs_panic(inode->i_sb, "vs-5615",
1597                                        "found invalid item");
1598                 RFALSE(path->pos_in_item !=
1599                        ih_item_len(tp_item_head(path)) - 1,
1600                        "vs-5616: appended bytes found");
1601                 PATH_LAST_POSITION(path)--;
1602
1603                 removed =
1604                     reiserfs_delete_item(th, path, &tail_key, inode,
1605                                          NULL /*unbh not needed */ );
1606                 RFALSE(removed <= 0
1607                        || removed > tail_len,
1608                        "vs-5617: there was tail %d bytes, removed item length %d bytes",
1609                        tail_len, removed);
1610                 tail_len -= removed;
1611                 set_cpu_key_k_offset(&tail_key,
1612                                      cpu_key_k_offset(&tail_key) - removed);
1613         }
1614         reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
1615                          "conversion has been rolled back due to "
1616                          "lack of disk space");
1617         mark_inode_dirty(inode);
1618 }
1619
1620 /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
1621 int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1622                            struct treepath *path,
1623                            struct cpu_key *item_key,
1624                            struct inode *inode,
1625                            struct page *page, loff_t new_file_size)
1626 {
1627         struct super_block *sb = inode->i_sb;
1628         /*
1629          * Every function which is going to call do_balance must first
1630          * create a tree_balance structure.  Then it must fill up this
1631          * structure by using the init_tb_struct and fix_nodes functions.
1632          * After that we can make tree balancing.
1633          */
1634         struct tree_balance s_cut_balance;
1635         struct item_head *p_le_ih;
1636         int cut_size = 0;       /* Amount to be cut. */
1637         int ret_value = CARRY_ON;
1638         int removed = 0;        /* Number of the removed unformatted nodes. */
1639         int is_inode_locked = 0;
1640         char mode;              /* Mode of the balance. */
1641         int retval2 = -1;
1642         int quota_cut_bytes;
1643         loff_t tail_pos = 0;
1644         int depth;
1645
1646         BUG_ON(!th->t_trans_id);
1647
1648         init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
1649                        cut_size);
1650
1651         /*
1652          * Repeat this loop until we either cut the item without needing
1653          * to balance, or we fix_nodes without schedule occurring
1654          */
1655         while (1) {
1656                 /*
1657                  * Determine the balance mode, position of the first byte to
1658                  * be cut, and size to be cut.  In case of the indirect item
1659                  * free unformatted nodes which are pointed to by the cut
1660                  * pointers.
1661                  */
1662
1663                 mode =
1664                     prepare_for_delete_or_cut(th, inode, path,
1665                                               item_key, &removed,
1666                                               &cut_size, new_file_size);
1667                 if (mode == M_CONVERT) {
1668                         /*
1669                          * convert last unformatted node to direct item or
1670                          * leave tail in the unformatted node
1671                          */
1672                         RFALSE(ret_value != CARRY_ON,
1673                                "PAP-5570: can not convert twice");
1674
1675                         ret_value =
1676                             maybe_indirect_to_direct(th, inode, page,
1677                                                      path, item_key,
1678                                                      new_file_size, &mode);
1679                         if (mode == M_SKIP_BALANCING)
1680                                 /* tail has been left in the unformatted node */
1681                                 return ret_value;
1682
1683                         is_inode_locked = 1;
1684
1685                         /*
1686                          * removing of last unformatted node will
1687                          * change value we have to return to truncate.
1688                          * Save it
1689                          */
1690                         retval2 = ret_value;
1691
1692                         /*
1693                          * So, we have performed the first part of the
1694                          * conversion:
1695                          * inserting the new direct item.  Now we are
1696                          * removing the last unformatted node pointer.
1697                          * Set key to search for it.
1698                          */
1699                         set_cpu_key_k_type(item_key, TYPE_INDIRECT);
1700                         item_key->key_length = 4;
1701                         new_file_size -=
1702                             (new_file_size & (sb->s_blocksize - 1));
1703                         tail_pos = new_file_size;
1704                         set_cpu_key_k_offset(item_key, new_file_size + 1);
1705                         if (search_for_position_by_key
1706                             (sb, item_key,
1707                              path) == POSITION_NOT_FOUND) {
1708                                 print_block(PATH_PLAST_BUFFER(path), 3,
1709                                             PATH_LAST_POSITION(path) - 1,
1710                                             PATH_LAST_POSITION(path) + 1);
1711                                 reiserfs_panic(sb, "PAP-5580", "item to "
1712                                                "convert does not exist (%K)",
1713                                                item_key);
1714                         }
1715                         continue;
1716                 }
1717                 if (cut_size == 0) {
1718                         pathrelse(path);
1719                         return 0;
1720                 }
1721
1722                 s_cut_balance.insert_size[0] = cut_size;
1723
1724                 ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
1725                 if (ret_value != REPEAT_SEARCH)
1726                         break;
1727
1728                 PROC_INFO_INC(sb, cut_from_item_restarted);
1729
1730                 ret_value =
1731                     search_for_position_by_key(sb, item_key, path);
1732                 if (ret_value == POSITION_FOUND)
1733                         continue;
1734
1735                 reiserfs_warning(sb, "PAP-5610", "item %K not found",
1736                                  item_key);
1737                 unfix_nodes(&s_cut_balance);
1738                 return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
1739         }                       /* while */
1740
1741         /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
1742         if (ret_value != CARRY_ON) {
1743                 if (is_inode_locked) {
1744                         /*
1745                          * FIXME: this seems to be not needed: we are always
1746                          * able to cut item
1747                          */
1748                         indirect_to_direct_roll_back(th, inode, path);
1749                 }
1750                 if (ret_value == NO_DISK_SPACE)
1751                         reiserfs_warning(sb, "reiserfs-5092",
1752                                          "NO_DISK_SPACE");
1753                 unfix_nodes(&s_cut_balance);
1754                 return -EIO;
1755         }
1756
1757         /* go ahead and perform balancing */
1758
1759         RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
1760
1761         /* Calculate number of bytes that need to be cut from the item. */
1762         quota_cut_bytes =
1763             (mode ==
1764              M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
1765             insert_size[0];
1766         if (retval2 == -1)
1767                 ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
1768         else
1769                 ret_value = retval2;
1770
1771         /*
1772          * For direct items, we only change the quota when deleting the last
1773          * item.
1774          */
1775         p_le_ih = tp_item_head(s_cut_balance.tb_path);
1776         if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
1777                 if (mode == M_DELETE &&
1778                     (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
1779                     1) {
1780                         /* FIXME: this is to keep 3.5 happy */
1781                         REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
1782                         quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
1783                 } else {
1784                         quota_cut_bytes = 0;
1785                 }
1786         }
1787 #ifdef CONFIG_REISERFS_CHECK
1788         if (is_inode_locked) {
1789                 struct item_head *le_ih =
1790                     tp_item_head(s_cut_balance.tb_path);
1791                 /*
1792                  * we are going to complete indirect2direct conversion. Make
1793                  * sure, that we exactly remove last unformatted node pointer
1794                  * of the item
1795                  */
1796                 if (!is_indirect_le_ih(le_ih))
1797                         reiserfs_panic(sb, "vs-5652",
1798                                        "item must be indirect %h", le_ih);
1799
1800                 if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
1801                         reiserfs_panic(sb, "vs-5653", "completing "
1802                                        "indirect2direct conversion indirect "
1803                                        "item %h being deleted must be of "
1804                                        "4 byte long", le_ih);
1805
1806                 if (mode == M_CUT
1807                     && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
1808                         reiserfs_panic(sb, "vs-5654", "can not complete "
1809                                        "indirect2direct conversion of %h "
1810                                        "(CUT, insert_size==%d)",
1811                                        le_ih, s_cut_balance.insert_size[0]);
1812                 }
1813                 /*
1814                  * it would be useful to make sure, that right neighboring
1815                  * item is direct item of this file
1816                  */
1817         }
1818 #endif
1819
1820         do_balance(&s_cut_balance, NULL, NULL, mode);
1821         if (is_inode_locked) {
1822                 /*
1823                  * we've done an indirect->direct conversion.  when the
1824                  * data block was freed, it was removed from the list of
1825                  * blocks that must be flushed before the transaction
1826                  * commits, make sure to unmap and invalidate it
1827                  */
1828                 unmap_buffers(page, tail_pos);
1829                 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
1830         }
1831 #ifdef REISERQUOTA_DEBUG
1832         reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
1833                        "reiserquota cut_from_item(): freeing %u id=%u type=%c",
1834                        quota_cut_bytes, inode->i_uid, '?');
1835 #endif
1836         depth = reiserfs_write_unlock_nested(sb);
1837         dquot_free_space_nodirty(inode, quota_cut_bytes);
1838         reiserfs_write_lock_nested(sb, depth);
1839         return ret_value;
1840 }
1841
1842 static void truncate_directory(struct reiserfs_transaction_handle *th,
1843                                struct inode *inode)
1844 {
1845         BUG_ON(!th->t_trans_id);
1846         if (inode->i_nlink)
1847                 reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
1848
1849         set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
1850         set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
1851         reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
1852         reiserfs_update_sd(th, inode);
1853         set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
1854         set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
1855 }
1856
1857 /*
1858  * Truncate file to the new size. Note, this must be called with a
1859  * transaction already started
1860  */
1861 int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
1862                          struct inode *inode,   /* ->i_size contains new size */
1863                          struct page *page,     /* up to date for last block */
1864                          /*
1865                           * when it is called by file_release to convert
1866                           * the tail - no timestamps should be updated
1867                           */
1868                          int update_timestamps
1869     )
1870 {
1871         INITIALIZE_PATH(s_search_path); /* Path to the current object item. */
1872         struct item_head *p_le_ih;      /* Pointer to an item header. */
1873
1874         /* Key to search for a previous file item. */
1875         struct cpu_key s_item_key;
1876         loff_t file_size,       /* Old file size. */
1877          new_file_size; /* New file size. */
1878         int deleted;            /* Number of deleted or truncated bytes. */
1879         int retval;
1880         int err = 0;
1881
1882         BUG_ON(!th->t_trans_id);
1883         if (!
1884             (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1885              || S_ISLNK(inode->i_mode)))
1886                 return 0;
1887
1888         /* deletion of directory - no need to update timestamps */
1889         if (S_ISDIR(inode->i_mode)) {
1890                 truncate_directory(th, inode);
1891                 return 0;
1892         }
1893
1894         /* Get new file size. */
1895         new_file_size = inode->i_size;
1896
1897         /* FIXME: note, that key type is unimportant here */
1898         make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
1899                      TYPE_DIRECT, 3);
1900
1901         retval =
1902             search_for_position_by_key(inode->i_sb, &s_item_key,
1903                                        &s_search_path);
1904         if (retval == IO_ERROR) {
1905                 reiserfs_error(inode->i_sb, "vs-5657",
1906                                "i/o failure occurred trying to truncate %K",
1907                                &s_item_key);
1908                 err = -EIO;
1909                 goto out;
1910         }
1911         if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
1912                 reiserfs_error(inode->i_sb, "PAP-5660",
1913                                "wrong result %d of search for %K", retval,
1914                                &s_item_key);
1915
1916                 err = -EIO;
1917                 goto out;
1918         }
1919
1920         s_search_path.pos_in_item--;
1921
1922         /* Get real file size (total length of all file items) */
1923         p_le_ih = tp_item_head(&s_search_path);
1924         if (is_statdata_le_ih(p_le_ih))
1925                 file_size = 0;
1926         else {
1927                 loff_t offset = le_ih_k_offset(p_le_ih);
1928                 int bytes =
1929                     op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
1930
1931                 /*
1932                  * this may mismatch with real file size: if last direct item
1933                  * had no padding zeros and last unformatted node had no free
1934                  * space, this file would have this file size
1935                  */
1936                 file_size = offset + bytes - 1;
1937         }
1938         /*
1939          * are we doing a full truncate or delete, if so
1940          * kick in the reada code
1941          */
1942         if (new_file_size == 0)
1943                 s_search_path.reada = PATH_READA | PATH_READA_BACK;
1944
1945         if (file_size == 0 || file_size < new_file_size) {
1946                 goto update_and_out;
1947         }
1948
1949         /* Update key to search for the last file item. */
1950         set_cpu_key_k_offset(&s_item_key, file_size);
1951
1952         do {
1953                 /* Cut or delete file item. */
1954                 deleted =
1955                     reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
1956                                            inode, page, new_file_size);
1957                 if (deleted < 0) {
1958                         reiserfs_warning(inode->i_sb, "vs-5665",
1959                                          "reiserfs_cut_from_item failed");
1960                         reiserfs_check_path(&s_search_path);
1961                         return 0;
1962                 }
1963
1964                 RFALSE(deleted > file_size,
1965                        "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
1966                        deleted, file_size, &s_item_key);
1967
1968                 /* Change key to search the last file item. */
1969                 file_size -= deleted;
1970
1971                 set_cpu_key_k_offset(&s_item_key, file_size);
1972
1973                 /*
1974                  * While there are bytes to truncate and previous
1975                  * file item is presented in the tree.
1976                  */
1977
1978                 /*
1979                  * This loop could take a really long time, and could log
1980                  * many more blocks than a transaction can hold.  So, we do
1981                  * a polite journal end here, and if the transaction needs
1982                  * ending, we make sure the file is consistent before ending
1983                  * the current trans and starting a new one
1984                  */
1985                 if (journal_transaction_should_end(th, 0) ||
1986                     reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
1987                         pathrelse(&s_search_path);
1988
1989                         if (update_timestamps) {
1990                                 inode->i_mtime = CURRENT_TIME_SEC;
1991                                 inode->i_ctime = CURRENT_TIME_SEC;
1992                         }
1993                         reiserfs_update_sd(th, inode);
1994
1995                         err = journal_end(th);
1996                         if (err)
1997                                 goto out;
1998                         err = journal_begin(th, inode->i_sb,
1999                                             JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
2000                         if (err)
2001                                 goto out;
2002                         reiserfs_update_inode_transaction(inode);
2003                 }
2004         } while (file_size > ROUND_UP(new_file_size) &&
2005                  search_for_position_by_key(inode->i_sb, &s_item_key,
2006                                             &s_search_path) == POSITION_FOUND);
2007
2008         RFALSE(file_size > ROUND_UP(new_file_size),
2009                "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
2010                new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
2011
2012 update_and_out:
2013         if (update_timestamps) {
2014                 /* this is truncate, not file closing */
2015                 inode->i_mtime = CURRENT_TIME_SEC;
2016                 inode->i_ctime = CURRENT_TIME_SEC;
2017         }
2018         reiserfs_update_sd(th, inode);
2019
2020 out:
2021         pathrelse(&s_search_path);
2022         return err;
2023 }
2024
2025 #ifdef CONFIG_REISERFS_CHECK
2026 /* this makes sure, that we __append__, not overwrite or add holes */
2027 static void check_research_for_paste(struct treepath *path,
2028                                      const struct cpu_key *key)
2029 {
2030         struct item_head *found_ih = tp_item_head(path);
2031
2032         if (is_direct_le_ih(found_ih)) {
2033                 if (le_ih_k_offset(found_ih) +
2034                     op_bytes_number(found_ih,
2035                                     get_last_bh(path)->b_size) !=
2036                     cpu_key_k_offset(key)
2037                     || op_bytes_number(found_ih,
2038                                        get_last_bh(path)->b_size) !=
2039                     pos_in_item(path))
2040                         reiserfs_panic(NULL, "PAP-5720", "found direct item "
2041                                        "%h or position (%d) does not match "
2042                                        "to key %K", found_ih,
2043                                        pos_in_item(path), key);
2044         }
2045         if (is_indirect_le_ih(found_ih)) {
2046                 if (le_ih_k_offset(found_ih) +
2047                     op_bytes_number(found_ih,
2048                                     get_last_bh(path)->b_size) !=
2049                     cpu_key_k_offset(key)
2050                     || I_UNFM_NUM(found_ih) != pos_in_item(path)
2051                     || get_ih_free_space(found_ih) != 0)
2052                         reiserfs_panic(NULL, "PAP-5730", "found indirect "
2053                                        "item (%h) or position (%d) does not "
2054                                        "match to key (%K)",
2055                                        found_ih, pos_in_item(path), key);
2056         }
2057 }
2058 #endif                          /* config reiserfs check */
2059
2060 /*
2061  * Paste bytes to the existing item.
2062  * Returns bytes number pasted into the item.
2063  */
2064 int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
2065                              /* Path to the pasted item. */
2066                              struct treepath *search_path,
2067                              /* Key to search for the needed item. */
2068                              const struct cpu_key *key,
2069                              /* Inode item belongs to */
2070                              struct inode *inode,
2071                              /* Pointer to the bytes to paste. */
2072                              const char *body,
2073                              /* Size of pasted bytes. */
2074                              int pasted_size)
2075 {
2076         struct super_block *sb = inode->i_sb;
2077         struct tree_balance s_paste_balance;
2078         int retval;
2079         int fs_gen;
2080         int depth;
2081
2082         BUG_ON(!th->t_trans_id);
2083
2084         fs_gen = get_generation(inode->i_sb);
2085
2086 #ifdef REISERQUOTA_DEBUG
2087         reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
2088                        "reiserquota paste_into_item(): allocating %u id=%u type=%c",
2089                        pasted_size, inode->i_uid,
2090                        key2type(&key->on_disk_key));
2091 #endif
2092
2093         depth = reiserfs_write_unlock_nested(sb);
2094         retval = dquot_alloc_space_nodirty(inode, pasted_size);
2095         reiserfs_write_lock_nested(sb, depth);
2096         if (retval) {
2097                 pathrelse(search_path);
2098                 return retval;
2099         }
2100         init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
2101                        pasted_size);
2102 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
2103         s_paste_balance.key = key->on_disk_key;
2104 #endif
2105
2106         /* DQUOT_* can schedule, must check before the fix_nodes */
2107         if (fs_changed(fs_gen, inode->i_sb)) {
2108                 goto search_again;
2109         }
2110
2111         while ((retval =
2112                 fix_nodes(M_PASTE, &s_paste_balance, NULL,
2113                           body)) == REPEAT_SEARCH) {
2114 search_again:
2115                 /* file system changed while we were in the fix_nodes */
2116                 PROC_INFO_INC(th->t_super, paste_into_item_restarted);
2117                 retval =
2118                     search_for_position_by_key(th->t_super, key,
2119                                                search_path);
2120                 if (retval == IO_ERROR) {
2121                         retval = -EIO;
2122                         goto error_out;
2123                 }
2124                 if (retval == POSITION_FOUND) {
2125                         reiserfs_warning(inode->i_sb, "PAP-5710",
2126                                          "entry or pasted byte (%K) exists",
2127                                          key);
2128                         retval = -EEXIST;
2129                         goto error_out;
2130                 }
2131 #ifdef CONFIG_REISERFS_CHECK
2132                 check_research_for_paste(search_path, key);
2133 #endif
2134         }
2135
2136         /*
2137          * Perform balancing after all resources are collected by fix_nodes,
2138          * and accessing them will not risk triggering schedule.
2139          */
2140         if (retval == CARRY_ON) {
2141                 do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
2142                 return 0;
2143         }
2144         retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
2145 error_out:
2146         /* this also releases the path */
2147         unfix_nodes(&s_paste_balance);
2148 #ifdef REISERQUOTA_DEBUG
2149         reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
2150                        "reiserquota paste_into_item(): freeing %u id=%u type=%c",
2151                        pasted_size, inode->i_uid,
2152                        key2type(&key->on_disk_key));
2153 #endif
2154         depth = reiserfs_write_unlock_nested(sb);
2155         dquot_free_space_nodirty(inode, pasted_size);
2156         reiserfs_write_lock_nested(sb, depth);
2157         return retval;
2158 }
2159
2160 /*
2161  * Insert new item into the buffer at the path.
2162  * th   - active transaction handle
2163  * path - path to the inserted item
2164  * ih   - pointer to the item header to insert
2165  * body - pointer to the bytes to insert
2166  */
2167 int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2168                          struct treepath *path, const struct cpu_key *key,
2169                          struct item_head *ih, struct inode *inode,
2170                          const char *body)
2171 {
2172         struct tree_balance s_ins_balance;
2173         int retval;
2174         int fs_gen = 0;
2175         int quota_bytes = 0;
2176
2177         BUG_ON(!th->t_trans_id);
2178
2179         if (inode) {            /* Do we count quotas for item? */
2180                 int depth;
2181                 fs_gen = get_generation(inode->i_sb);
2182                 quota_bytes = ih_item_len(ih);
2183
2184                 /*
2185                  * hack so the quota code doesn't have to guess
2186                  * if the file has a tail, links are always tails,
2187                  * so there's no guessing needed
2188                  */
2189                 if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
2190                         quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
2191 #ifdef REISERQUOTA_DEBUG
2192                 reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
2193                                "reiserquota insert_item(): allocating %u id=%u type=%c",
2194                                quota_bytes, inode->i_uid, head2type(ih));
2195 #endif
2196                 /*
2197                  * We can't dirty inode here. It would be immediately
2198                  * written but appropriate stat item isn't inserted yet...
2199                  */
2200                 depth = reiserfs_write_unlock_nested(inode->i_sb);
2201                 retval = dquot_alloc_space_nodirty(inode, quota_bytes);
2202                 reiserfs_write_lock_nested(inode->i_sb, depth);
2203                 if (retval) {
2204                         pathrelse(path);
2205                         return retval;
2206                 }
2207         }
2208         init_tb_struct(th, &s_ins_balance, th->t_super, path,
2209                        IH_SIZE + ih_item_len(ih));
2210 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
2211         s_ins_balance.key = key->on_disk_key;
2212 #endif
2213         /*
2214          * DQUOT_* can schedule, must check to be sure calling
2215          * fix_nodes is safe
2216          */
2217         if (inode && fs_changed(fs_gen, inode->i_sb)) {
2218                 goto search_again;
2219         }
2220
2221         while ((retval =
2222                 fix_nodes(M_INSERT, &s_ins_balance, ih,
2223                           body)) == REPEAT_SEARCH) {
2224 search_again:
2225                 /* file system changed while we were in the fix_nodes */
2226                 PROC_INFO_INC(th->t_super, insert_item_restarted);
2227                 retval = search_item(th->t_super, key, path);
2228                 if (retval == IO_ERROR) {
2229                         retval = -EIO;
2230                         goto error_out;
2231                 }
2232                 if (retval == ITEM_FOUND) {
2233                         reiserfs_warning(th->t_super, "PAP-5760",
2234                                          "key %K already exists in the tree",
2235                                          key);
2236                         retval = -EEXIST;
2237                         goto error_out;
2238                 }
2239         }
2240
2241         /* make balancing after all resources will be collected at a time */
2242         if (retval == CARRY_ON) {
2243                 do_balance(&s_ins_balance, ih, body, M_INSERT);
2244                 return 0;
2245         }
2246
2247         retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
2248 error_out:
2249         /* also releases the path */
2250         unfix_nodes(&s_ins_balance);
2251 #ifdef REISERQUOTA_DEBUG
2252         reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
2253                        "reiserquota insert_item(): freeing %u id=%u type=%c",
2254                        quota_bytes, inode->i_uid, head2type(ih));
2255 #endif
2256         if (inode) {
2257                 int depth = reiserfs_write_unlock_nested(inode->i_sb);
2258                 dquot_free_space_nodirty(inode, quota_bytes);
2259                 reiserfs_write_lock_nested(inode->i_sb, depth);
2260         }
2261         return retval;
2262 }