]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - fs/btrfs/ctree.c
btrfs: expand btrfs_find_item if found_key is NULL
[karo-tx-linux.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31                       *root, struct btrfs_key *ins_key,
32                       struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34                           struct btrfs_root *root, struct extent_buffer *dst,
35                           struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37                               struct btrfs_root *root,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41                     int level, int slot);
42 static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43                                  struct extent_buffer *eb);
44
45 struct btrfs_path *btrfs_alloc_path(void)
46 {
47         struct btrfs_path *path;
48         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
49         return path;
50 }
51
52 /*
53  * set all locked nodes in the path to blocking locks.  This should
54  * be done before scheduling
55  */
56 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
57 {
58         int i;
59         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
60                 if (!p->nodes[i] || !p->locks[i])
61                         continue;
62                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
63                 if (p->locks[i] == BTRFS_READ_LOCK)
64                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
65                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
66                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
67         }
68 }
69
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held.
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		/*
		 * mark the held lock blocking first, and remember which
		 * blocking variant it became so we can clear it again below
		 */
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	/* retake spinning locks top-down (deepest level first) */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	/* finally return the held lock to its spinning state as well */
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
105
106 /* this also releases the path */
107 void btrfs_free_path(struct btrfs_path *p)
108 {
109         if (!p)
110                 return;
111         btrfs_release_path(p);
112         kmem_cache_free(btrfs_path_cachep, p);
113 }
114
115 /*
116  * path release drops references on the extent buffers in the path
117  * and it drops any locks held by this path
118  *
119  * It is safe to call this on paths that no locks or extent buffers held.
120  */
121 noinline void btrfs_release_path(struct btrfs_path *p)
122 {
123         int i;
124
125         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
126                 p->slots[i] = 0;
127                 if (!p->nodes[i])
128                         continue;
129                 if (p->locks[i]) {
130                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
131                         p->locks[i] = 0;
132                 }
133                 free_extent_buffer(p->nodes[i]);
134                 p->nodes[i] = NULL;
135         }
136 }
137
138 /*
139  * safely gets a reference on the root node of a tree.  A lock
140  * is not taken, so a concurrent writer may put a different node
141  * at the root of the tree.  See btrfs_lock_root_node for the
142  * looping required.
143  *
144  * The extent buffer returned by this has a reference taken, so
145  * it won't disappear.  It may stop being the root of the tree
146  * at any time because there are no locks held.
147  */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* got a reference, eb can no longer be freed under us */
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/*
		 * refs already hit zero: the buffer is being freed.  Wait a
		 * grace period so the new root becomes visible, then retry.
		 */
		synchronize_rcu();
	}
	return eb;
}
171
172 /* loop around taking references on and locking the root node of the
173  * tree until you end up with a lock on the root.  A locked buffer
174  * is returned, with a reference held.
175  */
176 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
177 {
178         struct extent_buffer *eb;
179
180         while (1) {
181                 eb = btrfs_root_node(root);
182                 btrfs_tree_lock(eb);
183                 if (eb == root->node)
184                         break;
185                 btrfs_tree_unlock(eb);
186                 free_extent_buffer(eb);
187         }
188         return eb;
189 }
190
191 /* loop around taking references on and locking the root node of the
192  * tree until you end up with a lock on the root.  A locked buffer
193  * is returned, with a reference held.
194  */
195 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
196 {
197         struct extent_buffer *eb;
198
199         while (1) {
200                 eb = btrfs_root_node(root);
201                 btrfs_tree_read_lock(eb);
202                 if (eb == root->node)
203                         break;
204                 btrfs_tree_read_unlock(eb);
205                 free_extent_buffer(eb);
206         }
207         return eb;
208 }
209
210 /* cowonly root (everything not a reference counted cow subvolume), just get
211  * put onto a simple dirty list.  transaction.c walks this to make sure they
212  * get properly updated on disk.
213  */
214 static void add_root_to_dirty_list(struct btrfs_root *root)
215 {
216         spin_lock(&root->fs_info->trans_lock);
217         if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
218             list_empty(&root->dirty_list)) {
219                 list_add(&root->dirty_list,
220                          &root->fs_info->dirty_cowonly_roots);
221         }
222         spin_unlock(&root->fs_info->trans_lock);
223 }
224
225 /*
226  * used by snapshot creation to make a copy of a root for a tree with
227  * a given objectid.  The buffer with the new root node is returned in
228  * cow_ret, and this func returns zero on success or a negative error code.
229  */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/*
	 * reference counted roots may only be copied inside the transaction
	 * they currently belong to
	 */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* use the first key of the old root as the hint for the new block */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* byte-copy the old root, then fix up the header for the new block */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/*
	 * bump the reference counts on everything the copied root points to;
	 * presumably the last argument selects full backrefs for reloc
	 * trees -- confirm against btrfs_inc_ref()'s signature.
	 */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	/*
	 * NOTE(review): on btrfs_inc_ref() failure we return without
	 * unlocking or dropping our reference on 'cow' -- this looks like a
	 * leak of the freshly allocated buffer; verify against the tree's
	 * error-handling conventions.
	 */
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
283
/*
 * Types of operations recorded in the tree modification log.  Each logged
 * change to a tree block is tagged with one of these.
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
293
/* payload for MOD_LOG_MOVE_KEYS: destination slot and number of items moved */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};
298
/* payload for MOD_LOG_ROOT_REPLACE: identifies the old root node */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
303
/*
 * One entry in the tree modification log.  Entries live in the
 * fs_info->tree_mod_log rb tree, keyed on (index, seq); see
 * __tree_mod_log_insert().
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;		/* assigned at insertion time */
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
326
/* take the tree mod log lock shared, for lookups */
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}
331
/* drop the shared tree mod log lock */
static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}
336
/* take the tree mod log lock exclusive, for insertion and removal */
static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}
341
/* drop the exclusive tree mod log lock */
static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}
346
/*
 * Pull a new tree mod seq number for our operation.  This is a plain
 * atomic increment; no additional locking is required here.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
354
355 /*
356  * This adds a new blocker to the tree mod log's blocker list if the @elem
357  * passed does not already have a sequence number set. So when a caller expects
358  * to record tree modifications, it should ensure to set elem->seq to zero
359  * before calling btrfs_get_tree_mod_seq.
360  * Returns a fresh, unused tree log modification sequence number, even if no new
361  * blocker was added.
362  */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/*
	 * take the log write lock so no mod log entry can be inserted while
	 * we establish the blocker; tree_mod_seq_lock protects the blocker
	 * list itself
	 */
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		/* first call for this elem: register it as a blocker */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}
377
/*
 * Drop the blocker @elem registered by btrfs_get_tree_mod_seq() and, if no
 * older blocker remains, prune every log entry whose sequence number is no
 * longer protected.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* elem was never registered (seq still zero): nothing to do */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* find the smallest sequence number among the remaining blockers */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* grab the successor first, we are about to erase node */
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
427
428 /*
429  * key order of the log:
430  *       index -> sequence
431  *
432  * the index is the shifted logical of the *new* root node for root replace
433  * operations, or the shifted logical of the affected block for all other
434  * operations.
435  *
436  * Note: must be called with write lock (tree_mod_log_write_lock).
437  */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	/* the seq is assigned at insertion time, under the write lock */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		/*
		 * NOTE: comparisons are inverted relative to an ascending
		 * rb tree; __tree_mod_log_search() walks the same order.
		 */
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
471
472 /*
473  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
474  * returns zero with the tree_mod_log_lock acquired. The caller must hold
475  * this until all tree mod log insertions are recorded in the rb tree and then
476  * call tree_mod_log_write_unlock() to release.
477  */
478 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
479                                     struct extent_buffer *eb) {
480         smp_mb();
481         if (list_empty(&(fs_info)->tree_mod_seq_list))
482                 return 1;
483         if (eb && btrfs_header_level(eb) == 0)
484                 return 1;
485
486         tree_mod_log_write_lock(fs_info);
487         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
488                 tree_mod_log_write_unlock(fs_info);
489                 return 1;
490         }
491
492         return 0;
493 }
494
495 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
496 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
497                                     struct extent_buffer *eb)
498 {
499         smp_mb();
500         if (list_empty(&(fs_info)->tree_mod_seq_list))
501                 return 0;
502         if (eb && btrfs_header_level(eb) == 0)
503                 return 0;
504
505         return 1;
506 }
507
/*
 * Allocate and fill a log element describing a single-key operation on
 * slot @slot of node @eb.  Returns NULL on allocation failure.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	/* the log is indexed by the block's page-shifted logical address */
	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		/* for replace/remove we must remember the previous content */
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
530
/*
 * Log a single-key operation (@op) on slot @slot of @eb.  Returns 0 when
 * logging is unnecessary or succeeded, negative errno otherwise.
 */
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* cheap lockless check first */
	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	/* allocate before taking the write lock */
	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* recheck; may race with the last blocker going away */
	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned 0 with the write lock held */
	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}
558
/*
 * Log a move of @nr_items pointers within @eb from @src_slot to @dst_slot.
 * Slots that get overwritten by the move are additionally logged as
 * removals so the old contents can be recovered.
 */
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	/* cheap lockless check first */
	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	/* allocate everything before taking the write lock */
	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* one removal entry per slot that the move will overwrite */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* recheck; returns 0 with the write lock held */
	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	/* unwind: unlink anything already inserted into the rb tree */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
633
/*
 * Insert all of @tm_list (nritems entries) into the log, in reverse slot
 * order.  On failure, entries already inserted are unlinked again so the
 * caller can free the whole list.  Caller holds the write lock.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* unlink the entries inserted before the failure */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
654
/*
 * Log the replacement of @old_root by @new_root.  When @log_removal is set
 * and the old root is not a leaf, the removal of every pointer in the old
 * root is logged as well.
 */
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* prepare per-slot removal entries for the old root, if requested */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
				  flags);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* root replace entries are indexed by the *new* root's logical */
	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	/* recheck; returns 0 with the write lock held */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
725
/*
 * Look up a log entry for the block at logical @start with seq >= @min_seq.
 * @smallest selects whether the match with the lowest (1) or the highest
 * (0) sequence number is returned.  Returns NULL if nothing matches.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	/* walk the tree in the same inverted order __tree_mod_log_insert uses */
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on min_seq; cannot do better */
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}
768
769 /*
770  * this returns the element from the log with the smallest time sequence
771  * value that's in the log (the oldest log item). any element with a time
772  * sequence lower than min_seq will be ignored.
773  */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	/* smallest == 1: return the lowest-seq entry with seq >= min_seq */
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
780
781 /*
782  * this returns the element from the log with the largest time sequence
783  * value that's in the log (the most recent log item). any element with
784  * a time sequence lower than min_seq will be ignored.
785  */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	/* smallest == 0: return the highest-seq entry with seq >= min_seq */
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
791
/*
 * Log the copy of @nr_items pointers from @src (at @src_offset) into @dst
 * (at @dst_offset): one KEY_REMOVE entry per source slot and one KEY_ADD
 * entry per destination slot.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* leaf-to-leaf copies are not tracked in the log */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/*
	 * single allocation, two halves: first nr_items entries are the
	 * ADDs into dst, the second half are the REMOVEs from src
	 */
	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* recheck; returns 0 with the write lock held */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	/* insert each removal paired with its corresponding addition */
	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	/* unwind: unlink anything already inserted into the rb tree */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}
862
/*
 * Convenience wrapper around tree_mod_log_insert_move(); callers here have
 * no error path, so a failed insert is fatal (BUG_ON).
 */
static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}
872
873 static noinline void
874 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
875                           struct extent_buffer *eb, int slot, int atomic)
876 {
877         int ret;
878
879         ret = tree_mod_log_insert_key(fs_info, eb, slot,
880                                         MOD_LOG_KEY_REPLACE,
881                                         atomic ? GFP_ATOMIC : GFP_NOFS);
882         BUG_ON(ret < 0);
883 }
884
885 static noinline int
886 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
887 {
888         struct tree_mod_elem **tm_list = NULL;
889         int nritems = 0;
890         int i;
891         int ret = 0;
892
893         if (btrfs_header_level(eb) == 0)
894                 return 0;
895
896         if (!tree_mod_need_log(fs_info, NULL))
897                 return 0;
898
899         nritems = btrfs_header_nritems(eb);
900         tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
901                           GFP_NOFS);
902         if (!tm_list)
903                 return -ENOMEM;
904
905         for (i = 0; i < nritems; i++) {
906                 tm_list[i] = alloc_tree_mod_elem(eb, i,
907                     MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
908                 if (!tm_list[i]) {
909                         ret = -ENOMEM;
910                         goto free_tms;
911                 }
912         }
913
914         if (tree_mod_dont_log(fs_info, eb))
915                 goto free_tms;
916
917         ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
918         tree_mod_log_write_unlock(fs_info);
919         if (ret)
920                 goto free_tms;
921         kfree(tm_list);
922
923         return 0;
924
925 free_tms:
926         for (i = 0; i < nritems; i++)
927                 kfree(tm_list[i]);
928         kfree(tm_list);
929
930         return ret;
931 }
932
933 static noinline void
934 tree_mod_log_set_root_pointer(struct btrfs_root *root,
935                               struct extent_buffer *new_root_node,
936                               int log_removal)
937 {
938         int ret;
939         ret = tree_mod_log_insert_root(root->fs_info, root->node,
940                                        new_root_node, GFP_NOFS, log_removal);
941         BUG_ON(ret < 0);
942 }
943
/*
 * check if the tree block can be shared by multiple trees
 *
 * Returns 1 when sharing is possible, 0 when the block is known to be
 * exclusively owned by @root.
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
                              struct extent_buffer *buf)
{
        /*
         * Tree blocks not in reference counted trees and tree roots
         * are never shared. If a block was allocated after the last
         * snapshot and the block was not allocated by tree relocation,
         * we know the block is not shared.
         */
        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
            buf != root->node && buf != root->commit_root &&
            (btrfs_header_generation(buf) <=
             btrfs_root_last_snapshot(&root->root_item) ||
             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
                return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        /* blocks predating mixed backrefs are treated as shareable here */
        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
            btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                return 1;
#endif
        return 0;
}
969
/*
 * Adjust extent backrefs as part of COWing @buf into @cow.
 *
 * Depending on whether @buf is shared (refs > 1), child pointers are
 * re-referenced for @cow and/or the extent flags are upgraded to full
 * backrefs, following the rules described below. When @buf drops its
 * last reference, *last_ref is set to 1 so the caller can free it.
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree block
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in tree block.
         *
         * If a tree block is been relocating
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in tree block.
         * The reason for this is some operations (such as drop tree)
         * are only allowed for blocks use full backrefs.
         */

        if (btrfs_block_can_be_shared(root, buf)) {
                ret = btrfs_lookup_extent_info(trans, root, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        /* a shareable block must have at least one ref */
                        ret = -EROFS;
                        btrfs_std_error(root->fs_info, ret);
                        return ret;
                }
        } else {
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        /* reloc tree blocks must already carry full backrefs (rule above) */
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        /* switch this shared block over to full backrefs */
                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        BUG_ON(ret); /* -ENOMEM */

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                BUG_ON(ret); /* -ENOMEM */
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        BUG_ON(ret); /* -ENOMEM */
                }
                if (new_flags != 0) {
                        int level = btrfs_header_level(buf);

                        ret = btrfs_set_disk_extent_flags(trans, root,
                                                          buf->start,
                                                          buf->len,
                                                          new_flags, level, 0);
                        if (ret)
                                return ret;
                }
        } else {
                /* last reference: move the child refs over to the new copy */
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        BUG_ON(ret); /* -ENOMEM */
                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
                clean_tree_block(trans, root, buf);
                *last_ref = 1;
        }
        return 0;
}
1073
1074 /*
1075  * does the dirty work in cow of a single block.  The parent block (if
1076  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1077  * dirty and returned locked.  If you modify the block it needs to be marked
1078  * dirty again.
1079  *
1080  * search_start -- an allocation hint for the new block
1081  *
1082  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1083  * bytes the allocator should try to find free next to the block it returns.
1084  * This is just a hint and may be ignored by the allocator.
1085  */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
                             u64 search_start, u64 empty_size)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level, ret;
        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start;

        if (*cow_ret == buf)
                unlock_orig = 1;

        btrfs_assert_tree_locked(buf);

        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != root->fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);

        /* first key of the block, used as the key hint for the new copy */
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent)
                        parent_start = parent->start;
                else
                        parent_start = 0;
        } else
                parent_start = 0;

        cow = btrfs_alloc_tree_block(trans, root, parent_start,
                        root->root_key.objectid, &disk_key, level,
                        search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* cow is set to blocking by btrfs_init_new_buffer */

        copy_extent_buffer(cow, buf, 0, 0, cow->len);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);

        write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);

        /* fix up backrefs; last_ref tells us whether buf can be freed below */
        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
        }

        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                if (ret)
                        return ret;
        }

        if (buf == root->node) {
                /* COWing the root node: swap the root pointer to the copy */
                WARN_ON(parent && parent != buf);
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        parent_start = buf->start;
                else
                        parent_start = 0;

                extent_buffer_get(cow);
                tree_mod_log_set_root_pointer(root, cow, 1);
                rcu_assign_pointer(root->node, cow);

                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
                /* point the parent's slot at the copy instead of buf */
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                        parent_start = parent->start;
                else
                        parent_start = 0;

                WARN_ON(trans->transid != btrfs_header_generation(parent));
                tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                if (last_ref) {
                        ret = tree_mod_log_free_eb(root->fs_info, buf);
                        if (ret) {
                                btrfs_abort_transaction(trans, root, ret);
                                return ret;
                        }
                }
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
        free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}
1206
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
                           struct extent_buffer *eb_root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        int looped = 0;

        /* time_seq == 0 means no rewind was requested */
        if (!time_seq)
                return NULL;

        /*
         * the very last operation that's logged for a root is the replacement
         * operation (if it is replaced at all). this has the index of the *new*
         * root, making it the very first operation that's logged for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
                /*
                 * if there are no tree operation for the oldest root, we simply
                 * return it. this should only happen if that (old) root is at
                 * level 0.
                 */
                if (!tm)
                        break;

                /*
                 * if there's an operation that's not a root replacement, we
                 * found the oldest version of our root. normally, we'll find a
                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != MOD_LOG_ROOT_REPLACE)
                        break;

                /* keep walking backwards through older root replacements */
                found = tm;
                root_logical = tm->old_root.logical;
                looped = 1;
        }

        /* if there's no old root to return, return what we found instead */
        if (!found)
                found = tm;

        return found;
}
1260
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                      u64 time_seq, struct tree_mod_elem *first_tm)
{
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);

        n = btrfs_header_nritems(eb);
        tree_mod_log_read_lock(fs_info);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
                 * the modification. as we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        /* Fallthrough */
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        /* undo a removal by restoring the saved key/ptr */
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
                        /* undo a replace by restoring the old key/ptr */
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case MOD_LOG_KEY_ADD:
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case MOD_LOG_MOVE_KEYS:
                        /* undo a move by shifting the pointers back */
                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        break;
                case MOD_LOG_ROOT_REPLACE:
                        /*
                         * this operation is special. for roots, this must be
                         * handled explicitly before rewinding.
                         * for non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. in the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is no root. we simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = container_of(next, struct tree_mod_elem, node);
                /* stop once we leave the log entries for this block */
                if (tm->index != first_tm->index)
                        break;
        }
        tree_mod_log_read_unlock(fs_info);
        btrfs_set_header_nritems(eb, n);
}
1336
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                    struct extent_buffer *eb, u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        /* time_seq == 0: no rewind requested */
        if (!time_seq)
                return eb;

        /* level-0 blocks (leaves) have no rewind entries here */
        if (btrfs_header_level(eb) == 0)
                return eb;

        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        btrfs_set_path_blocking(path);
        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                /* block was freed; rebuild its state on a dummy buffer */
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(eb->start,
                                                fs_info->tree_root->nodesize);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                /* rewind on a private copy so the live eb stays untouched */
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
        btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);

        extent_buffer_get(eb_rewin);
        btrfs_tree_read_lock(eb_rewin);
        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

        return eb_rewin;
}
1399
1400 /*
1401  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1402  * value. If there are no changes, the current root->root_node is returned. If
1403  * anything changed in between, there's a fresh buffer allocated on which the
1404  * rewind operations are done. In any case, the returned buffer is read locked.
1405  * Returns NULL on error (with no locks held).
1406  */
1407 static inline struct extent_buffer *
1408 get_old_root(struct btrfs_root *root, u64 time_seq)
1409 {
1410         struct tree_mod_elem *tm;
1411         struct extent_buffer *eb = NULL;
1412         struct extent_buffer *eb_root;
1413         struct extent_buffer *old;
1414         struct tree_mod_root *old_root = NULL;
1415         u64 old_generation = 0;
1416         u64 logical;
1417
1418         eb_root = btrfs_read_lock_root_node(root);
1419         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1420         if (!tm)
1421                 return eb_root;
1422
1423         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1424                 old_root = &tm->old_root;
1425                 old_generation = tm->generation;
1426                 logical = old_root->logical;
1427         } else {
1428                 logical = eb_root->start;
1429         }
1430
1431         tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1432         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1433                 btrfs_tree_read_unlock(eb_root);
1434                 free_extent_buffer(eb_root);
1435                 old = read_tree_block(root, logical, 0);
1436                 if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1437                         free_extent_buffer(old);
1438                         btrfs_warn(root->fs_info,
1439                                 "failed to read tree block %llu from get_old_root", logical);
1440                 } else {
1441                         eb = btrfs_clone_extent_buffer(old);
1442                         free_extent_buffer(old);
1443                 }
1444         } else if (old_root) {
1445                 btrfs_tree_read_unlock(eb_root);
1446                 free_extent_buffer(eb_root);
1447                 eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1448         } else {
1449                 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1450                 eb = btrfs_clone_extent_buffer(eb_root);
1451                 btrfs_tree_read_unlock_blocking(eb_root);
1452                 free_extent_buffer(eb_root);
1453         }
1454
1455         if (!eb)
1456                 return NULL;
1457         extent_buffer_get(eb);
1458         btrfs_tree_read_lock(eb);
1459         if (old_root) {
1460                 btrfs_set_header_bytenr(eb, eb->start);
1461                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1462                 btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1463                 btrfs_set_header_level(eb, old_root->level);
1464                 btrfs_set_header_generation(eb, old_generation);
1465         }
1466         if (tm)
1467                 __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1468         else
1469                 WARN_ON(btrfs_header_level(eb) != 0);
1470         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1471
1472         return eb;
1473 }
1474
1475 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1476 {
1477         struct tree_mod_elem *tm;
1478         int level;
1479         struct extent_buffer *eb_root = btrfs_root_node(root);
1480
1481         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1482         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1483                 level = tm->old_root.level;
1484         } else {
1485                 level = btrfs_header_level(eb_root);
1486         }
1487         free_extent_buffer(eb_root);
1488
1489         return level;
1490 }
1491
1492 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1493                                    struct btrfs_root *root,
1494                                    struct extent_buffer *buf)
1495 {
1496         if (btrfs_test_is_dummy_root(root))
1497                 return 0;
1498
1499         /* ensure we can see the force_cow */
1500         smp_rmb();
1501
1502         /*
1503          * We do not need to cow a block if
1504          * 1) this block is not created or changed in this transaction;
1505          * 2) this block does not belong to TREE_RELOC tree;
1506          * 3) the root is not forced COW.
1507          *
1508          * What is forced COW:
1509          *    when we create snapshot during commiting the transaction,
1510          *    after we've finished coping src root, we must COW the shared
1511          *    block to ensure the metadata consistency.
1512          */
1513         if (btrfs_header_generation(buf) == trans->transid &&
1514             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1515             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1516               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1517             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1518                 return 0;
1519         return 1;
1520 }
1521
1522 /*
1523  * cows a single block, see __btrfs_cow_block for the real work.
1524  * This version of it has extra checks so that a block isn't cow'd more than
1525  * once per transaction, as long as it hasn't been written yet
1526  */
1527 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1528                     struct btrfs_root *root, struct extent_buffer *buf,
1529                     struct extent_buffer *parent, int parent_slot,
1530                     struct extent_buffer **cow_ret)
1531 {
1532         u64 search_start;
1533         int ret;
1534
1535         if (trans->transaction != root->fs_info->running_transaction)
1536                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1537                        trans->transid,
1538                        root->fs_info->running_transaction->transid);
1539
1540         if (trans->transid != root->fs_info->generation)
1541                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1542                        trans->transid, root->fs_info->generation);
1543
1544         if (!should_cow_block(trans, root, buf)) {
1545                 *cow_ret = buf;
1546                 return 0;
1547         }
1548
1549         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1550
1551         if (parent)
1552                 btrfs_set_lock_blocking(parent);
1553         btrfs_set_lock_blocking(buf);
1554
1555         ret = __btrfs_cow_block(trans, root, buf, parent,
1556                                  parent_slot, cow_ret, search_start, 0);
1557
1558         trace_btrfs_cow_block(root, buf, *cow_ret);
1559
1560         return ret;
1561 }
1562
1563 /*
1564  * helper function for defrag to decide if two blocks pointed to by a
1565  * node are actually close by
1566  */
1567 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1568 {
1569         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1570                 return 1;
1571         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1572                 return 1;
1573         return 0;
1574 }
1575
1576 /*
1577  * compare two keys in a memcmp fashion
1578  */
1579 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1580 {
1581         struct btrfs_key k1;
1582
1583         btrfs_disk_key_to_cpu(&k1, disk);
1584
1585         return btrfs_comp_cpu_keys(&k1, k2);
1586 }
1587
1588 /*
1589  * same as comp_keys only with two btrfs_key's
1590  */
1591 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1592 {
1593         if (k1->objectid > k2->objectid)
1594                 return 1;
1595         if (k1->objectid < k2->objectid)
1596                 return -1;
1597         if (k1->type > k2->type)
1598                 return 1;
1599         if (k1->type < k2->type)
1600                 return -1;
1601         if (k1->offset > k2->offset)
1602                 return 1;
1603         if (k1->offset < k2->offset)
1604                 return -1;
1605         return 0;
1606 }
1607
1608 /*
1609  * this is used by the defrag code to go through all the
1610  * leaves pointed to by a node and reallocate them so that
1611  * disk order is close to key order
1612  */
1613 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1614                        struct btrfs_root *root, struct extent_buffer *parent,
1615                        int start_slot, u64 *last_ret,
1616                        struct btrfs_key *progress)
1617 {
1618         struct extent_buffer *cur;
1619         u64 blocknr;
1620         u64 gen;
1621         u64 search_start = *last_ret;
1622         u64 last_block = 0;
1623         u64 other;
1624         u32 parent_nritems;
1625         int end_slot;
1626         int i;
1627         int err = 0;
1628         int parent_level;
1629         int uptodate;
1630         u32 blocksize;
1631         int progress_passed = 0;
1632         struct btrfs_disk_key disk_key;
1633
1634         parent_level = btrfs_header_level(parent);
1635
1636         WARN_ON(trans->transaction != root->fs_info->running_transaction);
1637         WARN_ON(trans->transid != root->fs_info->generation);
1638
1639         parent_nritems = btrfs_header_nritems(parent);
1640         blocksize = root->nodesize;
1641         end_slot = parent_nritems;
1642
1643         if (parent_nritems == 1)
1644                 return 0;
1645
1646         btrfs_set_lock_blocking(parent);
1647
1648         for (i = start_slot; i < end_slot; i++) {
1649                 int close = 1;
1650
1651                 btrfs_node_key(parent, &disk_key, i);
1652                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1653                         continue;
1654
1655                 progress_passed = 1;
1656                 blocknr = btrfs_node_blockptr(parent, i);
1657                 gen = btrfs_node_ptr_generation(parent, i);
1658                 if (last_block == 0)
1659                         last_block = blocknr;
1660
1661                 if (i > 0) {
1662                         other = btrfs_node_blockptr(parent, i - 1);
1663                         close = close_blocks(blocknr, other, blocksize);
1664                 }
1665                 if (!close && i < end_slot - 2) {
1666                         other = btrfs_node_blockptr(parent, i + 1);
1667                         close = close_blocks(blocknr, other, blocksize);
1668                 }
1669                 if (close) {
1670                         last_block = blocknr;
1671                         continue;
1672                 }
1673
1674                 cur = btrfs_find_tree_block(root, blocknr);
1675                 if (cur)
1676                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1677                 else
1678                         uptodate = 0;
1679                 if (!cur || !uptodate) {
1680                         if (!cur) {
1681                                 cur = read_tree_block(root, blocknr, gen);
1682                                 if (!cur || !extent_buffer_uptodate(cur)) {
1683                                         free_extent_buffer(cur);
1684                                         return -EIO;
1685                                 }
1686                         } else if (!uptodate) {
1687                                 err = btrfs_read_buffer(cur, gen);
1688                                 if (err) {
1689                                         free_extent_buffer(cur);
1690                                         return err;
1691                                 }
1692                         }
1693                 }
1694                 if (search_start == 0)
1695                         search_start = last_block;
1696
1697                 btrfs_tree_lock(cur);
1698                 btrfs_set_lock_blocking(cur);
1699                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1700                                         &cur, search_start,
1701                                         min(16 * blocksize,
1702                                             (end_slot - i) * blocksize));
1703                 if (err) {
1704                         btrfs_tree_unlock(cur);
1705                         free_extent_buffer(cur);
1706                         break;
1707                 }
1708                 search_start = cur->start;
1709                 last_block = cur->start;
1710                 *last_ret = search_start;
1711                 btrfs_tree_unlock(cur);
1712                 free_extent_buffer(cur);
1713         }
1714         return err;
1715 }
1716
1717 /*
1718  * The leaf data grows from end-to-front in the node.
1719  * this returns the address of the start of the last item,
1720  * which is the stop of the leaf data stack
1721  */
1722 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1723                                          struct extent_buffer *leaf)
1724 {
1725         u32 nr = btrfs_header_nritems(leaf);
1726         if (nr == 0)
1727                 return BTRFS_LEAF_DATA_SIZE(root);
1728         return btrfs_item_offset_nr(leaf, nr - 1);
1729 }
1730
1731
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * Returns 0 when the key was found (*slot is its index), 1 when it was
 * not (*slot is the insertion point).
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;		/* cached mapping into the eb */
	unsigned long map_start = 0;	/* eb offset the mapping starts at */
	unsigned long map_len = 0;	/* length of the cached mapping */
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/*
		 * reuse the previous mapping when the key at 'offset'
		 * lies entirely inside it; otherwise remap, or fall
		 * back to copying the key out
		 */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				/*
				 * mapping failed (presumably the key is not
				 * contiguous in one page); copy it into an
				 * aligned local instead
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
1798
1799 /*
1800  * simple bin_search frontend that does the right thing for
1801  * leaves vs nodes
1802  */
1803 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1804                       int level, int *slot)
1805 {
1806         if (level == 0)
1807                 return generic_bin_search(eb,
1808                                           offsetof(struct btrfs_leaf, items),
1809                                           sizeof(struct btrfs_item),
1810                                           key, btrfs_header_nritems(eb),
1811                                           slot);
1812         else
1813                 return generic_bin_search(eb,
1814                                           offsetof(struct btrfs_node, ptrs),
1815                                           sizeof(struct btrfs_key_ptr),
1816                                           key, btrfs_header_nritems(eb),
1817                                           slot);
1818 }
1819
/*
 * Public wrapper around bin_search(): locate @key in node/leaf @eb at
 * @level.  Returns 0 when the key exists (*slot is its index), 1 when
 * it does not (*slot is the insertion point).
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1825
1826 static void root_add_used(struct btrfs_root *root, u32 size)
1827 {
1828         spin_lock(&root->accounting_lock);
1829         btrfs_set_root_used(&root->root_item,
1830                             btrfs_root_used(&root->root_item) + size);
1831         spin_unlock(&root->accounting_lock);
1832 }
1833
1834 static void root_sub_used(struct btrfs_root *root, u32 size)
1835 {
1836         spin_lock(&root->accounting_lock);
1837         btrfs_set_root_used(&root->root_item,
1838                             btrfs_root_used(&root->root_item) - size);
1839         spin_unlock(&root->accounting_lock);
1840 }
1841
1842 /* given a node and slot number, this reads the blocks it points to.  The
1843  * extent buffer is returned with a reference taken (but unlocked).
1844  * NULL is returned on error.
1845  */
1846 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1847                                    struct extent_buffer *parent, int slot)
1848 {
1849         int level = btrfs_header_level(parent);
1850         struct extent_buffer *eb;
1851
1852         if (slot < 0)
1853                 return NULL;
1854         if (slot >= btrfs_header_nritems(parent))
1855                 return NULL;
1856
1857         BUG_ON(level == 0);
1858
1859         eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1860                              btrfs_node_ptr_generation(parent, slot));
1861         if (eb && !extent_buffer_uptodate(eb)) {
1862                 free_extent_buffer(eb);
1863                 eb = NULL;
1864         }
1865
1866         return eb;
1867 }
1868
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * Returns 0 on success or a negative errno.  On success the path may be
 * rewritten to point into the left sibling when items were shifted
 * there.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	/* the caller must hold a write lock on an already-COWed mid */
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remembered so we can verify the path still points at it below */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/*
		 * NOTE(review): mid->len is read after the release above;
		 * presumably safe because the old root-pointer reference
		 * still pins mid until the stale release below - confirm
		 */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* nothing to do while mid is still at least a quarter full */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	/* grab and COW both siblings so pointers can be shifted */
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is empty: unlink and free it */
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right's first key changed; fix the parent's copy */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid is now empty: unlink and free it too */
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* the original slot migrated into left */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* drop locks/refs on any sibling not installed into the path */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2087
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 when pointers were pushed into a sibling (the path is
 * updated to follow them), 1 when no room could be made (the caller
 * must split), or a negative errno.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* no parent means mid is the root: nothing to push into */
	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left is (nearly) full, can't push there */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* mid's first key moved; update the parent's copy */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot migrated into left */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right is (nearly) full, can't push there */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* right's first key changed; update the parent's copy */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot migrated into right */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2223
2224 /*
2225  * readahead one full node of leaves, finding things that are close
2226  * to the block in 'slot', and triggering ra on them.
2227  */
2228 static void reada_for_search(struct btrfs_root *root,
2229                              struct btrfs_path *path,
2230                              int level, int slot, u64 objectid)
2231 {
2232         struct extent_buffer *node;
2233         struct btrfs_disk_key disk_key;
2234         u32 nritems;
2235         u64 search;
2236         u64 target;
2237         u64 nread = 0;
2238         u64 gen;
2239         int direction = path->reada;
2240         struct extent_buffer *eb;
2241         u32 nr;
2242         u32 blocksize;
2243         u32 nscan = 0;
2244
2245         if (level != 1)
2246                 return;
2247
2248         if (!path->nodes[level])
2249                 return;
2250
2251         node = path->nodes[level];
2252
2253         search = btrfs_node_blockptr(node, slot);
2254         blocksize = root->nodesize;
2255         eb = btrfs_find_tree_block(root, search);
2256         if (eb) {
2257                 free_extent_buffer(eb);
2258                 return;
2259         }
2260
2261         target = search;
2262
2263         nritems = btrfs_header_nritems(node);
2264         nr = slot;
2265
2266         while (1) {
2267                 if (direction < 0) {
2268                         if (nr == 0)
2269                                 break;
2270                         nr--;
2271                 } else if (direction > 0) {
2272                         nr++;
2273                         if (nr >= nritems)
2274                                 break;
2275                 }
2276                 if (path->reada < 0 && objectid) {
2277                         btrfs_node_key(node, &disk_key, nr);
2278                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2279                                 break;
2280                 }
2281                 search = btrfs_node_blockptr(node, nr);
2282                 if ((search <= target && target - search <= 65536) ||
2283                     (search > target && search - target <= 65536)) {
2284                         gen = btrfs_node_ptr_generation(node, nr);
2285                         readahead_tree_block(root, search, blocksize);
2286                         nread += blocksize;
2287                 }
2288                 nscan++;
2289                 if ((nread > 65536 || nscan > 32))
2290                         break;
2291         }
2292 }
2293
2294 static noinline void reada_for_balance(struct btrfs_root *root,
2295                                        struct btrfs_path *path, int level)
2296 {
2297         int slot;
2298         int nritems;
2299         struct extent_buffer *parent;
2300         struct extent_buffer *eb;
2301         u64 gen;
2302         u64 block1 = 0;
2303         u64 block2 = 0;
2304         int blocksize;
2305
2306         parent = path->nodes[level + 1];
2307         if (!parent)
2308                 return;
2309
2310         nritems = btrfs_header_nritems(parent);
2311         slot = path->slots[level + 1];
2312         blocksize = root->nodesize;
2313
2314         if (slot > 0) {
2315                 block1 = btrfs_node_blockptr(parent, slot - 1);
2316                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2317                 eb = btrfs_find_tree_block(root, block1);
2318                 /*
2319                  * if we get -eagain from btrfs_buffer_uptodate, we
2320                  * don't want to return eagain here.  That will loop
2321                  * forever
2322                  */
2323                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2324                         block1 = 0;
2325                 free_extent_buffer(eb);
2326         }
2327         if (slot + 1 < nritems) {
2328                 block2 = btrfs_node_blockptr(parent, slot + 1);
2329                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2330                 eb = btrfs_find_tree_block(root, block2);
2331                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2332                         block2 = 0;
2333                 free_extent_buffer(eb);
2334         }
2335
2336         if (block1)
2337                 readahead_tree_block(root, block1, blocksize);
2338         if (block2)
2339                 readahead_tree_block(root, block2, blocksize);
2340 }
2341
2342
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;	/* levels <= skip_level keep their locks */
	int no_skips = 0;	/* set once we're safely past the skip rules */
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key change here may ripple into the parent */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* last slot: keep_locks callers may step into the parent */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower the highest-write-locked-level watermark */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2398
2399 /*
2400  * This releases any locks held in the path starting at level and
2401  * going all the way up to the root.
2402  *
2403  * btrfs_search_slot will keep the lock held on higher nodes in a few
2404  * corner cases, such as COW of the block at slot zero in the node.  This
2405  * ignores those rules, and it should only be called when there are no
2406  * more updates to be done higher up in the tree.
2407  */
2408 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2409 {
2410         int i;
2411
2412         if (path->keep_locks)
2413                 return;
2414
2415         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2416                 if (!path->nodes[i])
2417                         continue;
2418                 if (!path->locks[i])
2419                         continue;
2420                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2421                 path->locks[i] = 0;
2422         }
2423 }
2424
2425 /*
2426  * helper function for btrfs_search_slot.  The goal is to find a block
2427  * in cache without setting the path to blocking.  If we find the block
2428  * we return zero and the path is unchanged.
2429  *
2430  * If we can't find the block, we set the path blocking and do some
2431  * reada.  -EAGAIN is returned and the search must be repeated.
2432  */
2433 static int
2434 read_block_for_search(struct btrfs_trans_handle *trans,
2435                        struct btrfs_root *root, struct btrfs_path *p,
2436                        struct extent_buffer **eb_ret, int level, int slot,
2437                        struct btrfs_key *key, u64 time_seq)
2438 {
2439         u64 blocknr;
2440         u64 gen;
2441         struct extent_buffer *b = *eb_ret;
2442         struct extent_buffer *tmp;
2443         int ret;
2444
2445         blocknr = btrfs_node_blockptr(b, slot);
2446         gen = btrfs_node_ptr_generation(b, slot);
2447
2448         tmp = btrfs_find_tree_block(root, blocknr);
2449         if (tmp) {
2450                 /* first we do an atomic uptodate check */
2451                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2452                         *eb_ret = tmp;
2453                         return 0;
2454                 }
2455
2456                 /* the pages were up to date, but we failed
2457                  * the generation number check.  Do a full
2458                  * read for the generation number that is correct.
2459                  * We must do this without dropping locks so
2460                  * we can trust our generation number
2461                  */
2462                 btrfs_set_path_blocking(p);
2463
2464                 /* now we're allowed to do a blocking uptodate check */
2465                 ret = btrfs_read_buffer(tmp, gen);
2466                 if (!ret) {
2467                         *eb_ret = tmp;
2468                         return 0;
2469                 }
2470                 free_extent_buffer(tmp);
2471                 btrfs_release_path(p);
2472                 return -EIO;
2473         }
2474
2475         /*
2476          * reduce lock contention at high levels
2477          * of the btree by dropping locks before
2478          * we read.  Don't release the lock on the current
2479          * level because we need to walk this node to figure
2480          * out which blocks to read.
2481          */
2482         btrfs_unlock_up_safe(p, level + 1);
2483         btrfs_set_path_blocking(p);
2484
2485         free_extent_buffer(tmp);
2486         if (p->reada)
2487                 reada_for_search(root, p, level, slot, key->objectid);
2488
2489         btrfs_release_path(p);
2490
2491         ret = -EAGAIN;
2492         tmp = read_tree_block(root, blocknr, 0);
2493         if (tmp) {
2494                 /*
2495                  * If the read above didn't mark this buffer up to date,
2496                  * it will never end up being up to date.  Set ret to EIO now
2497                  * and give up so that our caller doesn't loop forever
2498                  * on our EAGAINs.
2499                  */
2500                 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2501                         ret = -EIO;
2502                 free_extent_buffer(tmp);
2503         }
2504         return ret;
2505 }
2506
2507 /*
2508  * helper function for btrfs_search_slot.  This does all of the checks
2509  * for node-level blocks and does any balancing required based on
2510  * the ins_len.
2511  *
2512  * If no extra work was required, zero is returned.  If we had to
2513  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2514  * start over
2515  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer *b, int level, int ins_len,
                       int *write_lock_level)
{
	int ret;
	/*
	 * splitting/inserting: if the node is within 3 pointers of full,
	 * split it now so the descent below never overflows it
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* splitting touches the parent: need a write lock one up */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(root, p, level);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		/*
		 * NOTE(review): this assignment is dead — nothing reads 'b'
		 * between here and the return 0 below
		 */
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/* deleting: node under half full, merge/balance it now */
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(root, p, level);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balancing may have collapsed this level entirely */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2577
/*
 * Debug check (CONFIG_BTRFS_ASSERT only, no-op otherwise): when the caller
 * believes 'key' matched exactly at slot 0, verify that the first key stored
 * in the block really is byte-identical to it.
 */
static void key_search_validate(struct extent_buffer *b,
				struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;

	btrfs_cpu_key_to_disk(&disk_key, key);

	/* leaves and nodes store their first key at different offsets */
	if (level == 0)
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_leaf, items[0].key),
		    sizeof(disk_key)));
	else
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_node, ptrs[0].key),
		    sizeof(disk_key)));
#endif
}
2597
/*
 * Search for 'key' in block 'b'.  *prev_cmp carries the comparison result of
 * the previous level: once a level compared equal, every level below must
 * have the key at slot 0, so the binary search can be skipped.
 */
static int key_search(struct extent_buffer *b, struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		/* exact match above: key sits at slot 0 here too */
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = bin_search(b, key, level, slot);
	return *prev_cmp;
}
2611
/*
 * Look up the item (iobjectid, key_type, ioff) in fs_root and copy the key
 * actually found into *found_key.  Returns 0 if an item with matching
 * objectid and type exists at or after the search position, 1 if not,
 * < 0 on error.  On success the path points at the found item.
 */
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		u64 iobjectid, u64 ioff, u8 key_type,
		struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	/* both out-parameters are mandatory for this variant */
	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	/* no exact match and slot past the leaf: step to the next leaf */
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	/* offset is deliberately not compared: nearest offset is acceptable */
	if (found_key->type != key.type ||
			found_key->objectid != key.objectid)
		return 1;

	return 0;
}
2646
2647 /*
2648  * look for key in the tree.  path is filled in with nodes along the way
2649  * if key is found, we return zero and you can find the item in the leaf
2650  * level of the path (level 0)
2651  *
2652  * If the key isn't found, the path points to the slot where it should
2653  * be inserted, and 1 is returned.  If there are other errors during the
2654  * search a negative error number is returned.
2655  *
2656  * if ins_len > 0, nodes and leaves will be split as we walk down the
2657  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2658  * possible)
2659  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	/* default: keep only level 0 locked at the end */
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	/* inserting/deleting without COW would corrupt shared blocks */
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	/* read-only search: never take write locks */
	if (!cow)
		write_lock_level = -1;

	/* keep_locks/lowest_level with cow: write-lock the whole path */
	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		if (p->need_commit_sem)
			down_read(&root->fs_info->commit_root_sem);
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (p->need_commit_sem)
			up_read(&root->fs_info->commit_root_sem);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	/* descend one level per iteration until we reach a leaf (or lowest_level) */
	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * no exact match: step back one slot so we descend
			 * into the subtree that covers 'key'
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			/* balancing may have replaced the node/slot */
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			/* caller asked us to stop above the leaves */
			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key, 0);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* try non-blocking first, fall back to blocking */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* leaf level: ensure room for the insert, then finish */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	/* empty tree / ran off the bottom: key not found */
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2928
2929 /*
2930  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2931  * current state of the tree together with the operations recorded in the tree
2932  * modification log to search for the key in a previous version of this tree, as
2933  * denoted by the time_seq parameter.
2934  *
2935  * Naturally, there is no support for insert, delete or cow operations.
2936  *
2937  * The resulting path and return value will be set up as if we called
2938  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2939  */
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* the commit root is already a frozen snapshot: plain search suffices */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	/* root as it looked at 'time_seq', reconstructed from the mod log */
	b = get_old_root(root, time_seq);
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind eb's we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/* no exact match: descend into the covering subtree */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(NULL, root, p, &b, level,
						    slot, key, time_seq);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			/* replay the mod log to rewind this block to time_seq */
			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3039
3040 /*
3041  * helper to use instead of search slot if no exact match is needed but
3042  * instead the next or previous item should be returned.
3043  * When find_higher is true, the next higher item is returned, the next lower
3044  * otherwise.
3045  * When return_any and find_higher are both true, and no higher item is found,
3046  * return the next lower instead.
3047  * When return_any is true and find_higher is false, and no lower item is found,
3048  * return the next higher instead.
3049  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3050  * < 0 on error
3051  */
int btrfs_search_slot_for_read(struct btrfs_root *root,
                               struct btrfs_key *key, struct btrfs_path *p,
                               int find_higher, int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	/* exact hit (0) or error (<0): nothing more to adjust */
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		/* slot past the leaf's items: the next item is in the next leaf */
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		/* want the previous item; slot 0 means it lives in the previous leaf */
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/* prev_leaf may land one past the last item */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}
3115
3116 /*
3117  * adjust the pointers going up the tree, starting at level
3118  * making sure the right key of each node is points to 'key'.
3119  * This is used after shifting pointers to the left, so it stops
3120  * fixing up pointers when a given leaf/node is not in slot 0 of the
3121  * higher levels
3122  *
3123  */
3124 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
3125                            struct btrfs_disk_key *key, int level)
3126 {
3127         int i;
3128         struct extent_buffer *t;
3129
3130         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3131                 int tslot = path->slots[i];
3132                 if (!path->nodes[i])
3133                         break;
3134                 t = path->nodes[i];
3135                 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
3136                 btrfs_set_node_key(t, key, tslot);
3137                 btrfs_mark_buffer_dirty(path->nodes[i]);
3138                 if (tslot != 0)
3139                         break;
3140         }
3141 }
3142
3143 /*
3144  * update item key.
3145  *
3146  * This function isn't completely safe. It's the caller's responsibility
3147  * that the new key won't break the order
3148  */
3149 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
3150                              struct btrfs_key *new_key)
3151 {
3152         struct btrfs_disk_key disk_key;
3153         struct extent_buffer *eb;
3154         int slot;
3155
3156         eb = path->nodes[0];
3157         slot = path->slots[0];
3158         if (slot > 0) {
3159                 btrfs_item_key(eb, &disk_key, slot - 1);
3160                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3161         }
3162         if (slot < btrfs_header_nritems(eb) - 1) {
3163                 btrfs_item_key(eb, &disk_key, slot + 1);
3164                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3165         }
3166
3167         btrfs_cpu_key_to_disk(&disk_key, new_key);
3168         btrfs_set_item_key(eb, &disk_key, slot);
3169         btrfs_mark_buffer_dirty(eb);
3170         if (slot == 0)
3171                 fixup_low_keys(root, path, &disk_key, 1);
3172 }
3173
3174 /*
3175  * try to push data from one node into the next node left in the
3176  * tree.
3177  *
3178  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3179  * error, and > 0 if there was no room in the left hand block.
3180  */
3181 static int push_node_left(struct btrfs_trans_handle *trans,
3182                           struct btrfs_root *root, struct extent_buffer *dst,
3183                           struct extent_buffer *src, int empty)
3184 {
3185         int push_items = 0;
3186         int src_nritems;
3187         int dst_nritems;
3188         int ret = 0;
3189
3190         src_nritems = btrfs_header_nritems(src);
3191         dst_nritems = btrfs_header_nritems(dst);
3192         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3193         WARN_ON(btrfs_header_generation(src) != trans->transid);
3194         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3195
3196         if (!empty && src_nritems <= 8)
3197                 return 1;
3198
3199         if (push_items <= 0)
3200                 return 1;
3201
3202         if (empty) {
3203                 push_items = min(src_nritems, push_items);
3204                 if (push_items < src_nritems) {
3205                         /* leave at least 8 pointers in the node if
3206                          * we aren't going to empty it
3207                          */
3208                         if (src_nritems - push_items < 8) {
3209                                 if (push_items <= 8)
3210                                         return 1;
3211                                 push_items -= 8;
3212                         }
3213                 }
3214         } else
3215                 push_items = min(src_nritems - 8, push_items);
3216
3217         ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3218                                    push_items);
3219         if (ret) {
3220                 btrfs_abort_transaction(trans, root, ret);
3221                 return ret;
3222         }
3223         copy_extent_buffer(dst, src,
3224                            btrfs_node_key_ptr_offset(dst_nritems),
3225                            btrfs_node_key_ptr_offset(0),
3226                            push_items * sizeof(struct btrfs_key_ptr));
3227
3228         if (push_items < src_nritems) {
3229                 /*
3230                  * don't call tree_mod_log_eb_move here, key removal was already
3231                  * fully logged by tree_mod_log_eb_copy above.
3232                  */
3233                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3234                                       btrfs_node_key_ptr_offset(push_items),
3235                                       (src_nritems - push_items) *
3236                                       sizeof(struct btrfs_key_ptr));
3237         }
3238         btrfs_set_header_nritems(src, src_nritems - push_items);
3239         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3240         btrfs_mark_buffer_dirty(src);
3241         btrfs_mark_buffer_dirty(dst);
3242
3243         return ret;
3244 }
3245
3246 /*
3247  * try to push data from one node into the next node right in the
3248  * tree.
3249  *
3250  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3251  * error, and > 0 if there was no room in the right hand block.
3252  *
3253  * this will  only push up to 1/2 the contents of the left node over
3254  */
3255 static int balance_node_right(struct btrfs_trans_handle *trans,
3256                               struct btrfs_root *root,
3257                               struct extent_buffer *dst,
3258                               struct extent_buffer *src)
3259 {
3260         int push_items = 0;
3261         int max_push;
3262         int src_nritems;
3263         int dst_nritems;
3264         int ret = 0;
3265
3266         WARN_ON(btrfs_header_generation(src) != trans->transid);
3267         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3268
3269         src_nritems = btrfs_header_nritems(src);
3270         dst_nritems = btrfs_header_nritems(dst);
3271         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3272         if (push_items <= 0)
3273                 return 1;
3274
3275         if (src_nritems < 4)
3276                 return 1;
3277
3278         max_push = src_nritems / 2 + 1;
3279         /* don't try to empty the node */
3280         if (max_push >= src_nritems)
3281                 return 1;
3282
3283         if (max_push < push_items)
3284                 push_items = max_push;
3285
3286         tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3287         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3288                                       btrfs_node_key_ptr_offset(0),
3289                                       (dst_nritems) *
3290                                       sizeof(struct btrfs_key_ptr));
3291
3292         ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3293                                    src_nritems - push_items, push_items);
3294         if (ret) {
3295                 btrfs_abort_transaction(trans, root, ret);
3296                 return ret;
3297         }
3298         copy_extent_buffer(dst, src,
3299                            btrfs_node_key_ptr_offset(0),
3300                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3301                            push_items * sizeof(struct btrfs_key_ptr));
3302
3303         btrfs_set_header_nritems(src, src_nritems - push_items);
3304         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3305
3306         btrfs_mark_buffer_dirty(src);
3307         btrfs_mark_buffer_dirty(dst);
3308
3309         return ret;
3310 }
3311
3312 /*
3313  * helper function to insert a new root level in the tree.
3314  * A new node is allocated, and a single item is inserted to
3315  * point to the existing root
3316  *
3317  * returns zero on success or < 0 on failure.
3318  */
3319 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3320                            struct btrfs_root *root,
3321                            struct btrfs_path *path, int level)
3322 {
3323         u64 lower_gen;
3324         struct extent_buffer *lower;
3325         struct extent_buffer *c;
3326         struct extent_buffer *old;
3327         struct btrfs_disk_key lower_key;
3328
3329         BUG_ON(path->nodes[level]);
3330         BUG_ON(path->nodes[level-1] != root->node);
3331
3332         lower = path->nodes[level-1];
3333         if (level == 1)
3334                 btrfs_item_key(lower, &lower_key, 0);
3335         else
3336                 btrfs_node_key(lower, &lower_key, 0);
3337
3338         c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3339                                    &lower_key, level, root->node->start, 0);
3340         if (IS_ERR(c))
3341                 return PTR_ERR(c);
3342
3343         root_add_used(root, root->nodesize);
3344
3345         memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3346         btrfs_set_header_nritems(c, 1);
3347         btrfs_set_header_level(c, level);
3348         btrfs_set_header_bytenr(c, c->start);
3349         btrfs_set_header_generation(c, trans->transid);
3350         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3351         btrfs_set_header_owner(c, root->root_key.objectid);
3352
3353         write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
3354                             BTRFS_FSID_SIZE);
3355
3356         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3357                             btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3358
3359         btrfs_set_node_key(c, &lower_key, 0);
3360         btrfs_set_node_blockptr(c, 0, lower->start);
3361         lower_gen = btrfs_header_generation(lower);
3362         WARN_ON(lower_gen != trans->transid);
3363
3364         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3365
3366         btrfs_mark_buffer_dirty(c);
3367
3368         old = root->node;
3369         tree_mod_log_set_root_pointer(root, c, 0);
3370         rcu_assign_pointer(root->node, c);
3371
3372         /* the super has an extra ref to root->node */
3373         free_extent_buffer(old);
3374
3375         add_root_to_dirty_list(root);
3376         extent_buffer_get(c);
3377         path->nodes[level] = c;
3378         path->locks[level] = BTRFS_WRITE_LOCK;
3379         path->slots[level] = 0;
3380         return 0;
3381 }
3382
3383 /*
3384  * worker function to insert a single pointer in a node.
3385  * the node should have enough room for the pointer already
3386  *
3387  * slot and level indicate where you want the key to go, and
3388  * blocknr is the block the key points to.
3389  */
3390 static void insert_ptr(struct btrfs_trans_handle *trans,
3391                        struct btrfs_root *root, struct btrfs_path *path,
3392                        struct btrfs_disk_key *key, u64 bytenr,
3393                        int slot, int level)
3394 {
3395         struct extent_buffer *lower;
3396         int nritems;
3397         int ret;
3398
3399         BUG_ON(!path->nodes[level]);
3400         btrfs_assert_tree_locked(path->nodes[level]);
3401         lower = path->nodes[level];
3402         nritems = btrfs_header_nritems(lower);
3403         BUG_ON(slot > nritems);
3404         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3405         if (slot != nritems) {
3406                 if (level)
3407                         tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3408                                              slot, nritems - slot);
3409                 memmove_extent_buffer(lower,
3410                               btrfs_node_key_ptr_offset(slot + 1),
3411                               btrfs_node_key_ptr_offset(slot),
3412                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3413         }
3414         if (level) {
3415                 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3416                                               MOD_LOG_KEY_ADD, GFP_NOFS);
3417                 BUG_ON(ret < 0);
3418         }
3419         btrfs_set_node_key(lower, key, slot);
3420         btrfs_set_node_blockptr(lower, slot, bytenr);
3421         WARN_ON(trans->transid == 0);
3422         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3423         btrfs_set_header_nritems(lower, nritems + 1);
3424         btrfs_mark_buffer_dirty(lower);
3425 }
3426
3427 /*
3428  * split the node at the specified level in path in two.
3429  * The path is corrected to point to the appropriate node after the split
3430  *
3431  * Before splitting this tries to make some room in the node by pushing
3432  * left and right, if either one works, it returns right away.
3433  *
3434  * returns 0 on success and < 0 on failure
3435  */
3436 static noinline int split_node(struct btrfs_trans_handle *trans,
3437                                struct btrfs_root *root,
3438                                struct btrfs_path *path, int level)
3439 {
3440         struct extent_buffer *c;
3441         struct extent_buffer *split;
3442         struct btrfs_disk_key disk_key;
3443         int mid;
3444         int ret;
3445         u32 c_nritems;
3446
3447         c = path->nodes[level];
3448         WARN_ON(btrfs_header_generation(c) != trans->transid);
3449         if (c == root->node) {
3450                 /*
3451                  * trying to split the root, lets make a new one
3452                  *
3453                  * tree mod log: We don't log_removal old root in
3454                  * insert_new_root, because that root buffer will be kept as a
3455                  * normal node. We are going to log removal of half of the
3456                  * elements below with tree_mod_log_eb_copy. We're holding a
3457                  * tree lock on the buffer, which is why we cannot race with
3458                  * other tree_mod_log users.
3459                  */
3460                 ret = insert_new_root(trans, root, path, level + 1);
3461                 if (ret)
3462                         return ret;
3463         } else {
3464                 ret = push_nodes_for_insert(trans, root, path, level);
3465                 c = path->nodes[level];
3466                 if (!ret && btrfs_header_nritems(c) <
3467                     BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3468                         return 0;
3469                 if (ret < 0)
3470                         return ret;
3471         }
3472
3473         c_nritems = btrfs_header_nritems(c);
3474         mid = (c_nritems + 1) / 2;
3475         btrfs_node_key(c, &disk_key, mid);
3476
3477         split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3478                         &disk_key, level, c->start, 0);
3479         if (IS_ERR(split))
3480                 return PTR_ERR(split);
3481
3482         root_add_used(root, root->nodesize);
3483
3484         memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3485         btrfs_set_header_level(split, btrfs_header_level(c));
3486         btrfs_set_header_bytenr(split, split->start);
3487         btrfs_set_header_generation(split, trans->transid);
3488         btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3489         btrfs_set_header_owner(split, root->root_key.objectid);
3490         write_extent_buffer(split, root->fs_info->fsid,
3491                             btrfs_header_fsid(), BTRFS_FSID_SIZE);
3492         write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3493                             btrfs_header_chunk_tree_uuid(split),
3494                             BTRFS_UUID_SIZE);
3495
3496         ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3497                                    mid, c_nritems - mid);
3498         if (ret) {
3499                 btrfs_abort_transaction(trans, root, ret);
3500                 return ret;
3501         }
3502         copy_extent_buffer(split, c,
3503                            btrfs_node_key_ptr_offset(0),
3504                            btrfs_node_key_ptr_offset(mid),
3505                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3506         btrfs_set_header_nritems(split, c_nritems - mid);
3507         btrfs_set_header_nritems(c, mid);
3508         ret = 0;
3509
3510         btrfs_mark_buffer_dirty(c);
3511         btrfs_mark_buffer_dirty(split);
3512
3513         insert_ptr(trans, root, path, &disk_key, split->start,
3514                    path->slots[level + 1] + 1, level + 1);
3515
3516         if (path->slots[level] >= mid) {
3517                 path->slots[level] -= mid;
3518                 btrfs_tree_unlock(c);
3519                 free_extent_buffer(c);
3520                 path->nodes[level] = split;
3521                 path->slots[level + 1] += 1;
3522         } else {
3523                 btrfs_tree_unlock(split);
3524                 free_extent_buffer(split);
3525         }
3526         return ret;
3527 }
3528
3529 /*
3530  * how many bytes are required to store the items in a leaf.  start
3531  * and nr indicate which items in the leaf to check.  This totals up the
3532  * space used both by the item structs and the item data
3533  */
3534 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3535 {
3536         struct btrfs_item *start_item;
3537         struct btrfs_item *end_item;
3538         struct btrfs_map_token token;
3539         int data_len;
3540         int nritems = btrfs_header_nritems(l);
3541         int end = min(nritems, start + nr) - 1;
3542
3543         if (!nr)
3544                 return 0;
3545         btrfs_init_map_token(&token);
3546         start_item = btrfs_item_nr(start);
3547         end_item = btrfs_item_nr(end);
3548         data_len = btrfs_token_item_offset(l, start_item, &token) +
3549                 btrfs_token_item_size(l, start_item, &token);
3550         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3551         data_len += sizeof(struct btrfs_item) * nr;
3552         WARN_ON(data_len < 0);
3553         return data_len;
3554 }
3555
3556 /*
3557  * The space between the end of the leaf items and
3558  * the start of the leaf data.  IOW, how much room
3559  * the leaf has left for both items and data
3560  */
3561 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3562                                    struct extent_buffer *leaf)
3563 {
3564         int nritems = btrfs_header_nritems(leaf);
3565         int ret;
3566         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3567         if (ret < 0) {
3568                 btrfs_crit(root->fs_info,
3569                         "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3570                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3571                        leaf_space_used(leaf, 0, nritems), nritems);
3572         }
3573         return ret;
3574 }
3575
3576 /*
3577  * min slot controls the lowest index we're willing to push to the
3578  * right.  We'll push up to and including min_slot, but no lower
3579  */
3580 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3581                                       struct btrfs_root *root,
3582                                       struct btrfs_path *path,
3583                                       int data_size, int empty,
3584                                       struct extent_buffer *right,
3585                                       int free_space, u32 left_nritems,
3586                                       u32 min_slot)
3587 {
3588         struct extent_buffer *left = path->nodes[0];
3589         struct extent_buffer *upper = path->nodes[1];
3590         struct btrfs_map_token token;
3591         struct btrfs_disk_key disk_key;
3592         int slot;
3593         u32 i;
3594         int push_space = 0;
3595         int push_items = 0;
3596         struct btrfs_item *item;
3597         u32 nr;
3598         u32 right_nritems;
3599         u32 data_end;
3600         u32 this_item_size;
3601
3602         btrfs_init_map_token(&token);
3603
3604         if (empty)
3605                 nr = 0;
3606         else
3607                 nr = max_t(u32, 1, min_slot);
3608
3609         if (path->slots[0] >= left_nritems)
3610                 push_space += data_size;
3611
3612         slot = path->slots[1];
3613         i = left_nritems - 1;
3614         while (i >= nr) {
3615                 item = btrfs_item_nr(i);
3616
3617                 if (!empty && push_items > 0) {
3618                         if (path->slots[0] > i)
3619                                 break;
3620                         if (path->slots[0] == i) {
3621                                 int space = btrfs_leaf_free_space(root, left);
3622                                 if (space + push_space * 2 > free_space)
3623                                         break;
3624                         }
3625                 }
3626
3627                 if (path->slots[0] == i)
3628                         push_space += data_size;
3629
3630                 this_item_size = btrfs_item_size(left, item);
3631                 if (this_item_size + sizeof(*item) + push_space > free_space)
3632                         break;
3633
3634                 push_items++;
3635                 push_space += this_item_size + sizeof(*item);
3636                 if (i == 0)
3637                         break;
3638                 i--;
3639         }
3640
3641         if (push_items == 0)
3642                 goto out_unlock;
3643
3644         WARN_ON(!empty && push_items == left_nritems);
3645
3646         /* push left to right */
3647         right_nritems = btrfs_header_nritems(right);
3648
3649         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3650         push_space -= leaf_data_end(root, left);
3651
3652         /* make room in the right data area */
3653         data_end = leaf_data_end(root, right);
3654         memmove_extent_buffer(right,
3655                               btrfs_leaf_data(right) + data_end - push_space,
3656                               btrfs_leaf_data(right) + data_end,
3657                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
3658
3659         /* copy from the left data area */
3660         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3661                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3662                      btrfs_leaf_data(left) + leaf_data_end(root, left),
3663                      push_space);
3664
3665         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3666                               btrfs_item_nr_offset(0),
3667                               right_nritems * sizeof(struct btrfs_item));
3668
3669         /* copy the items from left to right */
3670         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3671                    btrfs_item_nr_offset(left_nritems - push_items),
3672                    push_items * sizeof(struct btrfs_item));
3673
3674         /* update the item pointers */
3675         right_nritems += push_items;
3676         btrfs_set_header_nritems(right, right_nritems);
3677         push_space = BTRFS_LEAF_DATA_SIZE(root);
3678         for (i = 0; i < right_nritems; i++) {
3679                 item = btrfs_item_nr(i);
3680                 push_space -= btrfs_token_item_size(right, item, &token);
3681                 btrfs_set_token_item_offset(right, item, push_space, &token);
3682         }
3683
3684         left_nritems -= push_items;
3685         btrfs_set_header_nritems(left, left_nritems);
3686
3687         if (left_nritems)
3688                 btrfs_mark_buffer_dirty(left);
3689         else
3690                 clean_tree_block(trans, root, left);
3691
3692         btrfs_mark_buffer_dirty(right);
3693
3694         btrfs_item_key(right, &disk_key, 0);
3695         btrfs_set_node_key(upper, &disk_key, slot + 1);
3696         btrfs_mark_buffer_dirty(upper);
3697
3698         /* then fixup the leaf pointer in the path */
3699         if (path->slots[0] >= left_nritems) {
3700                 path->slots[0] -= left_nritems;
3701                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3702                         clean_tree_block(trans, root, path->nodes[0]);
3703                 btrfs_tree_unlock(path->nodes[0]);
3704                 free_extent_buffer(path->nodes[0]);
3705                 path->nodes[0] = right;
3706                 path->slots[1] += 1;
3707         } else {
3708                 btrfs_tree_unlock(right);
3709                 free_extent_buffer(right);
3710         }
3711         return 0;
3712
3713 out_unlock:
3714         btrfs_tree_unlock(right);
3715         free_extent_buffer(right);
3716         return 1;
3717 }
3718
3719 /*
3720  * push some data in the path leaf to the right, trying to free up at
3721  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3722  *
3723  * returns 1 if the push failed because the other node didn't have enough
3724  * room, 0 if everything worked out and < 0 if there were major errors.
3725  *
3726  * this will push starting from min_slot to the end of the leaf.  It won't
3727  * push any slot lower than min_slot
3728  */
3729 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3730                            *root, struct btrfs_path *path,
3731                            int min_data_size, int data_size,
3732                            int empty, u32 min_slot)
3733 {
3734         struct extent_buffer *left = path->nodes[0];
3735         struct extent_buffer *right;
3736         struct extent_buffer *upper;
3737         int slot;
3738         int free_space;
3739         u32 left_nritems;
3740         int ret;
3741
3742         if (!path->nodes[1])
3743                 return 1;
3744
3745         slot = path->slots[1];
3746         upper = path->nodes[1];
3747         if (slot >= btrfs_header_nritems(upper) - 1)
3748                 return 1;
3749
3750         btrfs_assert_tree_locked(path->nodes[1]);
3751
3752         right = read_node_slot(root, upper, slot + 1);
3753         if (right == NULL)
3754                 return 1;
3755
3756         btrfs_tree_lock(right);
3757         btrfs_set_lock_blocking(right);
3758
3759         free_space = btrfs_leaf_free_space(root, right);
3760         if (free_space < data_size)
3761                 goto out_unlock;
3762
3763         /* cow and double check */
3764         ret = btrfs_cow_block(trans, root, right, upper,
3765                               slot + 1, &right);
3766         if (ret)
3767                 goto out_unlock;
3768
3769         free_space = btrfs_leaf_free_space(root, right);
3770         if (free_space < data_size)
3771                 goto out_unlock;
3772
3773         left_nritems = btrfs_header_nritems(left);
3774         if (left_nritems == 0)
3775                 goto out_unlock;
3776
3777         if (path->slots[0] == left_nritems && !empty) {
3778                 /* Key greater than all keys in the leaf, right neighbor has
3779                  * enough room for it and we're not emptying our leaf to delete
3780                  * it, therefore use right neighbor to insert the new item and
3781                  * no need to touch/dirty our left leaft. */
3782                 btrfs_tree_unlock(left);
3783                 free_extent_buffer(left);
3784                 path->nodes[0] = right;
3785                 path->slots[0] = 0;
3786                 path->slots[1]++;
3787                 return 0;
3788         }
3789
3790         return __push_leaf_right(trans, root, path, min_data_size, empty,
3791                                 right, free_space, left_nritems, min_slot);
3792 out_unlock:
3793         btrfs_tree_unlock(right);
3794         free_extent_buffer(right);
3795         return 1;
3796 }
3797
3798 /*
3799  * push some data in the path leaf to the left, trying to free up at
3800  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3801  *
3802  * max_slot can put a limit on how far into the leaf we'll push items.  The
3803  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3804  * items
3805  */
3806 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3807                                      struct btrfs_root *root,
3808                                      struct btrfs_path *path, int data_size,
3809                                      int empty, struct extent_buffer *left,
3810                                      int free_space, u32 right_nritems,
3811                                      u32 max_slot)
3812 {
3813         struct btrfs_disk_key disk_key;
3814         struct extent_buffer *right = path->nodes[0];
3815         int i;
3816         int push_space = 0;
3817         int push_items = 0;
3818         struct btrfs_item *item;
3819         u32 old_left_nritems;
3820         u32 nr;
3821         int ret = 0;
3822         u32 this_item_size;
3823         u32 old_left_item_size;
3824         struct btrfs_map_token token;
3825
3826         btrfs_init_map_token(&token);
3827
3828         if (empty)
3829                 nr = min(right_nritems, max_slot);
3830         else
3831                 nr = min(right_nritems - 1, max_slot);
3832
3833         for (i = 0; i < nr; i++) {
3834                 item = btrfs_item_nr(i);
3835
3836                 if (!empty && push_items > 0) {
3837                         if (path->slots[0] < i)
3838                                 break;
3839                         if (path->slots[0] == i) {
3840                                 int space = btrfs_leaf_free_space(root, right);
3841                                 if (space + push_space * 2 > free_space)
3842                                         break;
3843                         }
3844                 }
3845
3846                 if (path->slots[0] == i)
3847                         push_space += data_size;
3848
3849                 this_item_size = btrfs_item_size(right, item);
3850                 if (this_item_size + sizeof(*item) + push_space > free_space)
3851                         break;
3852
3853                 push_items++;
3854                 push_space += this_item_size + sizeof(*item);
3855         }
3856
3857         if (push_items == 0) {
3858                 ret = 1;
3859                 goto out;
3860         }
3861         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3862
3863         /* push data from right to left */
3864         copy_extent_buffer(left, right,
3865                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3866                            btrfs_item_nr_offset(0),
3867                            push_items * sizeof(struct btrfs_item));
3868
3869         push_space = BTRFS_LEAF_DATA_SIZE(root) -
3870                      btrfs_item_offset_nr(right, push_items - 1);
3871
3872         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3873                      leaf_data_end(root, left) - push_space,
3874                      btrfs_leaf_data(right) +
3875                      btrfs_item_offset_nr(right, push_items - 1),
3876                      push_space);
3877         old_left_nritems = btrfs_header_nritems(left);
3878         BUG_ON(old_left_nritems <= 0);
3879
3880         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3881         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3882                 u32 ioff;
3883
3884                 item = btrfs_item_nr(i);
3885
3886                 ioff = btrfs_token_item_offset(left, item, &token);
3887                 btrfs_set_token_item_offset(left, item,
3888                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3889                       &token);
3890         }
3891         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3892
3893         /* fixup right node */
3894         if (push_items > right_nritems)
3895                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3896                        right_nritems);
3897
3898         if (push_items < right_nritems) {
3899                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3900                                                   leaf_data_end(root, right);
3901                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
3902                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
3903                                       btrfs_leaf_data(right) +
3904                                       leaf_data_end(root, right), push_space);
3905
3906                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3907                               btrfs_item_nr_offset(push_items),
3908                              (btrfs_header_nritems(right) - push_items) *
3909                              sizeof(struct btrfs_item));
3910         }
3911         right_nritems -= push_items;
3912         btrfs_set_header_nritems(right, right_nritems);
3913         push_space = BTRFS_LEAF_DATA_SIZE(root);
3914         for (i = 0; i < right_nritems; i++) {
3915                 item = btrfs_item_nr(i);
3916
3917                 push_space = push_space - btrfs_token_item_size(right,
3918                                                                 item, &token);
3919                 btrfs_set_token_item_offset(right, item, push_space, &token);
3920         }
3921
3922         btrfs_mark_buffer_dirty(left);
3923         if (right_nritems)
3924                 btrfs_mark_buffer_dirty(right);
3925         else
3926                 clean_tree_block(trans, root, right);
3927
3928         btrfs_item_key(right, &disk_key, 0);
3929         fixup_low_keys(root, path, &disk_key, 1);
3930
3931         /* then fixup the leaf pointer in the path */
3932         if (path->slots[0] < push_items) {
3933                 path->slots[0] += old_left_nritems;
3934                 btrfs_tree_unlock(path->nodes[0]);
3935                 free_extent_buffer(path->nodes[0]);
3936                 path->nodes[0] = left;
3937                 path->slots[1] -= 1;
3938         } else {
3939                 btrfs_tree_unlock(left);
3940                 free_extent_buffer(left);
3941                 path->slots[0] -= push_items;
3942         }
3943         BUG_ON(path->slots[0] < 0);
3944         return ret;
3945 out:
3946         btrfs_tree_unlock(left);
3947         free_extent_buffer(left);
3948         return ret;
3949 }
3950
3951 /*
3952  * push some data in the path leaf to the left, trying to free up at
3953  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3954  *
3955  * max_slot can put a limit on how far into the leaf we'll push items.  The
3956  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3957  * items
3958  */
3959 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3960                           *root, struct btrfs_path *path, int min_data_size,
3961                           int data_size, int empty, u32 max_slot)
3962 {
3963         struct extent_buffer *right = path->nodes[0];
3964         struct extent_buffer *left;
3965         int slot;
3966         int free_space;
3967         u32 right_nritems;
3968         int ret = 0;
3969
3970         slot = path->slots[1];
3971         if (slot == 0)
3972                 return 1;
3973         if (!path->nodes[1])
3974                 return 1;
3975
3976         right_nritems = btrfs_header_nritems(right);
3977         if (right_nritems == 0)
3978                 return 1;
3979
3980         btrfs_assert_tree_locked(path->nodes[1]);
3981
3982         left = read_node_slot(root, path->nodes[1], slot - 1);
3983         if (left == NULL)
3984                 return 1;
3985
3986         btrfs_tree_lock(left);
3987         btrfs_set_lock_blocking(left);
3988
3989         free_space = btrfs_leaf_free_space(root, left);
3990         if (free_space < data_size) {
3991                 ret = 1;
3992                 goto out;
3993         }
3994
3995         /* cow and double check */
3996         ret = btrfs_cow_block(trans, root, left,
3997                               path->nodes[1], slot - 1, &left);
3998         if (ret) {
3999                 /* we hit -ENOSPC, but it isn't fatal here */
4000                 if (ret == -ENOSPC)
4001                         ret = 1;
4002                 goto out;
4003         }
4004
4005         free_space = btrfs_leaf_free_space(root, left);
4006         if (free_space < data_size) {
4007                 ret = 1;
4008                 goto out;
4009         }
4010
4011         return __push_leaf_left(trans, root, path, min_data_size,
4012                                empty, left, free_space, right_nritems,
4013                                max_slot);
4014 out:
4015         btrfs_tree_unlock(left);
4016         free_extent_buffer(left);
4017         return ret;
4018 }
4019
/*
 * helper for split_leaf(): move the items [mid, nritems) from leaf 'l'
 * into the freshly allocated, empty leaf 'right', link 'right' into the
 * parent node, and fix up the path so it still points at the slot the
 * caller was positioned on.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    struct extent_buffer *l,
                                    struct extent_buffer *right,
                                    int slot, int mid, int nritems)
{
        int data_copy_size;
        int rt_data_off;
        int i;
        struct btrfs_disk_key disk_key;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        /* the right leaf receives everything from 'mid' onwards */
        nritems = nritems - mid;
        btrfs_set_header_nritems(right, nritems);
        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

        /* copy the item headers ... */
        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
                           btrfs_item_nr_offset(mid),
                           nritems * sizeof(struct btrfs_item));

        /* ... and the item data, packed at the very end of the new leaf */
        copy_extent_buffer(right, l,
                     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
                     data_copy_size, btrfs_leaf_data(l) +
                     leaf_data_end(root, l), data_copy_size);

        /*
         * item data grows down from the end of the leaf; the copied data
         * now ends at BTRFS_LEAF_DATA_SIZE instead of at item_end_nr(mid),
         * so every copied item's offset must shift by the difference
         */
        rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
                      btrfs_item_end_nr(l, mid);

        for (i = 0; i < nritems; i++) {
                struct btrfs_item *item = btrfs_item_nr(i);
                u32 ioff;

                ioff = btrfs_token_item_offset(right, item, &token);
                btrfs_set_token_item_offset(right, item,
                                            ioff + rt_data_off, &token);
        }

        btrfs_set_header_nritems(l, mid);
        /* hook the new leaf into the parent right after the old one */
        btrfs_item_key(right, &disk_key, 0);
        insert_ptr(trans, root, path, &disk_key, right->start,
                   path->slots[1] + 1, 1);

        btrfs_mark_buffer_dirty(right);
        btrfs_mark_buffer_dirty(l);
        BUG_ON(path->slots[0] != slot);

        /* point the path at whichever leaf now holds the original slot */
        if (mid <= slot) {
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
                path->slots[0] -= mid;
                path->slots[1] += 1;
        } else {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }

        BUG_ON(path->slots[0] < 0);
}
4086
4087 /*
4088  * double splits happen when we need to insert a big item in the middle
4089  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4090  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4091  *          A                 B                 C
4092  *
4093  * We avoid this by trying to push the items on either side of our target
4094  * into the adjacent leaves.  If all goes well we can avoid the double split
4095  * completely.
4096  */
4097 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4098                                           struct btrfs_root *root,
4099                                           struct btrfs_path *path,
4100                                           int data_size)
4101 {
4102         int ret;
4103         int progress = 0;
4104         int slot;
4105         u32 nritems;
4106         int space_needed = data_size;
4107
4108         slot = path->slots[0];
4109         if (slot < btrfs_header_nritems(path->nodes[0]))
4110                 space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4111
4112         /*
4113          * try to push all the items after our slot into the
4114          * right leaf
4115          */
4116         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4117         if (ret < 0)
4118                 return ret;
4119
4120         if (ret == 0)
4121                 progress++;
4122
4123         nritems = btrfs_header_nritems(path->nodes[0]);
4124         /*
4125          * our goal is to get our slot at the start or end of a leaf.  If
4126          * we've done so we're done
4127          */
4128         if (path->slots[0] == 0 || path->slots[0] == nritems)
4129                 return 0;
4130
4131         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4132                 return 0;
4133
4134         /* try to push all the items before our slot into the next leaf */
4135         slot = path->slots[0];
4136         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4137         if (ret < 0)
4138                 return ret;
4139
4140         if (ret == 0)
4141                 progress++;
4142
4143         if (progress)
4144                 return 0;
4145         return 1;
4146 }
4147
4148 /*
4149  * split the path's leaf in two, making sure there is at least data_size
4150  * available for the resulting leaf level of the path.
4151  *
4152  * returns 0 if all went well and < 0 on failure.
4153  */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_key *ins_key,
                               struct btrfs_path *path, int data_size,
                               int extend)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *l;
        u32 nritems;
        int mid;
        int slot;
        struct extent_buffer *right;
        int ret = 0;
        int wret;
        int split;              /* 0: new item goes alone, 1: split once, 2: split again */
        int num_doubles = 0;
        int tried_avoid_double = 0;

        l = path->nodes[0];
        slot = path->slots[0];
        /* an item that cannot fit even in an empty leaf cannot be extended */
        if (extend && data_size + btrfs_item_size_nr(l, slot) +
            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
                return -EOVERFLOW;

        /* first try to make some room by pushing left and right */
        if (data_size && path->nodes[1]) {
                int space_needed = data_size;

                if (slot < btrfs_header_nritems(l))
                        space_needed -= btrfs_leaf_free_space(root, l);

                wret = push_leaf_right(trans, root, path, space_needed,
                                       space_needed, 0, 0);
                if (wret < 0)
                        return wret;
                if (wret) {
                        wret = push_leaf_left(trans, root, path, space_needed,
                                              space_needed, 0, (u32)-1);
                        if (wret < 0)
                                return wret;
                }
                l = path->nodes[0];

                /* did the pushes work? */
                if (btrfs_leaf_free_space(root, l) >= data_size)
                        return 0;
        }

        /* splitting the root leaf requires a new root node above it first */
        if (!path->nodes[1]) {
                ret = insert_new_root(trans, root, path, 1);
                if (ret)
                        return ret;
        }
again:
        split = 1;
        l = path->nodes[0];
        slot = path->slots[0];
        nritems = btrfs_header_nritems(l);
        mid = (nritems + 1) / 2;

        /*
         * pick the split point 'mid' so the half that will receive the
         * new item ends up with room for it.  If no single split works,
         * either send the new item alone into the new leaf (split = 0)
         * or schedule a second split (split = 2), first trying
         * push_for_double_split() to avoid the double split entirely.
         */
        if (mid <= slot) {
                if (nritems == 1 ||
                    leaf_space_used(l, mid, nritems - mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(root)) {
                        if (slot >= nritems) {
                                split = 0;
                        } else {
                                mid = slot;
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
                                        if (data_size && !tried_avoid_double)
                                                goto push_for_double;
                                        split = 2;
                                }
                        }
                }
        } else {
                if (leaf_space_used(l, 0, mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(root)) {
                        if (!extend && data_size && slot == 0) {
                                split = 0;
                        } else if ((extend || !data_size) && slot == 0) {
                                mid = 1;
                        } else {
                                mid = slot;
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
                                        if (data_size && !tried_avoid_double)
                                                goto push_for_double;
                                        split = 2;
                                }
                        }
                }
        }

        /* the new leaf's first key: the insert key if it goes in alone */
        if (split == 0)
                btrfs_cpu_key_to_disk(&disk_key, ins_key);
        else
                btrfs_item_key(l, &disk_key, mid);

        right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
                        &disk_key, 0, l->start, 0);
        if (IS_ERR(right))
                return PTR_ERR(right);

        root_add_used(root, root->nodesize);

        /* initialize the header of the new, empty leaf */
        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
        btrfs_set_header_bytenr(right, right->start);
        btrfs_set_header_generation(right, trans->transid);
        btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(right, root->root_key.objectid);
        btrfs_set_header_level(right, 0);
        write_extent_buffer(right, root->fs_info->fsid,
                            btrfs_header_fsid(), BTRFS_FSID_SIZE);

        write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
                            btrfs_header_chunk_tree_uuid(right),
                            BTRFS_UUID_SIZE);

        if (split == 0) {
                /*
                 * nothing to copy: the new item will be inserted alone in
                 * the new leaf, which goes after (mid <= slot) or before
                 * the existing leaf in the parent
                 */
                if (mid <= slot) {
                        btrfs_set_header_nritems(right, 0);
                        insert_ptr(trans, root, path, &disk_key, right->start,
                                   path->slots[1] + 1, 1);
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
                        path->slots[0] = 0;
                        path->slots[1] += 1;
                } else {
                        btrfs_set_header_nritems(right, 0);
                        insert_ptr(trans, root, path, &disk_key, right->start,
                                          path->slots[1], 1);
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
                        path->slots[0] = 0;
                        if (path->slots[1] == 0)
                                fixup_low_keys(root, path, &disk_key, 1);
                }
                btrfs_mark_buffer_dirty(right);
                return ret;
        }

        copy_for_split(trans, root, path, l, right, slot, mid, nritems);

        if (split == 2) {
                /* a double split is only ever needed once */
                BUG_ON(num_doubles != 0);
                num_doubles++;
                goto again;
        }

        return 0;

push_for_double:
        /* best effort; if it fails we just fall back to splitting twice */
        push_for_double_split(trans, root, path, data_size);
        tried_avoid_double = 1;
        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
                return 0;
        goto again;
}
4318
4319 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4320                                          struct btrfs_root *root,
4321                                          struct btrfs_path *path, int ins_len)
4322 {
4323         struct btrfs_key key;
4324         struct extent_buffer *leaf;
4325         struct btrfs_file_extent_item *fi;
4326         u64 extent_len = 0;
4327         u32 item_size;
4328         int ret;
4329
4330         leaf = path->nodes[0];
4331         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4332
4333         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4334                key.type != BTRFS_EXTENT_CSUM_KEY);
4335
4336         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4337                 return 0;
4338
4339         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4340         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4341                 fi = btrfs_item_ptr(leaf, path->slots[0],
4342                                     struct btrfs_file_extent_item);
4343                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4344         }
4345         btrfs_release_path(path);
4346
4347         path->keep_locks = 1;
4348         path->search_for_split = 1;
4349         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4350         path->search_for_split = 0;
4351         if (ret < 0)
4352                 goto err;
4353
4354         ret = -EAGAIN;
4355         leaf = path->nodes[0];
4356         /* if our item isn't there or got smaller, return now */
4357         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4358                 goto err;
4359
4360         /* the leaf has  changed, it now has room.  return now */
4361         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4362                 goto err;
4363
4364         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4365                 fi = btrfs_item_ptr(leaf, path->slots[0],
4366                                     struct btrfs_file_extent_item);
4367                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4368                         goto err;
4369         }
4370
4371         btrfs_set_path_blocking(path);
4372         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4373         if (ret)
4374                 goto err;
4375
4376         path->keep_locks = 0;
4377         btrfs_unlock_up_safe(path, 1);
4378         return 0;
4379 err:
4380         path->keep_locks = 0;
4381         return ret;
4382 }
4383
/*
 * split a single item in place: the first split_offset bytes keep the
 * original key, the remainder becomes a new item carrying 'new_key'
 * inserted right after it in the same leaf.
 *
 * The caller (via setup_leaf_for_split) guarantees the leaf has room
 * for one extra struct btrfs_item header.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path,
                               struct btrfs_key *new_key,
                               unsigned long split_offset)
{
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        struct btrfs_item *new_item;
        int slot;
        char *buf;
        u32 nritems;
        u32 item_size;
        u32 orig_offset;
        struct btrfs_disk_key disk_key;

        leaf = path->nodes[0];
        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

        btrfs_set_path_blocking(path);

        item = btrfs_item_nr(path->slots[0]);
        orig_offset = btrfs_item_offset(leaf, item);
        item_size = btrfs_item_size(leaf, item);

        /* stash a copy of the item data; it gets rewritten in two pieces */
        buf = kmalloc(item_size, GFP_NOFS);
        if (!buf)
                return -ENOMEM;

        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
                            path->slots[0]), item_size);

        /* make room for the new item header right after the original */
        slot = path->slots[0] + 1;
        nritems = btrfs_header_nritems(leaf);
        if (slot != nritems) {
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
                                btrfs_item_nr_offset(slot),
                                (nritems - slot) * sizeof(struct btrfs_item));
        }

        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(leaf, &disk_key, slot);

        new_item = btrfs_item_nr(slot);

        /*
         * the new item takes the tail of the data (item_size -
         * split_offset bytes), which already sits at orig_offset; the
         * original item shrinks to split_offset bytes and moves up
         */
        btrfs_set_item_offset(leaf, new_item, orig_offset);
        btrfs_set_item_size(leaf, new_item, item_size - split_offset);

        btrfs_set_item_offset(leaf, item,
                              orig_offset + item_size - split_offset);
        btrfs_set_item_size(leaf, item, split_offset);

        btrfs_set_header_nritems(leaf, nritems + 1);

        /* write the data for the start of the original item */
        write_extent_buffer(leaf, buf,
                            btrfs_item_ptr_offset(leaf, path->slots[0]),
                            split_offset);

        /* write the data for the new item */
        write_extent_buffer(leaf, buf + split_offset,
                            btrfs_item_ptr_offset(leaf, slot),
                            item_size - split_offset);
        btrfs_mark_buffer_dirty(leaf);

        BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
        kfree(buf);
        return 0;
}
4454
4455 /*
4456  * This function splits a single item into two items,
4457  * giving 'new_key' to the new item and splitting the
4458  * old one at split_offset (from the start of the item).
4459  *
4460  * The path may be released by this operation.  After
4461  * the split, the path is pointing to the old item.  The
4462  * new item is going to be in the same node as the old one.
4463  *
4464  * Note, the item being split must be smaller enough to live alone on
4465  * a tree block with room for one extra struct btrfs_item
4466  *
4467  * This allows us to split the item in place, keeping a lock on the
4468  * leaf the entire time.
4469  */
4470 int btrfs_split_item(struct btrfs_trans_handle *trans,
4471                      struct btrfs_root *root,
4472                      struct btrfs_path *path,
4473                      struct btrfs_key *new_key,
4474                      unsigned long split_offset)
4475 {
4476         int ret;
4477         ret = setup_leaf_for_split(trans, root, path,
4478                                    sizeof(struct btrfs_item));
4479         if (ret)
4480                 return ret;
4481
4482         ret = split_item(trans, root, path, new_key, split_offset);
4483         return ret;
4484 }
4485
4486 /*
4487  * This function duplicate a item, giving 'new_key' to the new item.
4488  * It guarantees both items live in the same tree leaf and the new item
4489  * is contiguous with the original item.
4490  *
4491  * This allows us to split file extent in place, keeping a lock on the
4492  * leaf the entire time.
4493  */
4494 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4495                          struct btrfs_root *root,
4496                          struct btrfs_path *path,
4497                          struct btrfs_key *new_key)
4498 {
4499         struct extent_buffer *leaf;
4500         int ret;
4501         u32 item_size;
4502
4503         leaf = path->nodes[0];
4504         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4505         ret = setup_leaf_for_split(trans, root, path,
4506                                    item_size + sizeof(struct btrfs_item));
4507         if (ret)
4508                 return ret;
4509
4510         path->slots[0]++;
4511         setup_items_for_insert(root, path, new_key, &item_size,
4512                                item_size, item_size +
4513                                sizeof(struct btrfs_item), 1);
4514         leaf = path->nodes[0];
4515         memcpy_extent_buffer(leaf,
4516                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4517                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4518                              item_size);
4519         return 0;
4520 }
4521
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
                         u32 new_size, int from_end)
{
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data_start;
        unsigned int old_size;
        unsigned int size_diff;
        int i;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        leaf = path->nodes[0];
        slot = path->slots[0];

        old_size = btrfs_item_size_nr(leaf, slot);
        if (old_size == new_size)
                return;

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        old_data_start = btrfs_item_offset_nr(leaf, slot);

        size_diff = old_size - new_size;

        BUG_ON(slot < 0);
        BUG_ON(slot >= nritems);

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(i);

                ioff = btrfs_token_item_offset(leaf, item, &token);
                btrfs_set_token_item_offset(leaf, item,
                                            ioff + size_diff, &token);
        }

        /* shift the data */
        if (from_end) {
                /* chopping the tail: slide the data below this item up */
                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + size_diff, btrfs_leaf_data(leaf) +
                              data_end, old_data_start + new_size - data_end);
        } else {
                struct btrfs_disk_key disk_key;
                u64 offset;

                btrfs_item_key(leaf, &disk_key, slot);

                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
                        unsigned long ptr;
                        struct btrfs_file_extent_item *fi;

                        fi = btrfs_item_ptr(leaf, slot,
                                            struct btrfs_file_extent_item);
                        fi = (struct btrfs_file_extent_item *)(
                             (unsigned long)fi - size_diff);

                        /*
                         * an inline extent keeps its header at the start
                         * of the item, so move the header down to where
                         * the shifted item will begin
                         */
                        if (btrfs_file_extent_type(leaf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE) {
                                ptr = btrfs_item_ptr_offset(leaf, slot);
                                memmove_extent_buffer(leaf, ptr,
                                      (unsigned long)fi,
                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
                        }
                }

                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + size_diff, btrfs_leaf_data(leaf) +
                              data_end, old_data_start - data_end);

                /* chopping the front moves the key offset forward too */
                offset = btrfs_disk_key_offset(&disk_key);
                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
                btrfs_set_item_key(leaf, &disk_key, slot);
                if (slot == 0)
                        fixup_low_keys(root, path, &disk_key, 1);
        }

        item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
}
4623
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 * The caller must guarantee the leaf has at least data_size bytes free.
 */
void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
                       u32 data_size)
{
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data;
        unsigned int old_size;
        int i;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        leaf = path->nodes[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        if (btrfs_leaf_free_space(root, leaf) < data_size) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        slot = path->slots[0];
        old_data = btrfs_item_end_nr(leaf, slot);

        BUG_ON(slot < 0);
        if (slot >= nritems) {
                btrfs_print_leaf(root, leaf);
                btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
                       slot, nritems);
                BUG_ON(1);
        }

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(i);

                /*
                 * data grows down from the end of the leaf, so growing an
                 * item moves this item and everything after it toward the
                 * item headers, i.e. to smaller offsets
                 */
                ioff = btrfs_token_item_offset(leaf, item, &token);
                btrfs_set_token_item_offset(leaf, item,
                                            ioff - data_size, &token);
        }

        /* shift the data */
        memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                      data_end - data_size, btrfs_leaf_data(leaf) +
                      data_end, old_data - data_end);

        /* the added bytes sit at the end of the item, uninitialized */
        data_end = old_data;
        old_size = btrfs_item_size_nr(leaf, slot);
        item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
}
4691
4692 /*
4693  * this is a helper for btrfs_insert_empty_items, the main goal here is
4694  * to save stack depth by doing the bulk of the work in a function
4695  * that doesn't call btrfs_search_slot
4696  */
4697 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4698                             struct btrfs_key *cpu_key, u32 *data_size,
4699                             u32 total_data, u32 total_size, int nr)
4700 {
4701         struct btrfs_item *item;
4702         int i;
4703         u32 nritems;
4704         unsigned int data_end;
4705         struct btrfs_disk_key disk_key;
4706         struct extent_buffer *leaf;
4707         int slot;
4708         struct btrfs_map_token token;
4709
4710         if (path->slots[0] == 0) {
4711                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4712                 fixup_low_keys(root, path, &disk_key, 1);
4713         }
4714         btrfs_unlock_up_safe(path, 1);
4715
4716         btrfs_init_map_token(&token);
4717
4718         leaf = path->nodes[0];
4719         slot = path->slots[0];
4720
4721         nritems = btrfs_header_nritems(leaf);
4722         data_end = leaf_data_end(root, leaf);
4723
4724         if (btrfs_leaf_free_space(root, leaf) < total_size) {
4725                 btrfs_print_leaf(root, leaf);
4726                 btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4727                        total_size, btrfs_leaf_free_space(root, leaf));
4728                 BUG();
4729         }
4730
4731         if (slot != nritems) {
4732                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4733
4734                 if (old_data < data_end) {
4735                         btrfs_print_leaf(root, leaf);
4736                         btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4737                                slot, old_data, data_end);
4738                         BUG_ON(1);
4739                 }
4740                 /*
4741                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4742                  */
4743                 /* first correct the data pointers */
4744                 for (i = slot; i < nritems; i++) {
4745                         u32 ioff;
4746
4747                         item = btrfs_item_nr( i);
4748                         ioff = btrfs_token_item_offset(leaf, item, &token);
4749                         btrfs_set_token_item_offset(leaf, item,
4750                                                     ioff - total_data, &token);
4751                 }
4752                 /* shift the items */
4753                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4754                               btrfs_item_nr_offset(slot),
4755                               (nritems - slot) * sizeof(struct btrfs_item));
4756
4757                 /* shift the data */
4758                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4759                               data_end - total_data, btrfs_leaf_data(leaf) +
4760                               data_end, old_data - data_end);
4761                 data_end = old_data;
4762         }
4763
4764         /* setup the item for the new data */
4765         for (i = 0; i < nr; i++) {
4766                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4767                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4768                 item = btrfs_item_nr(slot + i);
4769                 btrfs_set_token_item_offset(leaf, item,
4770                                             data_end - data_size[i], &token);
4771                 data_end -= data_size[i];
4772                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4773         }
4774
4775         btrfs_set_header_nritems(leaf, nritems + nr);
4776         btrfs_mark_buffer_dirty(leaf);
4777
4778         if (btrfs_leaf_free_space(root, leaf) < 0) {
4779                 btrfs_print_leaf(root, leaf);
4780                 BUG();
4781         }
4782 }
4783
4784 /*
4785  * Given a key and some data, insert items into the tree.
4786  * This does all the path init required, making room in the tree if needed.
4787  */
4788 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4789                             struct btrfs_root *root,
4790                             struct btrfs_path *path,
4791                             struct btrfs_key *cpu_key, u32 *data_size,
4792                             int nr)
4793 {
4794         int ret = 0;
4795         int slot;
4796         int i;
4797         u32 total_size = 0;
4798         u32 total_data = 0;
4799
4800         for (i = 0; i < nr; i++)
4801                 total_data += data_size[i];
4802
4803         total_size = total_data + (nr * sizeof(struct btrfs_item));
4804         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4805         if (ret == 0)
4806                 return -EEXIST;
4807         if (ret < 0)
4808                 return ret;
4809
4810         slot = path->slots[0];
4811         BUG_ON(slot < 0);
4812
4813         setup_items_for_insert(root, path, cpu_key, data_size,
4814                                total_data, total_size, nr);
4815         return 0;
4816 }
4817
4818 /*
4819  * Given a key and some data, insert an item into the tree.
4820  * This does all the path init required, making room in the tree if needed.
4821  */
4822 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4823                       *root, struct btrfs_key *cpu_key, void *data, u32
4824                       data_size)
4825 {
4826         int ret = 0;
4827         struct btrfs_path *path;
4828         struct extent_buffer *leaf;
4829         unsigned long ptr;
4830
4831         path = btrfs_alloc_path();
4832         if (!path)
4833                 return -ENOMEM;
4834         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4835         if (!ret) {
4836                 leaf = path->nodes[0];
4837                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4838                 write_extent_buffer(leaf, data, ptr, data_size);
4839                 btrfs_mark_buffer_dirty(leaf);
4840         }
4841         btrfs_free_path(path);
4842         return ret;
4843 }
4844
4845 /*
4846  * delete the pointer from a given node.
4847  *
4848  * the tree should have been previously balanced so the deletion does not
4849  * empty a node.
4850  */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot)
{
        struct extent_buffer *parent = path->nodes[level];
        u32 nritems;
        int ret;

        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
                /*
                 * Not the last pointer: record the shift in the tree mod
                 * log (internal nodes only) and slide the following key
                 * pointers left over the deleted one.
                 */
                if (level)
                        tree_mod_log_eb_move(root->fs_info, parent, slot,
                                             slot + 1, nritems - slot - 1);
                memmove_extent_buffer(parent,
                              btrfs_node_key_ptr_offset(slot),
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
        } else if (level) {
                /* deleting the last pointer: only the removal is logged */
                ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
                                              MOD_LOG_KEY_REMOVE, GFP_NOFS);
                BUG_ON(ret < 0);
        }

        nritems--;
        btrfs_set_header_nritems(parent, nritems);
        if (nritems == 0 && parent == root->node) {
                BUG_ON(btrfs_header_level(root->node) != 1);
                /* just turn the root into a leaf and break */
                btrfs_set_header_level(root->node, 0);
        } else if (slot == 0) {
                /*
                 * The node's first key changed; propagate the new first
                 * key to the parent levels of the path.
                 */
                struct btrfs_disk_key disk_key;

                btrfs_node_key(parent, &disk_key, 0);
                fixup_low_keys(root, path, &disk_key, level + 1);
        }
        btrfs_mark_buffer_dirty(parent);
}
4888
4889 /*
4890  * a helper function to delete the leaf pointed to by path->slots[1] and
4891  * path->nodes[1].
4892  *
4893  * This deletes the pointer in path->nodes[1] and frees the leaf
4894  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4895  *
4896  * The path must have already been setup for deleting the leaf, including
4897  * all the proper balancing.  path->nodes[1] must be locked.
4898  */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    struct extent_buffer *leaf)
{
        /* the leaf must belong to the running transaction to be deleted */
        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
        del_ptr(root, path, 1, path->slots[1]);

        /*
         * btrfs_free_extent is expensive, we want to make sure we
         * aren't holding any locks when we call it
         */
        btrfs_unlock_up_safe(path, 0);

        root_sub_used(root, leaf->len);

        /* take an extra ref so the buffer survives until marked stale */
        extent_buffer_get(leaf);
        btrfs_free_tree_block(trans, root, leaf, 0, 1);
        free_extent_buffer_stale(leaf);
}
4919 /*
4920  * delete the item at the leaf level in path.  If that empties
4921  * the leaf, remove it from the tree
4922  */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                    struct btrfs_path *path, int slot, int nr)
{
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        int last_off;
        int dsize = 0;
        int ret = 0;
        int wret;
        int i;
        u32 nritems;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        leaf = path->nodes[0];
        /* data offset of the last item in the range being deleted */
        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

        /* dsize = total data bytes occupied by the deleted items */
        for (i = 0; i < nr; i++)
                dsize += btrfs_item_size_nr(leaf, slot + i);

        nritems = btrfs_header_nritems(leaf);

        if (slot + nr != nritems) {
                /* deleting from the middle: compact data and item headers */
                int data_end = leaf_data_end(root, leaf);

                /* slide the surviving items' data over the freed hole */
                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + dsize,
                              btrfs_leaf_data(leaf) + data_end,
                              last_off - data_end);

                /* the data moved up by dsize; fix each item's offset */
                for (i = slot + nr; i < nritems; i++) {
                        u32 ioff;

                        item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(leaf, item, &token);
                        btrfs_set_token_item_offset(leaf, item,
                                                    ioff + dsize, &token);
                }

                /* close the gap in the item header array */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
                              btrfs_item_nr_offset(slot + nr),
                              sizeof(struct btrfs_item) *
                              (nritems - slot - nr));
        }
        btrfs_set_header_nritems(leaf, nritems - nr);
        nritems -= nr;

        /* delete the leaf if we've emptied it */
        if (nritems == 0) {
                if (leaf == root->node) {
                        /* an empty root leaf just stays as an empty leaf */
                        btrfs_set_header_level(leaf, 0);
                } else {
                        btrfs_set_path_blocking(path);
                        clean_tree_block(trans, root, leaf);
                        btrfs_del_leaf(trans, root, path, leaf);
                }
        } else {
                int used = leaf_space_used(leaf, 0, nritems);
                if (slot == 0) {
                        /* first key changed; update the parent keys */
                        struct btrfs_disk_key disk_key;

                        btrfs_item_key(leaf, &disk_key, 0);
                        fixup_low_keys(root, path, &disk_key, 1);
                }

                /* delete the leaf if it is mostly empty */
                if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to del_ptr below
                         */
                        slot = path->slots[1];
                        extent_buffer_get(leaf);

                        btrfs_set_path_blocking(path);
                        wret = push_leaf_left(trans, root, path, 1, 1,
                                              1, (u32)-1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;

                        if (path->nodes[0] == leaf &&
                            btrfs_header_nritems(leaf)) {
                                wret = push_leaf_right(trans, root, path, 1,
                                                       1, 1, 0);
                                if (wret < 0 && wret != -ENOSPC)
                                        ret = wret;
                        }

                        if (btrfs_header_nritems(leaf) == 0) {
                                /* everything was pushed away; drop the leaf */
                                path->slots[1] = slot;
                                btrfs_del_leaf(trans, root, path, leaf);
                                free_extent_buffer(leaf);
                                ret = 0;
                        } else {
                                /* if we're still in the path, make sure
                                 * we're dirty.  Otherwise, one of the
                                 * push_leaf functions must have already
                                 * dirtied this buffer
                                 */
                                if (path->nodes[0] == leaf)
                                        btrfs_mark_buffer_dirty(leaf);
                                free_extent_buffer(leaf);
                        }
                } else {
                        btrfs_mark_buffer_dirty(leaf);
                }
        }
        return ret;
}
5033
5034 /*
5035  * search the tree again to find a leaf with lesser keys
5036  * returns 0 if it found something or 1 if there are no lesser leaves.
5037  * returns < 0 on io errors.
5038  *
5039  * This may release the path, and so you may lose any locks held at the
5040  * time you call it.
5041  */
5042 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5043 {
5044         struct btrfs_key key;
5045         struct btrfs_disk_key found_key;
5046         int ret;
5047
5048         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5049
5050         if (key.offset > 0) {
5051                 key.offset--;
5052         } else if (key.type > 0) {
5053                 key.type--;
5054                 key.offset = (u64)-1;
5055         } else if (key.objectid > 0) {
5056                 key.objectid--;
5057                 key.type = (u8)-1;
5058                 key.offset = (u64)-1;
5059         } else {
5060                 return 1;
5061         }
5062
5063         btrfs_release_path(path);
5064         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5065         if (ret < 0)
5066                 return ret;
5067         btrfs_item_key(path->nodes[0], &found_key, 0);
5068         ret = comp_keys(&found_key, &key);
5069         /*
5070          * We might have had an item with the previous key in the tree right
5071          * before we released our path. And after we released our path, that
5072          * item might have been pushed to the first slot (0) of the leaf we
5073          * were holding due to a tree balance. Alternatively, an item with the
5074          * previous key can exist as the only element of a leaf (big fat item).
5075          * Therefore account for these 2 cases, so that our callers (like
5076          * btrfs_previous_item) don't miss an existing item with a key matching
5077          * the previous key we computed above.
5078          */
5079         if (ret <= 0)
5080                 return 0;
5081         return 1;
5082 }
5083
5084 /*
5085  * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
5087  * This is used by the btree defrag code, and tree logging
5088  *
5089  * This does not cow, but it does stuff the starting key it finds back
5090  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5091  * key and get a writable path.
5092  *
5093  * This does lock as it descends, and path->keep_locks should be set
5094  * to 1 by the caller.
5095  *
5096  * This honors path->lowest_level to prevent descent past a given level
5097  * of the tree.
5098  *
5099  * min_trans indicates the oldest transaction that you are interested
5100  * in walking through.  Any nodes or leaves older than min_trans are
5101  * skipped over (without reading them).
5102  *
5103  * returns zero if something useful was found, < 0 on error and 1 if there
5104  * was nothing in the tree that matched the search criteria.
5105  */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
                         struct btrfs_path *path,
                         u64 min_trans)
{
        struct extent_buffer *cur;
        struct btrfs_key found_key;
        int slot;
        int sret;
        u32 nritems;
        int level;
        int ret = 1;
        int keep_locks = path->keep_locks;

        /* force keep_locks on for the walk; restored at out: */
        path->keep_locks = 1;
again:
        cur = btrfs_read_lock_root_node(root);
        level = btrfs_header_level(cur);
        WARN_ON(path->nodes[level]);
        path->nodes[level] = cur;
        path->locks[level] = BTRFS_READ_LOCK;

        /* the whole tree is too old: nothing to find */
        if (btrfs_header_generation(cur) < min_trans) {
                ret = 1;
                goto out;
        }
        while (1) {
                nritems = btrfs_header_nritems(cur);
                level = btrfs_header_level(cur);
                sret = bin_search(cur, min_key, level, &slot);

                /* at the lowest level, we're done, setup the path and exit */
                if (level == path->lowest_level) {
                        if (slot >= nritems)
                                goto find_next_key;
                        ret = 0;
                        path->slots[level] = slot;
                        btrfs_item_key_to_cpu(cur, &found_key, slot);
                        goto out;
                }
                if (sret && slot > 0)
                        slot--;
                /*
                 * check this node pointer against the min_trans parameter.
                 * If it is too old, skip to the next one.
                 */
                while (slot < nritems) {
                        u64 gen;

                        gen = btrfs_node_ptr_generation(cur, slot);
                        if (gen < min_trans) {
                                slot++;
                                continue;
                        }
                        break;
                }
find_next_key:
                /*
                 * we didn't find a candidate key in this node, walk forward
                 * and find another one
                 */
                if (slot >= nritems) {
                        path->slots[level] = slot;
                        btrfs_set_path_blocking(path);
                        sret = btrfs_find_next_key(root, path, min_key, level,
                                                  min_trans);
                        if (sret == 0) {
                                /* restart the descent with the advanced key */
                                btrfs_release_path(path);
                                goto again;
                        } else {
                                goto out;
                        }
                }
                /* save our key for returning back */
                btrfs_node_key_to_cpu(cur, &found_key, slot);
                path->slots[level] = slot;
                if (level == path->lowest_level) {
                        ret = 0;
                        goto out;
                }
                btrfs_set_path_blocking(path);
                cur = read_node_slot(root, cur, slot);
                BUG_ON(!cur); /* -ENOMEM */

                /* lock the child before releasing locks above it */
                btrfs_tree_read_lock(cur);

                path->locks[level - 1] = BTRFS_READ_LOCK;
                path->nodes[level - 1] = cur;
                unlock_up(path, level, 1, 0, NULL);
                btrfs_clear_path_blocking(path, NULL, 0);
        }
out:
        path->keep_locks = keep_locks;
        if (ret == 0) {
                btrfs_unlock_up_safe(path, path->lowest_level + 1);
                btrfs_set_path_blocking(path);
                /* hand the key we stopped at back to the caller */
                memcpy(min_key, &found_key, sizeof(found_key));
        }
        return ret;
}
5205
5206 static void tree_move_down(struct btrfs_root *root,
5207                            struct btrfs_path *path,
5208                            int *level, int root_level)
5209 {
5210         BUG_ON(*level == 0);
5211         path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5212                                         path->slots[*level]);
5213         path->slots[*level - 1] = 0;
5214         (*level)--;
5215 }
5216
5217 static int tree_move_next_or_upnext(struct btrfs_root *root,
5218                                     struct btrfs_path *path,
5219                                     int *level, int root_level)
5220 {
5221         int ret = 0;
5222         int nritems;
5223         nritems = btrfs_header_nritems(path->nodes[*level]);
5224
5225         path->slots[*level]++;
5226
5227         while (path->slots[*level] >= nritems) {
5228                 if (*level == root_level)
5229                         return -1;
5230
5231                 /* move upnext */
5232                 path->slots[*level] = 0;
5233                 free_extent_buffer(path->nodes[*level]);
5234                 path->nodes[*level] = NULL;
5235                 (*level)++;
5236                 path->slots[*level]++;
5237
5238                 nritems = btrfs_header_nritems(path->nodes[*level]);
5239                 ret = 1;
5240         }
5241         return ret;
5242 }
5243
5244 /*
5245  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5246  * or down.
5247  */
5248 static int tree_advance(struct btrfs_root *root,
5249                         struct btrfs_path *path,
5250                         int *level, int root_level,
5251                         int allow_down,
5252                         struct btrfs_key *key)
5253 {
5254         int ret;
5255
5256         if (*level == 0 || !allow_down) {
5257                 ret = tree_move_next_or_upnext(root, path, level, root_level);
5258         } else {
5259                 tree_move_down(root, path, level, root_level);
5260                 ret = 0;
5261         }
5262         if (ret >= 0) {
5263                 if (*level == 0)
5264                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5265                                         path->slots[*level]);
5266                 else
5267                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5268                                         path->slots[*level]);
5269         }
5270         return ret;
5271 }
5272
5273 static int tree_compare_item(struct btrfs_root *left_root,
5274                              struct btrfs_path *left_path,
5275                              struct btrfs_path *right_path,
5276                              char *tmp_buf)
5277 {
5278         int cmp;
5279         int len1, len2;
5280         unsigned long off1, off2;
5281
5282         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5283         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5284         if (len1 != len2)
5285                 return 1;
5286
5287         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5288         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5289                                 right_path->slots[0]);
5290
5291         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5292
5293         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5294         if (cmp)
5295                 return 1;
5296         return 0;
5297 }
5298
5299 #define ADVANCE 1
5300 #define ADVANCE_ONLY_NEXT -1
5301
5302 /*
5303  * This function compares two trees and calls the provided callback for
5304  * every changed/new/deleted item it finds.
5305  * If shared tree blocks are encountered, whole subtrees are skipped, making
5306  * the compare pretty fast on snapshotted subvolumes.
5307  *
5308  * This currently works on commit roots only. As commit roots are read only,
5309  * we don't do any locking. The commit roots are protected with transactions.
5310  * Transactions are ended and rejoined when a commit is tried in between.
5311  *
5312  * This function checks for modifications done to the trees while comparing.
5313  * If it detects a change, it aborts immediately.
5314  */
5315 int btrfs_compare_trees(struct btrfs_root *left_root,
5316                         struct btrfs_root *right_root,
5317                         btrfs_changed_cb_t changed_cb, void *ctx)
5318 {
5319         int ret;
5320         int cmp;
5321         struct btrfs_path *left_path = NULL;
5322         struct btrfs_path *right_path = NULL;
5323         struct btrfs_key left_key;
5324         struct btrfs_key right_key;
5325         char *tmp_buf = NULL;
5326         int left_root_level;
5327         int right_root_level;
5328         int left_level;
5329         int right_level;
5330         int left_end_reached;
5331         int right_end_reached;
5332         int advance_left;
5333         int advance_right;
5334         u64 left_blockptr;
5335         u64 right_blockptr;
5336         u64 left_gen;
5337         u64 right_gen;
5338
5339         left_path = btrfs_alloc_path();
5340         if (!left_path) {
5341                 ret = -ENOMEM;
5342                 goto out;
5343         }
5344         right_path = btrfs_alloc_path();
5345         if (!right_path) {
5346                 ret = -ENOMEM;
5347                 goto out;
5348         }
5349
5350         tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5351         if (!tmp_buf) {
5352                 ret = -ENOMEM;
5353                 goto out;
5354         }
5355
5356         left_path->search_commit_root = 1;
5357         left_path->skip_locking = 1;
5358         right_path->search_commit_root = 1;
5359         right_path->skip_locking = 1;
5360
5361         /*
5362          * Strategy: Go to the first items of both trees. Then do
5363          *
5364          * If both trees are at level 0
5365          *   Compare keys of current items
5366          *     If left < right treat left item as new, advance left tree
5367          *       and repeat
5368          *     If left > right treat right item as deleted, advance right tree
5369          *       and repeat
5370          *     If left == right do deep compare of items, treat as changed if
5371          *       needed, advance both trees and repeat
5372          * If both trees are at the same level but not at level 0
5373          *   Compare keys of current nodes/leafs
5374          *     If left < right advance left tree and repeat
5375          *     If left > right advance right tree and repeat
5376          *     If left == right compare blockptrs of the next nodes/leafs
5377          *       If they match advance both trees but stay at the same level
5378          *         and repeat
5379          *       If they don't match advance both trees while allowing to go
5380          *         deeper and repeat
5381          * If tree levels are different
5382          *   Advance the tree that needs it and repeat
5383          *
5384          * Advancing a tree means:
5385          *   If we are at level 0, try to go to the next slot. If that's not
5386          *   possible, go one level up and repeat. Stop when we found a level
5387          *   where we could go to the next slot. We may at this point be on a
5388          *   node or a leaf.
5389          *
5390          *   If we are not at level 0 and not on shared tree blocks, go one
5391          *   level deeper.
5392          *
5393          *   If we are not at level 0 and on shared tree blocks, go one slot to
5394          *   the right if possible or go up and right.
5395          */
5396
5397         down_read(&left_root->fs_info->commit_root_sem);
5398         left_level = btrfs_header_level(left_root->commit_root);
5399         left_root_level = left_level;
5400         left_path->nodes[left_level] = left_root->commit_root;
5401         extent_buffer_get(left_path->nodes[left_level]);
5402
5403         right_level = btrfs_header_level(right_root->commit_root);
5404         right_root_level = right_level;
5405         right_path->nodes[right_level] = right_root->commit_root;
5406         extent_buffer_get(right_path->nodes[right_level]);
5407         up_read(&left_root->fs_info->commit_root_sem);
5408
5409         if (left_level == 0)
5410                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5411                                 &left_key, left_path->slots[left_level]);
5412         else
5413                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5414                                 &left_key, left_path->slots[left_level]);
5415         if (right_level == 0)
5416                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5417                                 &right_key, right_path->slots[right_level]);
5418         else
5419                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5420                                 &right_key, right_path->slots[right_level]);
5421
5422         left_end_reached = right_end_reached = 0;
5423         advance_left = advance_right = 0;
5424
5425         while (1) {
5426                 if (advance_left && !left_end_reached) {
5427                         ret = tree_advance(left_root, left_path, &left_level,
5428                                         left_root_level,
5429                                         advance_left != ADVANCE_ONLY_NEXT,
5430                                         &left_key);
5431                         if (ret < 0)
5432                                 left_end_reached = ADVANCE;
5433                         advance_left = 0;
5434                 }
5435                 if (advance_right && !right_end_reached) {
5436                         ret = tree_advance(right_root, right_path, &right_level,
5437                                         right_root_level,
5438                                         advance_right != ADVANCE_ONLY_NEXT,
5439                                         &right_key);
5440                         if (ret < 0)
5441                                 right_end_reached = ADVANCE;
5442                         advance_right = 0;
5443                 }
5444
5445                 if (left_end_reached && right_end_reached) {
5446                         ret = 0;
5447                         goto out;
5448                 } else if (left_end_reached) {
5449                         if (right_level == 0) {
5450                                 ret = changed_cb(left_root, right_root,
5451                                                 left_path, right_path,
5452                                                 &right_key,
5453                                                 BTRFS_COMPARE_TREE_DELETED,
5454                                                 ctx);
5455                                 if (ret < 0)
5456                                         goto out;
5457                         }
5458                         advance_right = ADVANCE;
5459                         continue;
5460                 } else if (right_end_reached) {
5461                         if (left_level == 0) {
5462                                 ret = changed_cb(left_root, right_root,
5463                                                 left_path, right_path,
5464                                                 &left_key,
5465                                                 BTRFS_COMPARE_TREE_NEW,
5466                                                 ctx);
5467                                 if (ret < 0)
5468                                         goto out;
5469                         }
5470                         advance_left = ADVANCE;
5471                         continue;
5472                 }
5473
5474                 if (left_level == 0 && right_level == 0) {
5475                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5476                         if (cmp < 0) {
5477                                 ret = changed_cb(left_root, right_root,
5478                                                 left_path, right_path,
5479                                                 &left_key,
5480                                                 BTRFS_COMPARE_TREE_NEW,
5481                                                 ctx);
5482                                 if (ret < 0)
5483                                         goto out;
5484                                 advance_left = ADVANCE;
5485                         } else if (cmp > 0) {
5486                                 ret = changed_cb(left_root, right_root,
5487                                                 left_path, right_path,
5488                                                 &right_key,
5489                                                 BTRFS_COMPARE_TREE_DELETED,
5490                                                 ctx);
5491                                 if (ret < 0)
5492                                         goto out;
5493                                 advance_right = ADVANCE;
5494                         } else {
5495                                 enum btrfs_compare_tree_result result;
5496
5497                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5498                                 ret = tree_compare_item(left_root, left_path,
5499                                                 right_path, tmp_buf);
5500                                 if (ret)
5501                                         result = BTRFS_COMPARE_TREE_CHANGED;
5502                                 else
5503                                         result = BTRFS_COMPARE_TREE_SAME;
5504                                 ret = changed_cb(left_root, right_root,
5505                                                  left_path, right_path,
5506                                                  &left_key, result, ctx);
5507                                 if (ret < 0)
5508                                         goto out;
5509                                 advance_left = ADVANCE;
5510                                 advance_right = ADVANCE;
5511                         }
5512                 } else if (left_level == right_level) {
5513                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5514                         if (cmp < 0) {
5515                                 advance_left = ADVANCE;
5516                         } else if (cmp > 0) {
5517                                 advance_right = ADVANCE;
5518                         } else {
5519                                 left_blockptr = btrfs_node_blockptr(
5520                                                 left_path->nodes[left_level],
5521                                                 left_path->slots[left_level]);
5522                                 right_blockptr = btrfs_node_blockptr(
5523                                                 right_path->nodes[right_level],
5524                                                 right_path->slots[right_level]);
5525                                 left_gen = btrfs_node_ptr_generation(
5526                                                 left_path->nodes[left_level],
5527                                                 left_path->slots[left_level]);
5528                                 right_gen = btrfs_node_ptr_generation(
5529                                                 right_path->nodes[right_level],
5530                                                 right_path->slots[right_level]);
5531                                 if (left_blockptr == right_blockptr &&
5532                                     left_gen == right_gen) {
5533                                         /*
5534                                          * As we're on a shared block, don't
5535                                          * allow to go deeper.
5536                                          */
5537                                         advance_left = ADVANCE_ONLY_NEXT;
5538                                         advance_right = ADVANCE_ONLY_NEXT;
5539                                 } else {
5540                                         advance_left = ADVANCE;
5541                                         advance_right = ADVANCE;
5542                                 }
5543                         }
5544                 } else if (left_level < right_level) {
5545                         advance_right = ADVANCE;
5546                 } else {
5547                         advance_left = ADVANCE;
5548                 }
5549         }
5550
5551 out:
5552         btrfs_free_path(left_path);
5553         btrfs_free_path(right_path);
5554         kfree(tmp_buf);
5555         return ret;
5556 }
5557
5558 /*
5559  * this is similar to btrfs_next_leaf, but does not try to preserve
5560  * and fixup the path.  It looks for and returns the next key in the
5561  * tree based on the current path and the min_trans parameters.
5562  *
5563  * 0 is returned if another key is found, < 0 if there are any errors
5564  * and 1 is returned if there are no higher keys in the tree
5565  *
5566  * path->keep_locks should be set to 1 on the search made before
5567  * calling this function.
5568  */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_key *key, int level, u64 min_trans)
{
        int slot;
        struct extent_buffer *c;

        /* The caller must hold locks all the way up (see header comment). */
        WARN_ON(!path->keep_locks);
        while (level < BTRFS_MAX_LEVEL) {
                if (!path->nodes[level])
                        return 1;

                /* Start from the slot just past the current position. */
                slot = path->slots[level] + 1;
                c = path->nodes[level];
next:
                if (slot >= btrfs_header_nritems(c)) {
                        int ret;
                        int orig_lowest;
                        struct btrfs_key cur_key;
                        /* No parent to step through: we are past the last key. */
                        if (level + 1 >= BTRFS_MAX_LEVEL ||
                            !path->nodes[level + 1])
                                return 1;

                        /*
                         * If the parent is locked we can safely advance in it,
                         * so retry one level up.
                         */
                        if (path->locks[level + 1]) {
                                level++;
                                continue;
                        }

                        /*
                         * Parent is unlocked; the path may be stale.  Remember
                         * the last key of this node and re-search down to this
                         * level to get a fresh, locked path.
                         */
                        slot = btrfs_header_nritems(c) - 1;
                        if (level == 0)
                                btrfs_item_key_to_cpu(c, &cur_key, slot);
                        else
                                btrfs_node_key_to_cpu(c, &cur_key, slot);

                        orig_lowest = path->lowest_level;
                        btrfs_release_path(path);
                        path->lowest_level = level;
                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
                                                0, 0);
                        path->lowest_level = orig_lowest;
                        if (ret < 0)
                                return ret;

                        c = path->nodes[level];
                        slot = path->slots[level];
                        /* Exact match: the next key lives one slot further. */
                        if (ret == 0)
                                slot++;
                        goto next;
                }

                if (level == 0)
                        btrfs_item_key_to_cpu(c, key, slot);
                else {
                        u64 gen = btrfs_node_ptr_generation(c, slot);

                        /*
                         * Skip pointers whose generation is older than
                         * min_trans; only newer subtrees are of interest.
                         */
                        if (gen < min_trans) {
                                slot++;
                                goto next;
                        }
                        btrfs_node_key_to_cpu(c, key, slot);
                }
                return 0;
        }
        return 1;
}
5633
5634 /*
5635  * search the tree again to find a leaf with greater keys
5636  * returns 0 if it found something or 1 if there are no greater leaves.
5637  * returns < 0 on io errors.
5638  */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
        /*
         * A time_seq of 0 means "operate on the current tree" rather than
         * replaying an old tree state from the tree mod log.
         */
        return btrfs_next_old_leaf(root, path, 0);
}
5643
/*
 * Like btrfs_next_leaf(), but can walk an old version of the tree when
 * time_seq is non-zero, using the tree mod log (btrfs_search_old_slot).
 *
 * Returns 0 if the path now points at the next leaf, 1 if there is no next
 * leaf, and < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq)
{
        int slot;
        int level;
        struct extent_buffer *c;
        struct extent_buffer *next;
        struct btrfs_key key;
        u32 nritems;
        int ret;
        int old_spinning = path->leave_spinning;
        int next_rw_lock = 0;

        nritems = btrfs_header_nritems(path->nodes[0]);
        if (nritems == 0)
                return 1;

        /* Remember the last key of the current leaf so we can re-search. */
        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
        level = 1;
        next = NULL;
        next_rw_lock = 0;
        btrfs_release_path(path);

        path->keep_locks = 1;
        path->leave_spinning = 1;

        if (time_seq)
                ret = btrfs_search_old_slot(root, &key, path, time_seq);
        else
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        path->keep_locks = 0;

        if (ret < 0)
                return ret;

        nritems = btrfs_header_nritems(path->nodes[0]);
        /*
         * by releasing the path above we dropped all our locks.  A balance
         * could have added more items next to the key that used to be
         * at the very end of the block.  So, check again here and
         * advance the path if there are now more items available.
         */
        if (nritems > 0 && path->slots[0] < nritems - 1) {
                if (ret == 0)
                        path->slots[0]++;
                ret = 0;
                goto done;
        }
        /*
         * So the above check misses one case:
         * - after releasing the path above, someone has removed the item that
         *   used to be at the very end of the block, and balance between leafs
         *   gets another one with bigger key.offset to replace it.
         *
         * This one should be returned as well, or we can get leaf corruption
         * later(esp. in __btrfs_drop_extents()).
         *
         * And a bit more explanation about this check,
         * with ret > 0, the key isn't found, the path points to the slot
         * where it should be inserted, so the path->slots[0] item must be the
         * bigger one.
         */
        if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
                ret = 0;
                goto done;
        }

        /*
         * Climb up until we find a node with an unvisited slot to the right;
         * that slot points into the subtree containing the next leaf.
         */
        while (level < BTRFS_MAX_LEVEL) {
                if (!path->nodes[level]) {
                        ret = 1;
                        goto done;
                }

                slot = path->slots[level] + 1;
                c = path->nodes[level];
                if (slot >= btrfs_header_nritems(c)) {
                        level++;
                        if (level == BTRFS_MAX_LEVEL) {
                                ret = 1;
                                goto done;
                        }
                        continue;
                }

                /* Drop the buffer picked up by a previous loop iteration. */
                if (next) {
                        btrfs_tree_unlock_rw(next, next_rw_lock);
                        free_extent_buffer(next);
                }

                next = c;
                next_rw_lock = path->locks[level];
                ret = read_block_for_search(NULL, root, path, &next, level,
                                            slot, &key, 0);
                if (ret == -EAGAIN)
                        goto again;

                if (ret < 0) {
                        btrfs_release_path(path);
                        goto done;
                }

                if (!path->skip_locking) {
                        ret = btrfs_try_tree_read_lock(next);
                        if (!ret && time_seq) {
                                /*
                                 * If we don't get the lock, we may be racing
                                 * with push_leaf_left, holding that lock while
                                 * itself waiting for the leaf we've currently
                                 * locked. To solve this situation, we give up
                                 * on our lock and cycle.
                                 */
                                free_extent_buffer(next);
                                btrfs_release_path(path);
                                cond_resched();
                                goto again;
                        }
                        if (!ret) {
                                /* Try-lock failed: take a blocking read lock. */
                                btrfs_set_path_blocking(path);
                                btrfs_tree_read_lock(next);
                                btrfs_clear_path_blocking(path, next,
                                                          BTRFS_READ_LOCK);
                        }
                        next_rw_lock = BTRFS_READ_LOCK;
                }
                break;
        }
        path->slots[level] = slot;
        /*
         * Walk back down the left spine of the subtree we just entered,
         * installing each child into the path until we reach the leaf.
         */
        while (1) {
                level--;
                c = path->nodes[level];
                if (path->locks[level])
                        btrfs_tree_unlock_rw(c, path->locks[level]);

                free_extent_buffer(c);
                path->nodes[level] = next;
                path->slots[level] = 0;
                if (!path->skip_locking)
                        path->locks[level] = next_rw_lock;
                if (!level)
                        break;

                ret = read_block_for_search(NULL, root, path, &next, level,
                                            0, &key, 0);
                if (ret == -EAGAIN)
                        goto again;

                if (ret < 0) {
                        btrfs_release_path(path);
                        goto done;
                }

                if (!path->skip_locking) {
                        ret = btrfs_try_tree_read_lock(next);
                        if (!ret) {
                                btrfs_set_path_blocking(path);
                                btrfs_tree_read_lock(next);
                                btrfs_clear_path_blocking(path, next,
                                                          BTRFS_READ_LOCK);
                        }
                        next_rw_lock = BTRFS_READ_LOCK;
                }
        }
        ret = 0;
done:
        /* Drop all locks above the leaf; the caller only needs level 0. */
        unlock_up(path, 0, 1, 0, NULL);
        path->leave_spinning = old_spinning;
        if (!old_spinning)
                btrfs_set_path_blocking(path);

        return ret;
}
5816
5817 /*
5818  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5819  * searching until it gets past min_objectid or finds an item of 'type'
5820  *
5821  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5822  */
5823 int btrfs_previous_item(struct btrfs_root *root,
5824                         struct btrfs_path *path, u64 min_objectid,
5825                         int type)
5826 {
5827         struct btrfs_key found_key;
5828         struct extent_buffer *leaf;
5829         u32 nritems;
5830         int ret;
5831
5832         while (1) {
5833                 if (path->slots[0] == 0) {
5834                         btrfs_set_path_blocking(path);
5835                         ret = btrfs_prev_leaf(root, path);
5836                         if (ret != 0)
5837                                 return ret;
5838                 } else {
5839                         path->slots[0]--;
5840                 }
5841                 leaf = path->nodes[0];
5842                 nritems = btrfs_header_nritems(leaf);
5843                 if (nritems == 0)
5844                         return 1;
5845                 if (path->slots[0] == nritems)
5846                         path->slots[0]--;
5847
5848                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5849                 if (found_key.objectid < min_objectid)
5850                         break;
5851                 if (found_key.type == type)
5852                         return 0;
5853                 if (found_key.objectid == min_objectid &&
5854                     found_key.type < type)
5855                         break;
5856         }
5857         return 1;
5858 }
5859
5860 /*
5861  * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
5863  *
5864  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5865  */
5866 int btrfs_previous_extent_item(struct btrfs_root *root,
5867                         struct btrfs_path *path, u64 min_objectid)
5868 {
5869         struct btrfs_key found_key;
5870         struct extent_buffer *leaf;
5871         u32 nritems;
5872         int ret;
5873
5874         while (1) {
5875                 if (path->slots[0] == 0) {
5876                         btrfs_set_path_blocking(path);
5877                         ret = btrfs_prev_leaf(root, path);
5878                         if (ret != 0)
5879                                 return ret;
5880                 } else {
5881                         path->slots[0]--;
5882                 }
5883                 leaf = path->nodes[0];
5884                 nritems = btrfs_header_nritems(leaf);
5885                 if (nritems == 0)
5886                         return 1;
5887                 if (path->slots[0] == nritems)
5888                         path->slots[0]--;
5889
5890                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5891                 if (found_key.objectid < min_objectid)
5892                         break;
5893                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5894                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5895                         return 0;
5896                 if (found_key.objectid == min_objectid &&
5897                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5898                         break;
5899         }
5900         return 1;
5901 }