]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - fs/btrfs/ctree.c
Merge branch 'alpha' (alpha architecture patches)
[karo-tx-linux.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31                       *root, struct btrfs_key *ins_key,
32                       struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34                           struct btrfs_root *root, struct extent_buffer *dst,
35                           struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37                               struct btrfs_root *root,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41                     struct btrfs_path *path, int level, int slot,
42                     int tree_mod_log);
43 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44                                  struct extent_buffer *eb);
45 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
46                                           u32 blocksize, u64 parent_transid,
47                                           u64 time_seq);
48 struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
49                                                 u64 bytenr, u32 blocksize,
50                                                 u64 time_seq);
51
52 struct btrfs_path *btrfs_alloc_path(void)
53 {
54         struct btrfs_path *path;
55         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
56         return path;
57 }
58
59 /*
60  * set all locked nodes in the path to blocking locks.  This should
61  * be done before scheduling
62  */
63 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
64 {
65         int i;
66         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
67                 if (!p->nodes[i] || !p->locks[i])
68                         continue;
69                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
70                 if (p->locks[i] == BTRFS_READ_LOCK)
71                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
72                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
73                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
74         }
75 }
76
77 /*
78  * reset all the locked nodes in the patch to spinning locks.
79  *
80  * held is used to keep lockdep happy, when lockdep is enabled
81  * we set held to a blocking lock before we go around and
82  * retake all the spinlocks in the path.  You can safely use NULL
83  * for held
84  */
85 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
86                                         struct extent_buffer *held, int held_rw)
87 {
88         int i;
89
90 #ifdef CONFIG_DEBUG_LOCK_ALLOC
91         /* lockdep really cares that we take all of these spinlocks
92          * in the right order.  If any of the locks in the path are not
93          * currently blocking, it is going to complain.  So, make really
94          * really sure by forcing the path to blocking before we clear
95          * the path blocking.
96          */
97         if (held) {
98                 btrfs_set_lock_blocking_rw(held, held_rw);
99                 if (held_rw == BTRFS_WRITE_LOCK)
100                         held_rw = BTRFS_WRITE_LOCK_BLOCKING;
101                 else if (held_rw == BTRFS_READ_LOCK)
102                         held_rw = BTRFS_READ_LOCK_BLOCKING;
103         }
104         btrfs_set_path_blocking(p);
105 #endif
106
107         for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
108                 if (p->nodes[i] && p->locks[i]) {
109                         btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
110                         if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
111                                 p->locks[i] = BTRFS_WRITE_LOCK;
112                         else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
113                                 p->locks[i] = BTRFS_READ_LOCK;
114                 }
115         }
116
117 #ifdef CONFIG_DEBUG_LOCK_ALLOC
118         if (held)
119                 btrfs_clear_lock_blocking_rw(held, held_rw);
120 #endif
121 }
122
123 /* this also releases the path */
124 void btrfs_free_path(struct btrfs_path *p)
125 {
126         if (!p)
127                 return;
128         btrfs_release_path(p);
129         kmem_cache_free(btrfs_path_cachep, p);
130 }
131
132 /*
133  * path release drops references on the extent buffers in the path
134  * and it drops any locks held by this path
135  *
136  * It is safe to call this on paths that no locks or extent buffers held.
137  */
138 noinline void btrfs_release_path(struct btrfs_path *p)
139 {
140         int i;
141
142         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
143                 p->slots[i] = 0;
144                 if (!p->nodes[i])
145                         continue;
146                 if (p->locks[i]) {
147                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
148                         p->locks[i] = 0;
149                 }
150                 free_extent_buffer(p->nodes[i]);
151                 p->nodes[i] = NULL;
152         }
153 }
154
155 /*
156  * safely gets a reference on the root node of a tree.  A lock
157  * is not taken, so a concurrent writer may put a different node
158  * at the root of the tree.  See btrfs_lock_root_node for the
159  * looping required.
160  *
161  * The extent buffer returned by this has a reference taken, so
162  * it won't disappear.  It may stop being the root of the tree
163  * at any time because there are no locks held.
164  */
165 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
166 {
167         struct extent_buffer *eb;
168
169         while (1) {
170                 rcu_read_lock();
171                 eb = rcu_dereference(root->node);
172
173                 /*
174                  * RCU really hurts here, we could free up the root node because
175                  * it was cow'ed but we may not get the new root node yet so do
176                  * the inc_not_zero dance and if it doesn't work then
177                  * synchronize_rcu and try again.
178                  */
179                 if (atomic_inc_not_zero(&eb->refs)) {
180                         rcu_read_unlock();
181                         break;
182                 }
183                 rcu_read_unlock();
184                 synchronize_rcu();
185         }
186         return eb;
187 }
188
189 /* loop around taking references on and locking the root node of the
190  * tree until you end up with a lock on the root.  A locked buffer
191  * is returned, with a reference held.
192  */
193 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
194 {
195         struct extent_buffer *eb;
196
197         while (1) {
198                 eb = btrfs_root_node(root);
199                 btrfs_tree_lock(eb);
200                 if (eb == root->node)
201                         break;
202                 btrfs_tree_unlock(eb);
203                 free_extent_buffer(eb);
204         }
205         return eb;
206 }
207
208 /* loop around taking references on and locking the root node of the
209  * tree until you end up with a lock on the root.  A locked buffer
210  * is returned, with a reference held.
211  */
212 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
213 {
214         struct extent_buffer *eb;
215
216         while (1) {
217                 eb = btrfs_root_node(root);
218                 btrfs_tree_read_lock(eb);
219                 if (eb == root->node)
220                         break;
221                 btrfs_tree_read_unlock(eb);
222                 free_extent_buffer(eb);
223         }
224         return eb;
225 }
226
227 /* cowonly root (everything not a reference counted cow subvolume), just get
228  * put onto a simple dirty list.  transaction.c walks this to make sure they
229  * get properly updated on disk.
230  */
231 static void add_root_to_dirty_list(struct btrfs_root *root)
232 {
233         spin_lock(&root->fs_info->trans_lock);
234         if (root->track_dirty && list_empty(&root->dirty_list)) {
235                 list_add(&root->dirty_list,
236                          &root->fs_info->dirty_cowonly_roots);
237         }
238         spin_unlock(&root->fs_info->trans_lock);
239 }
240
241 /*
242  * used by snapshot creation to make a copy of a root for a tree with
243  * a given objectid.  The buffer with the new root node is returned in
244  * cow_ret, and this func returns zero on success or a negative error code.
245  */
246 int btrfs_copy_root(struct btrfs_trans_handle *trans,
247                       struct btrfs_root *root,
248                       struct extent_buffer *buf,
249                       struct extent_buffer **cow_ret, u64 new_root_objectid)
250 {
251         struct extent_buffer *cow;
252         int ret = 0;
253         int level;
254         struct btrfs_disk_key disk_key;
255
256         WARN_ON(root->ref_cows && trans->transid !=
257                 root->fs_info->running_transaction->transid);
258         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
259
260         level = btrfs_header_level(buf);
261         if (level == 0)
262                 btrfs_item_key(buf, &disk_key, 0);
263         else
264                 btrfs_node_key(buf, &disk_key, 0);
265
266         cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
267                                      new_root_objectid, &disk_key, level,
268                                      buf->start, 0);
269         if (IS_ERR(cow))
270                 return PTR_ERR(cow);
271
272         copy_extent_buffer(cow, buf, 0, 0, cow->len);
273         btrfs_set_header_bytenr(cow, cow->start);
274         btrfs_set_header_generation(cow, trans->transid);
275         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
276         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
277                                      BTRFS_HEADER_FLAG_RELOC);
278         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
279                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
280         else
281                 btrfs_set_header_owner(cow, new_root_objectid);
282
283         write_extent_buffer(cow, root->fs_info->fsid,
284                             (unsigned long)btrfs_header_fsid(cow),
285                             BTRFS_FSID_SIZE);
286
287         WARN_ON(btrfs_header_generation(buf) > trans->transid);
288         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
289                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
290         else
291                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
292
293         if (ret)
294                 return ret;
295
296         btrfs_mark_buffer_dirty(cow);
297         *cow_ret = cow;
298         return 0;
299 }
300
301 enum mod_log_op {
302         MOD_LOG_KEY_REPLACE,
303         MOD_LOG_KEY_ADD,
304         MOD_LOG_KEY_REMOVE,
305         MOD_LOG_KEY_REMOVE_WHILE_FREEING,
306         MOD_LOG_KEY_REMOVE_WHILE_MOVING,
307         MOD_LOG_MOVE_KEYS,
308         MOD_LOG_ROOT_REPLACE,
309 };
310
311 struct tree_mod_move {
312         int dst_slot;
313         int nr_items;
314 };
315
316 struct tree_mod_root {
317         u64 logical;
318         u8 level;
319 };
320
321 struct tree_mod_elem {
322         struct rb_node node;
323         u64 index;              /* shifted logical */
324         u64 seq;
325         enum mod_log_op op;
326
327         /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
328         int slot;
329
330         /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
331         u64 generation;
332
333         /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
334         struct btrfs_disk_key key;
335         u64 blockptr;
336
337         /* this is used for op == MOD_LOG_MOVE_KEYS */
338         struct tree_mod_move move;
339
340         /* this is used for op == MOD_LOG_ROOT_REPLACE */
341         struct tree_mod_root old_root;
342 };
343
344 static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
345 {
346         read_lock(&fs_info->tree_mod_log_lock);
347 }
348
349 static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
350 {
351         read_unlock(&fs_info->tree_mod_log_lock);
352 }
353
354 static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
355 {
356         write_lock(&fs_info->tree_mod_log_lock);
357 }
358
359 static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
360 {
361         write_unlock(&fs_info->tree_mod_log_lock);
362 }
363
364 /*
365  * This adds a new blocker to the tree mod log's blocker list if the @elem
366  * passed does not already have a sequence number set. So when a caller expects
367  * to record tree modifications, it should ensure to set elem->seq to zero
368  * before calling btrfs_get_tree_mod_seq.
369  * Returns a fresh, unused tree log modification sequence number, even if no new
370  * blocker was added.
371  */
372 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
373                            struct seq_list *elem)
374 {
375         u64 seq;
376
377         tree_mod_log_write_lock(fs_info);
378         spin_lock(&fs_info->tree_mod_seq_lock);
379         if (!elem->seq) {
380                 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
381                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
382         }
383         seq = btrfs_inc_tree_mod_seq(fs_info);
384         spin_unlock(&fs_info->tree_mod_seq_lock);
385         tree_mod_log_write_unlock(fs_info);
386
387         return seq;
388 }
389
390 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
391                             struct seq_list *elem)
392 {
393         struct rb_root *tm_root;
394         struct rb_node *node;
395         struct rb_node *next;
396         struct seq_list *cur_elem;
397         struct tree_mod_elem *tm;
398         u64 min_seq = (u64)-1;
399         u64 seq_putting = elem->seq;
400
401         if (!seq_putting)
402                 return;
403
404         spin_lock(&fs_info->tree_mod_seq_lock);
405         list_del(&elem->list);
406         elem->seq = 0;
407
408         list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
409                 if (cur_elem->seq < min_seq) {
410                         if (seq_putting > cur_elem->seq) {
411                                 /*
412                                  * blocker with lower sequence number exists, we
413                                  * cannot remove anything from the log
414                                  */
415                                 spin_unlock(&fs_info->tree_mod_seq_lock);
416                                 return;
417                         }
418                         min_seq = cur_elem->seq;
419                 }
420         }
421         spin_unlock(&fs_info->tree_mod_seq_lock);
422
423         /*
424          * we removed the lowest blocker from the blocker list, so there may be
425          * more processible delayed refs.
426          */
427         wake_up(&fs_info->tree_mod_seq_wait);
428
429         /*
430          * anything that's lower than the lowest existing (read: blocked)
431          * sequence number can be removed from the tree.
432          */
433         tree_mod_log_write_lock(fs_info);
434         tm_root = &fs_info->tree_mod_log;
435         for (node = rb_first(tm_root); node; node = next) {
436                 next = rb_next(node);
437                 tm = container_of(node, struct tree_mod_elem, node);
438                 if (tm->seq > min_seq)
439                         continue;
440                 rb_erase(node, tm_root);
441                 kfree(tm);
442         }
443         tree_mod_log_write_unlock(fs_info);
444 }
445
446 /*
447  * key order of the log:
448  *       index -> sequence
449  *
450  * the index is the shifted logical of the *new* root node for root replace
451  * operations, or the shifted logical of the affected block for all other
452  * operations.
453  */
454 static noinline int
455 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
456 {
457         struct rb_root *tm_root;
458         struct rb_node **new;
459         struct rb_node *parent = NULL;
460         struct tree_mod_elem *cur;
461
462         BUG_ON(!tm || !tm->seq);
463
464         tm_root = &fs_info->tree_mod_log;
465         new = &tm_root->rb_node;
466         while (*new) {
467                 cur = container_of(*new, struct tree_mod_elem, node);
468                 parent = *new;
469                 if (cur->index < tm->index)
470                         new = &((*new)->rb_left);
471                 else if (cur->index > tm->index)
472                         new = &((*new)->rb_right);
473                 else if (cur->seq < tm->seq)
474                         new = &((*new)->rb_left);
475                 else if (cur->seq > tm->seq)
476                         new = &((*new)->rb_right);
477                 else {
478                         kfree(tm);
479                         return -EEXIST;
480                 }
481         }
482
483         rb_link_node(&tm->node, parent, new);
484         rb_insert_color(&tm->node, tm_root);
485         return 0;
486 }
487
488 /*
489  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
490  * returns zero with the tree_mod_log_lock acquired. The caller must hold
491  * this until all tree mod log insertions are recorded in the rb tree and then
492  * call tree_mod_log_write_unlock() to release.
493  */
494 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
495                                     struct extent_buffer *eb) {
496         smp_mb();
497         if (list_empty(&(fs_info)->tree_mod_seq_list))
498                 return 1;
499         if (eb && btrfs_header_level(eb) == 0)
500                 return 1;
501
502         tree_mod_log_write_lock(fs_info);
503         if (list_empty(&fs_info->tree_mod_seq_list)) {
504                 /*
505                  * someone emptied the list while we were waiting for the lock.
506                  * we must not add to the list when no blocker exists.
507                  */
508                 tree_mod_log_write_unlock(fs_info);
509                 return 1;
510         }
511
512         return 0;
513 }
514
515 /*
516  * This allocates memory and gets a tree modification sequence number.
517  *
518  * Returns <0 on error.
519  * Returns >0 (the added sequence number) on success.
520  */
521 static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
522                                  struct tree_mod_elem **tm_ret)
523 {
524         struct tree_mod_elem *tm;
525
526         /*
527          * once we switch from spin locks to something different, we should
528          * honor the flags parameter here.
529          */
530         tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
531         if (!tm)
532                 return -ENOMEM;
533
534         tm->seq = btrfs_inc_tree_mod_seq(fs_info);
535         return tm->seq;
536 }
537
538 static inline int
539 __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
540                           struct extent_buffer *eb, int slot,
541                           enum mod_log_op op, gfp_t flags)
542 {
543         int ret;
544         struct tree_mod_elem *tm;
545
546         ret = tree_mod_alloc(fs_info, flags, &tm);
547         if (ret < 0)
548                 return ret;
549
550         tm->index = eb->start >> PAGE_CACHE_SHIFT;
551         if (op != MOD_LOG_KEY_ADD) {
552                 btrfs_node_key(eb, &tm->key, slot);
553                 tm->blockptr = btrfs_node_blockptr(eb, slot);
554         }
555         tm->op = op;
556         tm->slot = slot;
557         tm->generation = btrfs_node_ptr_generation(eb, slot);
558
559         return __tree_mod_log_insert(fs_info, tm);
560 }
561
562 static noinline int
563 tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
564                              struct extent_buffer *eb, int slot,
565                              enum mod_log_op op, gfp_t flags)
566 {
567         int ret;
568
569         if (tree_mod_dont_log(fs_info, eb))
570                 return 0;
571
572         ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
573
574         tree_mod_log_write_unlock(fs_info);
575         return ret;
576 }
577
578 static noinline int
579 tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
580                         int slot, enum mod_log_op op)
581 {
582         return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
583 }
584
585 static noinline int
586 tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
587                              struct extent_buffer *eb, int slot,
588                              enum mod_log_op op)
589 {
590         return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
591 }
592
593 static noinline int
594 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
595                          struct extent_buffer *eb, int dst_slot, int src_slot,
596                          int nr_items, gfp_t flags)
597 {
598         struct tree_mod_elem *tm;
599         int ret;
600         int i;
601
602         if (tree_mod_dont_log(fs_info, eb))
603                 return 0;
604
605         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
606                 ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
607                                               MOD_LOG_KEY_REMOVE_WHILE_MOVING);
608                 BUG_ON(ret < 0);
609         }
610
611         ret = tree_mod_alloc(fs_info, flags, &tm);
612         if (ret < 0)
613                 goto out;
614
615         tm->index = eb->start >> PAGE_CACHE_SHIFT;
616         tm->slot = src_slot;
617         tm->move.dst_slot = dst_slot;
618         tm->move.nr_items = nr_items;
619         tm->op = MOD_LOG_MOVE_KEYS;
620
621         ret = __tree_mod_log_insert(fs_info, tm);
622 out:
623         tree_mod_log_write_unlock(fs_info);
624         return ret;
625 }
626
627 static inline void
628 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
629 {
630         int i;
631         u32 nritems;
632         int ret;
633
634         nritems = btrfs_header_nritems(eb);
635         for (i = nritems - 1; i >= 0; i--) {
636                 ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
637                                               MOD_LOG_KEY_REMOVE_WHILE_FREEING);
638                 BUG_ON(ret < 0);
639         }
640 }
641
642 static noinline int
643 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
644                          struct extent_buffer *old_root,
645                          struct extent_buffer *new_root, gfp_t flags)
646 {
647         struct tree_mod_elem *tm;
648         int ret;
649
650         if (tree_mod_dont_log(fs_info, NULL))
651                 return 0;
652
653         __tree_mod_log_free_eb(fs_info, old_root);
654
655         ret = tree_mod_alloc(fs_info, flags, &tm);
656         if (ret < 0)
657                 goto out;
658
659         tm->index = new_root->start >> PAGE_CACHE_SHIFT;
660         tm->old_root.logical = old_root->start;
661         tm->old_root.level = btrfs_header_level(old_root);
662         tm->generation = btrfs_header_generation(old_root);
663         tm->op = MOD_LOG_ROOT_REPLACE;
664
665         ret = __tree_mod_log_insert(fs_info, tm);
666 out:
667         tree_mod_log_write_unlock(fs_info);
668         return ret;
669 }
670
671 static struct tree_mod_elem *
672 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
673                       int smallest)
674 {
675         struct rb_root *tm_root;
676         struct rb_node *node;
677         struct tree_mod_elem *cur = NULL;
678         struct tree_mod_elem *found = NULL;
679         u64 index = start >> PAGE_CACHE_SHIFT;
680
681         tree_mod_log_read_lock(fs_info);
682         tm_root = &fs_info->tree_mod_log;
683         node = tm_root->rb_node;
684         while (node) {
685                 cur = container_of(node, struct tree_mod_elem, node);
686                 if (cur->index < index) {
687                         node = node->rb_left;
688                 } else if (cur->index > index) {
689                         node = node->rb_right;
690                 } else if (cur->seq < min_seq) {
691                         node = node->rb_left;
692                 } else if (!smallest) {
693                         /* we want the node with the highest seq */
694                         if (found)
695                                 BUG_ON(found->seq > cur->seq);
696                         found = cur;
697                         node = node->rb_left;
698                 } else if (cur->seq > min_seq) {
699                         /* we want the node with the smallest seq */
700                         if (found)
701                                 BUG_ON(found->seq < cur->seq);
702                         found = cur;
703                         node = node->rb_right;
704                 } else {
705                         found = cur;
706                         break;
707                 }
708         }
709         tree_mod_log_read_unlock(fs_info);
710
711         return found;
712 }
713
714 /*
715  * this returns the element from the log with the smallest time sequence
716  * value that's in the log (the oldest log item). any element with a time
717  * sequence lower than min_seq will be ignored.
718  */
719 static struct tree_mod_elem *
720 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
721                            u64 min_seq)
722 {
723         return __tree_mod_log_search(fs_info, start, min_seq, 1);
724 }
725
726 /*
727  * this returns the element from the log with the largest time sequence
728  * value that's in the log (the most recent log item). any element with
729  * a time sequence lower than min_seq will be ignored.
730  */
731 static struct tree_mod_elem *
732 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
733 {
734         return __tree_mod_log_search(fs_info, start, min_seq, 0);
735 }
736
737 static noinline void
738 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
739                      struct extent_buffer *src, unsigned long dst_offset,
740                      unsigned long src_offset, int nr_items)
741 {
742         int ret;
743         int i;
744
745         if (tree_mod_dont_log(fs_info, NULL))
746                 return;
747
748         if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
749                 tree_mod_log_write_unlock(fs_info);
750                 return;
751         }
752
753         for (i = 0; i < nr_items; i++) {
754                 ret = tree_mod_log_insert_key_locked(fs_info, src,
755                                                      i + src_offset,
756                                                      MOD_LOG_KEY_REMOVE);
757                 BUG_ON(ret < 0);
758                 ret = tree_mod_log_insert_key_locked(fs_info, dst,
759                                                      i + dst_offset,
760                                                      MOD_LOG_KEY_ADD);
761                 BUG_ON(ret < 0);
762         }
763
764         tree_mod_log_write_unlock(fs_info);
765 }
766
767 static inline void
768 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
769                      int dst_offset, int src_offset, int nr_items)
770 {
771         int ret;
772         ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
773                                        nr_items, GFP_NOFS);
774         BUG_ON(ret < 0);
775 }
776
777 static noinline void
778 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
779                           struct extent_buffer *eb,
780                           struct btrfs_disk_key *disk_key, int slot, int atomic)
781 {
782         int ret;
783
784         ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
785                                            MOD_LOG_KEY_REPLACE,
786                                            atomic ? GFP_ATOMIC : GFP_NOFS);
787         BUG_ON(ret < 0);
788 }
789
790 static noinline void
791 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
792 {
793         if (tree_mod_dont_log(fs_info, eb))
794                 return;
795
796         __tree_mod_log_free_eb(fs_info, eb);
797
798         tree_mod_log_write_unlock(fs_info);
799 }
800
801 static noinline void
802 tree_mod_log_set_root_pointer(struct btrfs_root *root,
803                               struct extent_buffer *new_root_node)
804 {
805         int ret;
806         ret = tree_mod_log_insert_root(root->fs_info, root->node,
807                                        new_root_node, GFP_NOFS);
808         BUG_ON(ret < 0);
809 }
810
811 /*
812  * check if the tree block can be shared by multiple trees
813  */
814 int btrfs_block_can_be_shared(struct btrfs_root *root,
815                               struct extent_buffer *buf)
816 {
817         /*
818          * Tree blocks not in refernece counted trees and tree roots
819          * are never shared. If a block was allocated after the last
820          * snapshot and the block was not allocated by tree relocation,
821          * we know the block is not shared.
822          */
823         if (root->ref_cows &&
824             buf != root->node && buf != root->commit_root &&
825             (btrfs_header_generation(buf) <=
826              btrfs_root_last_snapshot(&root->root_item) ||
827              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
828                 return 1;
829 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
830         if (root->ref_cows &&
831             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
832                 return 1;
833 #endif
834         return 0;
835 }
836
837 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
838                                        struct btrfs_root *root,
839                                        struct extent_buffer *buf,
840                                        struct extent_buffer *cow,
841                                        int *last_ref)
842 {
843         u64 refs;
844         u64 owner;
845         u64 flags;
846         u64 new_flags = 0;
847         int ret;
848
849         /*
850          * Backrefs update rules:
851          *
852          * Always use full backrefs for extent pointers in tree block
853          * allocated by tree relocation.
854          *
855          * If a shared tree block is no longer referenced by its owner
856          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
857          * use full backrefs for extent pointers in tree block.
858          *
859          * If a tree block is been relocating
860          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
861          * use full backrefs for extent pointers in tree block.
862          * The reason for this is some operations (such as drop tree)
863          * are only allowed for blocks use full backrefs.
864          */
865
866         if (btrfs_block_can_be_shared(root, buf)) {
867                 ret = btrfs_lookup_extent_info(trans, root, buf->start,
868                                                buf->len, &refs, &flags);
869                 if (ret)
870                         return ret;
871                 if (refs == 0) {
872                         ret = -EROFS;
873                         btrfs_std_error(root->fs_info, ret);
874                         return ret;
875                 }
876         } else {
877                 refs = 1;
878                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
879                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
880                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
881                 else
882                         flags = 0;
883         }
884
885         owner = btrfs_header_owner(buf);
886         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
887                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
888
889         if (refs > 1) {
890                 if ((owner == root->root_key.objectid ||
891                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
892                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
893                         ret = btrfs_inc_ref(trans, root, buf, 1, 1);
894                         BUG_ON(ret); /* -ENOMEM */
895
896                         if (root->root_key.objectid ==
897                             BTRFS_TREE_RELOC_OBJECTID) {
898                                 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
899                                 BUG_ON(ret); /* -ENOMEM */
900                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
901                                 BUG_ON(ret); /* -ENOMEM */
902                         }
903                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
904                 } else {
905
906                         if (root->root_key.objectid ==
907                             BTRFS_TREE_RELOC_OBJECTID)
908                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
909                         else
910                                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
911                         BUG_ON(ret); /* -ENOMEM */
912                 }
913                 if (new_flags != 0) {
914                         ret = btrfs_set_disk_extent_flags(trans, root,
915                                                           buf->start,
916                                                           buf->len,
917                                                           new_flags, 0);
918                         if (ret)
919                                 return ret;
920                 }
921         } else {
922                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
923                         if (root->root_key.objectid ==
924                             BTRFS_TREE_RELOC_OBJECTID)
925                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
926                         else
927                                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
928                         BUG_ON(ret); /* -ENOMEM */
929                         ret = btrfs_dec_ref(trans, root, buf, 1, 1);
930                         BUG_ON(ret); /* -ENOMEM */
931                 }
932                 /*
933                  * don't log freeing in case we're freeing the root node, this
934                  * is done by tree_mod_log_set_root_pointer later
935                  */
936                 if (buf != root->node && btrfs_header_level(buf) != 0)
937                         tree_mod_log_free_eb(root->fs_info, buf);
938                 clean_tree_block(trans, root, buf);
939                 *last_ref = 1;
940         }
941         return 0;
942 }
943
944 /*
945  * does the dirty work in cow of a single block.  The parent block (if
946  * supplied) is updated to point to the new cow copy.  The new buffer is marked
947  * dirty and returned locked.  If you modify the block it needs to be marked
948  * dirty again.
949  *
950  * search_start -- an allocation hint for the new block
951  *
952  * empty_size -- a hint that you plan on doing more cow.  This is the size in
953  * bytes the allocator should try to find free next to the block it returns.
954  * This is just a hint and may be ignored by the allocator.
955  */
956 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
957                              struct btrfs_root *root,
958                              struct extent_buffer *buf,
959                              struct extent_buffer *parent, int parent_slot,
960                              struct extent_buffer **cow_ret,
961                              u64 search_start, u64 empty_size)
962 {
963         struct btrfs_disk_key disk_key;
964         struct extent_buffer *cow;
965         int level, ret;
966         int last_ref = 0;
967         int unlock_orig = 0;
968         u64 parent_start;
969
970         if (*cow_ret == buf)
971                 unlock_orig = 1;
972
973         btrfs_assert_tree_locked(buf);
974
975         WARN_ON(root->ref_cows && trans->transid !=
976                 root->fs_info->running_transaction->transid);
977         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
978
979         level = btrfs_header_level(buf);
980
981         if (level == 0)
982                 btrfs_item_key(buf, &disk_key, 0);
983         else
984                 btrfs_node_key(buf, &disk_key, 0);
985
986         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
987                 if (parent)
988                         parent_start = parent->start;
989                 else
990                         parent_start = 0;
991         } else
992                 parent_start = 0;
993
994         cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
995                                      root->root_key.objectid, &disk_key,
996                                      level, search_start, empty_size);
997         if (IS_ERR(cow))
998                 return PTR_ERR(cow);
999
1000         /* cow is set to blocking by btrfs_init_new_buffer */
1001
1002         copy_extent_buffer(cow, buf, 0, 0, cow->len);
1003         btrfs_set_header_bytenr(cow, cow->start);
1004         btrfs_set_header_generation(cow, trans->transid);
1005         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1006         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1007                                      BTRFS_HEADER_FLAG_RELOC);
1008         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1009                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1010         else
1011                 btrfs_set_header_owner(cow, root->root_key.objectid);
1012
1013         write_extent_buffer(cow, root->fs_info->fsid,
1014                             (unsigned long)btrfs_header_fsid(cow),
1015                             BTRFS_FSID_SIZE);
1016
1017         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1018         if (ret) {
1019                 btrfs_abort_transaction(trans, root, ret);
1020                 return ret;
1021         }
1022
1023         if (root->ref_cows)
1024                 btrfs_reloc_cow_block(trans, root, buf, cow);
1025
1026         if (buf == root->node) {
1027                 WARN_ON(parent && parent != buf);
1028                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1029                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1030                         parent_start = buf->start;
1031                 else
1032                         parent_start = 0;
1033
1034                 extent_buffer_get(cow);
1035                 tree_mod_log_set_root_pointer(root, cow);
1036                 rcu_assign_pointer(root->node, cow);
1037
1038                 btrfs_free_tree_block(trans, root, buf, parent_start,
1039                                       last_ref);
1040                 free_extent_buffer(buf);
1041                 add_root_to_dirty_list(root);
1042         } else {
1043                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1044                         parent_start = parent->start;
1045                 else
1046                         parent_start = 0;
1047
1048                 WARN_ON(trans->transid != btrfs_header_generation(parent));
1049                 tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1050                                         MOD_LOG_KEY_REPLACE);
1051                 btrfs_set_node_blockptr(parent, parent_slot,
1052                                         cow->start);
1053                 btrfs_set_node_ptr_generation(parent, parent_slot,
1054                                               trans->transid);
1055                 btrfs_mark_buffer_dirty(parent);
1056                 btrfs_free_tree_block(trans, root, buf, parent_start,
1057                                       last_ref);
1058         }
1059         if (unlock_orig)
1060                 btrfs_tree_unlock(buf);
1061         free_extent_buffer_stale(buf);
1062         btrfs_mark_buffer_dirty(cow);
1063         *cow_ret = cow;
1064         return 0;
1065 }
1066
1067 /*
1068  * returns the logical address of the oldest predecessor of the given root.
1069  * entries older than time_seq are ignored.
1070  */
1071 static struct tree_mod_elem *
1072 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1073                            struct btrfs_root *root, u64 time_seq)
1074 {
1075         struct tree_mod_elem *tm;
1076         struct tree_mod_elem *found = NULL;
1077         u64 root_logical = root->node->start;
1078         int looped = 0;
1079
1080         if (!time_seq)
1081                 return 0;
1082
1083         /*
1084          * the very last operation that's logged for a root is the replacement
1085          * operation (if it is replaced at all). this has the index of the *new*
1086          * root, making it the very first operation that's logged for this root.
1087          */
1088         while (1) {
1089                 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1090                                                 time_seq);
1091                 if (!looped && !tm)
1092                         return 0;
1093                 /*
1094                  * if there are no tree operation for the oldest root, we simply
1095                  * return it. this should only happen if that (old) root is at
1096                  * level 0.
1097                  */
1098                 if (!tm)
1099                         break;
1100
1101                 /*
1102                  * if there's an operation that's not a root replacement, we
1103                  * found the oldest version of our root. normally, we'll find a
1104                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1105                  */
1106                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1107                         break;
1108
1109                 found = tm;
1110                 root_logical = tm->old_root.logical;
1111                 BUG_ON(root_logical == root->node->start);
1112                 looped = 1;
1113         }
1114
1115         /* if there's no old root to return, return what we found instead */
1116         if (!found)
1117                 found = tm;
1118
1119         return found;
1120 }
1121
1122 /*
1123  * tm is a pointer to the first operation to rewind within eb. then, all
1124  * previous operations will be rewinded (until we reach something older than
1125  * time_seq).
1126  */
1127 static void
1128 __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1129                       struct tree_mod_elem *first_tm)
1130 {
1131         u32 n;
1132         struct rb_node *next;
1133         struct tree_mod_elem *tm = first_tm;
1134         unsigned long o_dst;
1135         unsigned long o_src;
1136         unsigned long p_size = sizeof(struct btrfs_key_ptr);
1137
1138         n = btrfs_header_nritems(eb);
1139         while (tm && tm->seq >= time_seq) {
1140                 /*
1141                  * all the operations are recorded with the operator used for
1142                  * the modification. as we're going backwards, we do the
1143                  * opposite of each operation here.
1144                  */
1145                 switch (tm->op) {
1146                 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1147                         BUG_ON(tm->slot < n);
1148                 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1149                 case MOD_LOG_KEY_REMOVE:
1150                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1151                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1152                         btrfs_set_node_ptr_generation(eb, tm->slot,
1153                                                       tm->generation);
1154                         n++;
1155                         break;
1156                 case MOD_LOG_KEY_REPLACE:
1157                         BUG_ON(tm->slot >= n);
1158                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1159                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1160                         btrfs_set_node_ptr_generation(eb, tm->slot,
1161                                                       tm->generation);
1162                         break;
1163                 case MOD_LOG_KEY_ADD:
1164                         /* if a move operation is needed it's in the log */
1165                         n--;
1166                         break;
1167                 case MOD_LOG_MOVE_KEYS:
1168                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1169                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1170                         memmove_extent_buffer(eb, o_dst, o_src,
1171                                               tm->move.nr_items * p_size);
1172                         break;
1173                 case MOD_LOG_ROOT_REPLACE:
1174                         /*
1175                          * this operation is special. for roots, this must be
1176                          * handled explicitly before rewinding.
1177                          * for non-roots, this operation may exist if the node
1178                          * was a root: root A -> child B; then A gets empty and
1179                          * B is promoted to the new root. in the mod log, we'll
1180                          * have a root-replace operation for B, a tree block
1181                          * that is no root. we simply ignore that operation.
1182                          */
1183                         break;
1184                 }
1185                 next = rb_next(&tm->node);
1186                 if (!next)
1187                         break;
1188                 tm = container_of(next, struct tree_mod_elem, node);
1189                 if (tm->index != first_tm->index)
1190                         break;
1191         }
1192         btrfs_set_header_nritems(eb, n);
1193 }
1194
1195 static struct extent_buffer *
1196 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1197                     u64 time_seq)
1198 {
1199         struct extent_buffer *eb_rewin;
1200         struct tree_mod_elem *tm;
1201
1202         if (!time_seq)
1203                 return eb;
1204
1205         if (btrfs_header_level(eb) == 0)
1206                 return eb;
1207
1208         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1209         if (!tm)
1210                 return eb;
1211
1212         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1213                 BUG_ON(tm->slot != 0);
1214                 eb_rewin = alloc_dummy_extent_buffer(eb->start,
1215                                                 fs_info->tree_root->nodesize);
1216                 BUG_ON(!eb_rewin);
1217                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1218                 btrfs_set_header_backref_rev(eb_rewin,
1219                                              btrfs_header_backref_rev(eb));
1220                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1221                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1222         } else {
1223                 eb_rewin = btrfs_clone_extent_buffer(eb);
1224                 BUG_ON(!eb_rewin);
1225         }
1226
1227         extent_buffer_get(eb_rewin);
1228         free_extent_buffer(eb);
1229
1230         __tree_mod_log_rewind(eb_rewin, time_seq, tm);
1231
1232         return eb_rewin;
1233 }
1234
1235 /*
1236  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1237  * value. If there are no changes, the current root->root_node is returned. If
1238  * anything changed in between, there's a fresh buffer allocated on which the
1239  * rewind operations are done. In any case, the returned buffer is read locked.
1240  * Returns NULL on error (with no locks held).
1241  */
1242 static inline struct extent_buffer *
1243 get_old_root(struct btrfs_root *root, u64 time_seq)
1244 {
1245         struct tree_mod_elem *tm;
1246         struct extent_buffer *eb;
1247         struct tree_mod_root *old_root = NULL;
1248         u64 old_generation = 0;
1249         u64 logical;
1250
1251         eb = btrfs_read_lock_root_node(root);
1252         tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
1253         if (!tm)
1254                 return root->node;
1255
1256         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1257                 old_root = &tm->old_root;
1258                 old_generation = tm->generation;
1259                 logical = old_root->logical;
1260         } else {
1261                 logical = root->node->start;
1262         }
1263
1264         tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1265         if (old_root)
1266                 eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1267         else
1268                 eb = btrfs_clone_extent_buffer(root->node);
1269         btrfs_tree_read_unlock(root->node);
1270         free_extent_buffer(root->node);
1271         if (!eb)
1272                 return NULL;
1273         btrfs_tree_read_lock(eb);
1274         if (old_root) {
1275                 btrfs_set_header_bytenr(eb, eb->start);
1276                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1277                 btrfs_set_header_owner(eb, root->root_key.objectid);
1278                 btrfs_set_header_level(eb, old_root->level);
1279                 btrfs_set_header_generation(eb, old_generation);
1280         }
1281         if (tm)
1282                 __tree_mod_log_rewind(eb, time_seq, tm);
1283         else
1284                 WARN_ON(btrfs_header_level(eb) != 0);
1285         extent_buffer_get(eb);
1286
1287         return eb;
1288 }
1289
1290 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1291                                    struct btrfs_root *root,
1292                                    struct extent_buffer *buf)
1293 {
1294         /* ensure we can see the force_cow */
1295         smp_rmb();
1296
1297         /*
1298          * We do not need to cow a block if
1299          * 1) this block is not created or changed in this transaction;
1300          * 2) this block does not belong to TREE_RELOC tree;
1301          * 3) the root is not forced COW.
1302          *
1303          * What is forced COW:
1304          *    when we create snapshot during commiting the transaction,
1305          *    after we've finished coping src root, we must COW the shared
1306          *    block to ensure the metadata consistency.
1307          */
1308         if (btrfs_header_generation(buf) == trans->transid &&
1309             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1310             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1311               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1312             !root->force_cow)
1313                 return 0;
1314         return 1;
1315 }
1316
1317 /*
1318  * cows a single block, see __btrfs_cow_block for the real work.
1319  * This version of it has extra checks so that a block isn't cow'd more than
1320  * once per transaction, as long as it hasn't been written yet
1321  */
1322 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1323                     struct btrfs_root *root, struct extent_buffer *buf,
1324                     struct extent_buffer *parent, int parent_slot,
1325                     struct extent_buffer **cow_ret)
1326 {
1327         u64 search_start;
1328         int ret;
1329
1330         if (trans->transaction != root->fs_info->running_transaction) {
1331                 printk(KERN_CRIT "trans %llu running %llu\n",
1332                        (unsigned long long)trans->transid,
1333                        (unsigned long long)
1334                        root->fs_info->running_transaction->transid);
1335                 WARN_ON(1);
1336         }
1337         if (trans->transid != root->fs_info->generation) {
1338                 printk(KERN_CRIT "trans %llu running %llu\n",
1339                        (unsigned long long)trans->transid,
1340                        (unsigned long long)root->fs_info->generation);
1341                 WARN_ON(1);
1342         }
1343
1344         if (!should_cow_block(trans, root, buf)) {
1345                 *cow_ret = buf;
1346                 return 0;
1347         }
1348
1349         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1350
1351         if (parent)
1352                 btrfs_set_lock_blocking(parent);
1353         btrfs_set_lock_blocking(buf);
1354
1355         ret = __btrfs_cow_block(trans, root, buf, parent,
1356                                  parent_slot, cow_ret, search_start, 0);
1357
1358         trace_btrfs_cow_block(root, buf, *cow_ret);
1359
1360         return ret;
1361 }
1362
1363 /*
1364  * helper function for defrag to decide if two blocks pointed to by a
1365  * node are actually close by
1366  */
1367 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1368 {
1369         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1370                 return 1;
1371         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1372                 return 1;
1373         return 0;
1374 }
1375
1376 /*
1377  * compare two keys in a memcmp fashion
1378  */
1379 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1380 {
1381         struct btrfs_key k1;
1382
1383         btrfs_disk_key_to_cpu(&k1, disk);
1384
1385         return btrfs_comp_cpu_keys(&k1, k2);
1386 }
1387
1388 /*
1389  * same as comp_keys only with two btrfs_key's
1390  */
1391 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1392 {
1393         if (k1->objectid > k2->objectid)
1394                 return 1;
1395         if (k1->objectid < k2->objectid)
1396                 return -1;
1397         if (k1->type > k2->type)
1398                 return 1;
1399         if (k1->type < k2->type)
1400                 return -1;
1401         if (k1->offset > k2->offset)
1402                 return 1;
1403         if (k1->offset < k2->offset)
1404                 return -1;
1405         return 0;
1406 }
1407
1408 /*
1409  * this is used by the defrag code to go through all the
1410  * leaves pointed to by a node and reallocate them so that
1411  * disk order is close to key order
1412  */
1413 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1414                        struct btrfs_root *root, struct extent_buffer *parent,
1415                        int start_slot, int cache_only, u64 *last_ret,
1416                        struct btrfs_key *progress)
1417 {
1418         struct extent_buffer *cur;
1419         u64 blocknr;
1420         u64 gen;
1421         u64 search_start = *last_ret;
1422         u64 last_block = 0;
1423         u64 other;
1424         u32 parent_nritems;
1425         int end_slot;
1426         int i;
1427         int err = 0;
1428         int parent_level;
1429         int uptodate;
1430         u32 blocksize;
1431         int progress_passed = 0;
1432         struct btrfs_disk_key disk_key;
1433
1434         parent_level = btrfs_header_level(parent);
1435         if (cache_only && parent_level != 1)
1436                 return 0;
1437
1438         if (trans->transaction != root->fs_info->running_transaction)
1439                 WARN_ON(1);
1440         if (trans->transid != root->fs_info->generation)
1441                 WARN_ON(1);
1442
1443         parent_nritems = btrfs_header_nritems(parent);
1444         blocksize = btrfs_level_size(root, parent_level - 1);
1445         end_slot = parent_nritems;
1446
1447         if (parent_nritems == 1)
1448                 return 0;
1449
1450         btrfs_set_lock_blocking(parent);
1451
1452         for (i = start_slot; i < end_slot; i++) {
1453                 int close = 1;
1454
1455                 btrfs_node_key(parent, &disk_key, i);
1456                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1457                         continue;
1458
1459                 progress_passed = 1;
1460                 blocknr = btrfs_node_blockptr(parent, i);
1461                 gen = btrfs_node_ptr_generation(parent, i);
1462                 if (last_block == 0)
1463                         last_block = blocknr;
1464
1465                 if (i > 0) {
1466                         other = btrfs_node_blockptr(parent, i - 1);
1467                         close = close_blocks(blocknr, other, blocksize);
1468                 }
1469                 if (!close && i < end_slot - 2) {
1470                         other = btrfs_node_blockptr(parent, i + 1);
1471                         close = close_blocks(blocknr, other, blocksize);
1472                 }
1473                 if (close) {
1474                         last_block = blocknr;
1475                         continue;
1476                 }
1477
1478                 cur = btrfs_find_tree_block(root, blocknr, blocksize);
1479                 if (cur)
1480                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1481                 else
1482                         uptodate = 0;
1483                 if (!cur || !uptodate) {
1484                         if (cache_only) {
1485                                 free_extent_buffer(cur);
1486                                 continue;
1487                         }
1488                         if (!cur) {
1489                                 cur = read_tree_block(root, blocknr,
1490                                                          blocksize, gen);
1491                                 if (!cur)
1492                                         return -EIO;
1493                         } else if (!uptodate) {
1494                                 err = btrfs_read_buffer(cur, gen);
1495                                 if (err) {
1496                                         free_extent_buffer(cur);
1497                                         return err;
1498                                 }
1499                         }
1500                 }
1501                 if (search_start == 0)
1502                         search_start = last_block;
1503
1504                 btrfs_tree_lock(cur);
1505                 btrfs_set_lock_blocking(cur);
1506                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1507                                         &cur, search_start,
1508                                         min(16 * blocksize,
1509                                             (end_slot - i) * blocksize));
1510                 if (err) {
1511                         btrfs_tree_unlock(cur);
1512                         free_extent_buffer(cur);
1513                         break;
1514                 }
1515                 search_start = cur->start;
1516                 last_block = cur->start;
1517                 *last_ret = search_start;
1518                 btrfs_tree_unlock(cur);
1519                 free_extent_buffer(cur);
1520         }
1521         return err;
1522 }
1523
1524 /*
1525  * The leaf data grows from end-to-front in the node.
1526  * this returns the address of the start of the last item,
1527  * which is the stop of the leaf data stack
1528  */
1529 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1530                                          struct extent_buffer *leaf)
1531 {
1532         u32 nr = btrfs_header_nritems(leaf);
1533         if (nr == 0)
1534                 return BTRFS_LEAF_DATA_SIZE(root);
1535         return btrfs_item_offset_nr(leaf, nr - 1);
1536 }
1537
1538
1539 /*
1540  * search for key in the extent_buffer.  The items start at offset p,
1541  * and they are item_size apart.  There are 'max' items in p.
1542  *
1543  * the slot in the array is returned via slot, and it points to
1544  * the place where you would insert key if it is not found in
1545  * the array.
1546  *
1547  * slot may point to max if the key is bigger than all of the keys
1548  */
1549 static noinline int generic_bin_search(struct extent_buffer *eb,
1550                                        unsigned long p,
1551                                        int item_size, struct btrfs_key *key,
1552                                        int max, int *slot)
1553 {
1554         int low = 0;
1555         int high = max;
1556         int mid;
1557         int ret;
1558         struct btrfs_disk_key *tmp = NULL;
1559         struct btrfs_disk_key unaligned;
1560         unsigned long offset;
1561         char *kaddr = NULL;
1562         unsigned long map_start = 0;
1563         unsigned long map_len = 0;
1564         int err;
1565
1566         while (low < high) {
1567                 mid = (low + high) / 2;
1568                 offset = p + mid * item_size;
1569
1570                 if (!kaddr || offset < map_start ||
1571                     (offset + sizeof(struct btrfs_disk_key)) >
1572                     map_start + map_len) {
1573
1574                         err = map_private_extent_buffer(eb, offset,
1575                                                 sizeof(struct btrfs_disk_key),
1576                                                 &kaddr, &map_start, &map_len);
1577
1578                         if (!err) {
1579                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1580                                                         map_start);
1581                         } else {
1582                                 read_extent_buffer(eb, &unaligned,
1583                                                    offset, sizeof(unaligned));
1584                                 tmp = &unaligned;
1585                         }
1586
1587                 } else {
1588                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1589                                                         map_start);
1590                 }
1591                 ret = comp_keys(tmp, key);
1592
1593                 if (ret < 0)
1594                         low = mid + 1;
1595                 else if (ret > 0)
1596                         high = mid;
1597                 else {
1598                         *slot = mid;
1599                         return 0;
1600                 }
1601         }
1602         *slot = low;
1603         return 1;
1604 }
1605
1606 /*
1607  * simple bin_search frontend that does the right thing for
1608  * leaves vs nodes
1609  */
1610 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1611                       int level, int *slot)
1612 {
1613         if (level == 0)
1614                 return generic_bin_search(eb,
1615                                           offsetof(struct btrfs_leaf, items),
1616                                           sizeof(struct btrfs_item),
1617                                           key, btrfs_header_nritems(eb),
1618                                           slot);
1619         else
1620                 return generic_bin_search(eb,
1621                                           offsetof(struct btrfs_node, ptrs),
1622                                           sizeof(struct btrfs_key_ptr),
1623                                           key, btrfs_header_nritems(eb),
1624                                           slot);
1625 }
1626
1627 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1628                      int level, int *slot)
1629 {
1630         return bin_search(eb, key, level, slot);
1631 }
1632
1633 static void root_add_used(struct btrfs_root *root, u32 size)
1634 {
1635         spin_lock(&root->accounting_lock);
1636         btrfs_set_root_used(&root->root_item,
1637                             btrfs_root_used(&root->root_item) + size);
1638         spin_unlock(&root->accounting_lock);
1639 }
1640
1641 static void root_sub_used(struct btrfs_root *root, u32 size)
1642 {
1643         spin_lock(&root->accounting_lock);
1644         btrfs_set_root_used(&root->root_item,
1645                             btrfs_root_used(&root->root_item) - size);
1646         spin_unlock(&root->accounting_lock);
1647 }
1648
1649 /* given a node and slot number, this reads the blocks it points to.  The
1650  * extent buffer is returned with a reference taken (but unlocked).
1651  * NULL is returned on error.
1652  */
1653 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1654                                    struct extent_buffer *parent, int slot)
1655 {
1656         int level = btrfs_header_level(parent);
1657         if (slot < 0)
1658                 return NULL;
1659         if (slot >= btrfs_header_nritems(parent))
1660                 return NULL;
1661
1662         BUG_ON(level == 0);
1663
1664         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
1665                        btrfs_level_size(root, level - 1),
1666                        btrfs_node_ptr_generation(parent, slot));
1667 }
1668
1669 /*
1670  * node level balancing, used to make sure nodes are in proper order for
1671  * item deletion.  We balance from the top down, so we have to make sure
1672  * that a deletion won't leave an node completely empty later on.
1673  */
1674 static noinline int balance_level(struct btrfs_trans_handle *trans,
1675                          struct btrfs_root *root,
1676                          struct btrfs_path *path, int level)
1677 {
1678         struct extent_buffer *right = NULL;
1679         struct extent_buffer *mid;
1680         struct extent_buffer *left = NULL;
1681         struct extent_buffer *parent = NULL;
1682         int ret = 0;
1683         int wret;
1684         int pslot;
1685         int orig_slot = path->slots[level];
1686         u64 orig_ptr;
1687
1688         if (level == 0)
1689                 return 0;
1690
1691         mid = path->nodes[level];
1692
1693         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1694                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1695         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1696
1697         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1698
1699         if (level < BTRFS_MAX_LEVEL - 1) {
1700                 parent = path->nodes[level + 1];
1701                 pslot = path->slots[level + 1];
1702         }
1703
1704         /*
1705          * deal with the case where there is only one pointer in the root
1706          * by promoting the node below to a root
1707          */
1708         if (!parent) {
1709                 struct extent_buffer *child;
1710
1711                 if (btrfs_header_nritems(mid) != 1)
1712                         return 0;
1713
1714                 /* promote the child to a root */
1715                 child = read_node_slot(root, mid, 0);
1716                 if (!child) {
1717                         ret = -EROFS;
1718                         btrfs_std_error(root->fs_info, ret);
1719                         goto enospc;
1720                 }
1721
1722                 btrfs_tree_lock(child);
1723                 btrfs_set_lock_blocking(child);
1724                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1725                 if (ret) {
1726                         btrfs_tree_unlock(child);
1727                         free_extent_buffer(child);
1728                         goto enospc;
1729                 }
1730
1731                 tree_mod_log_set_root_pointer(root, child);
1732                 rcu_assign_pointer(root->node, child);
1733
1734                 add_root_to_dirty_list(root);
1735                 btrfs_tree_unlock(child);
1736
1737                 path->locks[level] = 0;
1738                 path->nodes[level] = NULL;
1739                 clean_tree_block(trans, root, mid);
1740                 btrfs_tree_unlock(mid);
1741                 /* once for the path */
1742                 free_extent_buffer(mid);
1743
1744                 root_sub_used(root, mid->len);
1745                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1746                 /* once for the root ptr */
1747                 free_extent_buffer_stale(mid);
1748                 return 0;
1749         }
1750         if (btrfs_header_nritems(mid) >
1751             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1752                 return 0;
1753
1754         left = read_node_slot(root, parent, pslot - 1);
1755         if (left) {
1756                 btrfs_tree_lock(left);
1757                 btrfs_set_lock_blocking(left);
1758                 wret = btrfs_cow_block(trans, root, left,
1759                                        parent, pslot - 1, &left);
1760                 if (wret) {
1761                         ret = wret;
1762                         goto enospc;
1763                 }
1764         }
1765         right = read_node_slot(root, parent, pslot + 1);
1766         if (right) {
1767                 btrfs_tree_lock(right);
1768                 btrfs_set_lock_blocking(right);
1769                 wret = btrfs_cow_block(trans, root, right,
1770                                        parent, pslot + 1, &right);
1771                 if (wret) {
1772                         ret = wret;
1773                         goto enospc;
1774                 }
1775         }
1776
1777         /* first, try to make some room in the middle buffer */
1778         if (left) {
1779                 orig_slot += btrfs_header_nritems(left);
1780                 wret = push_node_left(trans, root, left, mid, 1);
1781                 if (wret < 0)
1782                         ret = wret;
1783         }
1784
1785         /*
1786          * then try to empty the right most buffer into the middle
1787          */
1788         if (right) {
1789                 wret = push_node_left(trans, root, mid, right, 1);
1790                 if (wret < 0 && wret != -ENOSPC)
1791                         ret = wret;
1792                 if (btrfs_header_nritems(right) == 0) {
1793                         clean_tree_block(trans, root, right);
1794                         btrfs_tree_unlock(right);
1795                         del_ptr(trans, root, path, level + 1, pslot + 1, 1);
1796                         root_sub_used(root, right->len);
1797                         btrfs_free_tree_block(trans, root, right, 0, 1);
1798                         free_extent_buffer_stale(right);
1799                         right = NULL;
1800                 } else {
1801                         struct btrfs_disk_key right_key;
1802                         btrfs_node_key(right, &right_key, 0);
1803                         tree_mod_log_set_node_key(root->fs_info, parent,
1804                                                   &right_key, pslot + 1, 0);
1805                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1806                         btrfs_mark_buffer_dirty(parent);
1807                 }
1808         }
1809         if (btrfs_header_nritems(mid) == 1) {
1810                 /*
1811                  * we're not allowed to leave a node with one item in the
1812                  * tree during a delete.  A deletion from lower in the tree
1813                  * could try to delete the only pointer in this node.
1814                  * So, pull some keys from the left.
1815                  * There has to be a left pointer at this point because
1816                  * otherwise we would have pulled some pointers from the
1817                  * right
1818                  */
1819                 if (!left) {
1820                         ret = -EROFS;
1821                         btrfs_std_error(root->fs_info, ret);
1822                         goto enospc;
1823                 }
1824                 wret = balance_node_right(trans, root, mid, left);
1825                 if (wret < 0) {
1826                         ret = wret;
1827                         goto enospc;
1828                 }
1829                 if (wret == 1) {
1830                         wret = push_node_left(trans, root, left, mid, 1);
1831                         if (wret < 0)
1832                                 ret = wret;
1833                 }
1834                 BUG_ON(wret == 1);
1835         }
1836         if (btrfs_header_nritems(mid) == 0) {
1837                 clean_tree_block(trans, root, mid);
1838                 btrfs_tree_unlock(mid);
1839                 del_ptr(trans, root, path, level + 1, pslot, 1);
1840                 root_sub_used(root, mid->len);
1841                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1842                 free_extent_buffer_stale(mid);
1843                 mid = NULL;
1844         } else {
1845                 /* update the parent key to reflect our changes */
1846                 struct btrfs_disk_key mid_key;
1847                 btrfs_node_key(mid, &mid_key, 0);
1848                 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
1849                                           pslot, 0);
1850                 btrfs_set_node_key(parent, &mid_key, pslot);
1851                 btrfs_mark_buffer_dirty(parent);
1852         }
1853
1854         /* update the path */
1855         if (left) {
1856                 if (btrfs_header_nritems(left) > orig_slot) {
1857                         extent_buffer_get(left);
1858                         /* left was locked after cow */
1859                         path->nodes[level] = left;
1860                         path->slots[level + 1] -= 1;
1861                         path->slots[level] = orig_slot;
1862                         if (mid) {
1863                                 btrfs_tree_unlock(mid);
1864                                 free_extent_buffer(mid);
1865                         }
1866                 } else {
1867                         orig_slot -= btrfs_header_nritems(left);
1868                         path->slots[level] = orig_slot;
1869                 }
1870         }
1871         /* double check we haven't messed things up */
1872         if (orig_ptr !=
1873             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1874                 BUG();
1875 enospc:
1876         if (right) {
1877                 btrfs_tree_unlock(right);
1878                 free_extent_buffer(right);
1879         }
1880         if (left) {
1881                 if (path->nodes[level] != left)
1882                         btrfs_tree_unlock(left);
1883                 free_extent_buffer(left);
1884         }
1885         return ret;
1886 }
1887
1888 /* Node balancing for insertion.  Here we only split or push nodes around
1889  * when they are completely full.  This is also done top down, so we
1890  * have to be pessimistic.
1891  */
1892 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1893                                           struct btrfs_root *root,
1894                                           struct btrfs_path *path, int level)
1895 {
1896         struct extent_buffer *right = NULL;
1897         struct extent_buffer *mid;
1898         struct extent_buffer *left = NULL;
1899         struct extent_buffer *parent = NULL;
1900         int ret = 0;
1901         int wret;
1902         int pslot;
1903         int orig_slot = path->slots[level];
1904
1905         if (level == 0)
1906                 return 1;
1907
1908         mid = path->nodes[level];
1909         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1910
1911         if (level < BTRFS_MAX_LEVEL - 1) {
1912                 parent = path->nodes[level + 1];
1913                 pslot = path->slots[level + 1];
1914         }
1915
1916         if (!parent)
1917                 return 1;
1918
1919         left = read_node_slot(root, parent, pslot - 1);
1920
1921         /* first, try to make some room in the middle buffer */
1922         if (left) {
1923                 u32 left_nr;
1924
1925                 btrfs_tree_lock(left);
1926                 btrfs_set_lock_blocking(left);
1927
1928                 left_nr = btrfs_header_nritems(left);
1929                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1930                         wret = 1;
1931                 } else {
1932                         ret = btrfs_cow_block(trans, root, left, parent,
1933                                               pslot - 1, &left);
1934                         if (ret)
1935                                 wret = 1;
1936                         else {
1937                                 wret = push_node_left(trans, root,
1938                                                       left, mid, 0);
1939                         }
1940                 }
1941                 if (wret < 0)
1942                         ret = wret;
1943                 if (wret == 0) {
1944                         struct btrfs_disk_key disk_key;
1945                         orig_slot += left_nr;
1946                         btrfs_node_key(mid, &disk_key, 0);
1947                         tree_mod_log_set_node_key(root->fs_info, parent,
1948                                                   &disk_key, pslot, 0);
1949                         btrfs_set_node_key(parent, &disk_key, pslot);
1950                         btrfs_mark_buffer_dirty(parent);
1951                         if (btrfs_header_nritems(left) > orig_slot) {
1952                                 path->nodes[level] = left;
1953                                 path->slots[level + 1] -= 1;
1954                                 path->slots[level] = orig_slot;
1955                                 btrfs_tree_unlock(mid);
1956                                 free_extent_buffer(mid);
1957                         } else {
1958                                 orig_slot -=
1959                                         btrfs_header_nritems(left);
1960                                 path->slots[level] = orig_slot;
1961                                 btrfs_tree_unlock(left);
1962                                 free_extent_buffer(left);
1963                         }
1964                         return 0;
1965                 }
1966                 btrfs_tree_unlock(left);
1967                 free_extent_buffer(left);
1968         }
1969         right = read_node_slot(root, parent, pslot + 1);
1970
1971         /*
1972          * then try to empty the right most buffer into the middle
1973          */
1974         if (right) {
1975                 u32 right_nr;
1976
1977                 btrfs_tree_lock(right);
1978                 btrfs_set_lock_blocking(right);
1979
1980                 right_nr = btrfs_header_nritems(right);
1981                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1982                         wret = 1;
1983                 } else {
1984                         ret = btrfs_cow_block(trans, root, right,
1985                                               parent, pslot + 1,
1986                                               &right);
1987                         if (ret)
1988                                 wret = 1;
1989                         else {
1990                                 wret = balance_node_right(trans, root,
1991                                                           right, mid);
1992                         }
1993                 }
1994                 if (wret < 0)
1995                         ret = wret;
1996                 if (wret == 0) {
1997                         struct btrfs_disk_key disk_key;
1998
1999                         btrfs_node_key(right, &disk_key, 0);
2000                         tree_mod_log_set_node_key(root->fs_info, parent,
2001                                                   &disk_key, pslot + 1, 0);
2002                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2003                         btrfs_mark_buffer_dirty(parent);
2004
2005                         if (btrfs_header_nritems(mid) <= orig_slot) {
2006                                 path->nodes[level] = right;
2007                                 path->slots[level + 1] += 1;
2008                                 path->slots[level] = orig_slot -
2009                                         btrfs_header_nritems(mid);
2010                                 btrfs_tree_unlock(mid);
2011                                 free_extent_buffer(mid);
2012                         } else {
2013                                 btrfs_tree_unlock(right);
2014                                 free_extent_buffer(right);
2015                         }
2016                         return 0;
2017                 }
2018                 btrfs_tree_unlock(right);
2019                 free_extent_buffer(right);
2020         }
2021         return 1;
2022 }
2023
2024 /*
2025  * readahead one full node of leaves, finding things that are close
2026  * to the block in 'slot', and triggering ra on them.
2027  */
2028 static void reada_for_search(struct btrfs_root *root,
2029                              struct btrfs_path *path,
2030                              int level, int slot, u64 objectid)
2031 {
2032         struct extent_buffer *node;
2033         struct btrfs_disk_key disk_key;
2034         u32 nritems;
2035         u64 search;
2036         u64 target;
2037         u64 nread = 0;
2038         u64 gen;
2039         int direction = path->reada;
2040         struct extent_buffer *eb;
2041         u32 nr;
2042         u32 blocksize;
2043         u32 nscan = 0;
2044
2045         if (level != 1)
2046                 return;
2047
2048         if (!path->nodes[level])
2049                 return;
2050
2051         node = path->nodes[level];
2052
2053         search = btrfs_node_blockptr(node, slot);
2054         blocksize = btrfs_level_size(root, level - 1);
2055         eb = btrfs_find_tree_block(root, search, blocksize);
2056         if (eb) {
2057                 free_extent_buffer(eb);
2058                 return;
2059         }
2060
2061         target = search;
2062
2063         nritems = btrfs_header_nritems(node);
2064         nr = slot;
2065
2066         while (1) {
2067                 if (direction < 0) {
2068                         if (nr == 0)
2069                                 break;
2070                         nr--;
2071                 } else if (direction > 0) {
2072                         nr++;
2073                         if (nr >= nritems)
2074                                 break;
2075                 }
2076                 if (path->reada < 0 && objectid) {
2077                         btrfs_node_key(node, &disk_key, nr);
2078                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2079                                 break;
2080                 }
2081                 search = btrfs_node_blockptr(node, nr);
2082                 if ((search <= target && target - search <= 65536) ||
2083                     (search > target && search - target <= 65536)) {
2084                         gen = btrfs_node_ptr_generation(node, nr);
2085                         readahead_tree_block(root, search, blocksize, gen);
2086                         nread += blocksize;
2087                 }
2088                 nscan++;
2089                 if ((nread > 65536 || nscan > 32))
2090                         break;
2091         }
2092 }
2093
2094 /*
2095  * returns -EAGAIN if it had to drop the path, or zero if everything was in
2096  * cache
2097  */
2098 static noinline int reada_for_balance(struct btrfs_root *root,
2099                                       struct btrfs_path *path, int level)
2100 {
2101         int slot;
2102         int nritems;
2103         struct extent_buffer *parent;
2104         struct extent_buffer *eb;
2105         u64 gen;
2106         u64 block1 = 0;
2107         u64 block2 = 0;
2108         int ret = 0;
2109         int blocksize;
2110
2111         parent = path->nodes[level + 1];
2112         if (!parent)
2113                 return 0;
2114
2115         nritems = btrfs_header_nritems(parent);
2116         slot = path->slots[level + 1];
2117         blocksize = btrfs_level_size(root, level);
2118
2119         if (slot > 0) {
2120                 block1 = btrfs_node_blockptr(parent, slot - 1);
2121                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2122                 eb = btrfs_find_tree_block(root, block1, blocksize);
2123                 /*
2124                  * if we get -eagain from btrfs_buffer_uptodate, we
2125                  * don't want to return eagain here.  That will loop
2126                  * forever
2127                  */
2128                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2129                         block1 = 0;
2130                 free_extent_buffer(eb);
2131         }
2132         if (slot + 1 < nritems) {
2133                 block2 = btrfs_node_blockptr(parent, slot + 1);
2134                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2135                 eb = btrfs_find_tree_block(root, block2, blocksize);
2136                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2137                         block2 = 0;
2138                 free_extent_buffer(eb);
2139         }
2140         if (block1 || block2) {
2141                 ret = -EAGAIN;
2142
2143                 /* release the whole path */
2144                 btrfs_release_path(path);
2145
2146                 /* read the blocks */
2147                 if (block1)
2148                         readahead_tree_block(root, block1, blocksize, 0);
2149                 if (block2)
2150                         readahead_tree_block(root, block2, blocksize, 0);
2151
2152                 if (block1) {
2153                         eb = read_tree_block(root, block1, blocksize, 0);
2154                         free_extent_buffer(eb);
2155                 }
2156                 if (block2) {
2157                         eb = read_tree_block(root, block2, blocksize, 0);
2158                         free_extent_buffer(eb);
2159                 }
2160         }
2161         return ret;
2162 }
2163
2164
2165 /*
2166  * when we walk down the tree, it is usually safe to unlock the higher layers
2167  * in the tree.  The exceptions are when our path goes through slot 0, because
2168  * operations on the tree might require changing key pointers higher up in the
2169  * tree.
2170  *
2171  * callers might also have set path->keep_locks, which tells this code to keep
2172  * the lock if the path points to the last slot in the block.  This is part of
2173  * walking through the tree, and selecting the next slot in the higher block.
2174  *
2175  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2176  * if lowest_unlock is 1, level 0 won't be unlocked
2177  */
2178 static noinline void unlock_up(struct btrfs_path *path, int level,
2179                                int lowest_unlock, int min_write_lock_level,
2180                                int *write_lock_level)
2181 {
2182         int i;
2183         int skip_level = level;
2184         int no_skips = 0;
2185         struct extent_buffer *t;
2186
2187         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2188                 if (!path->nodes[i])
2189                         break;
2190                 if (!path->locks[i])
2191                         break;
2192                 if (!no_skips && path->slots[i] == 0) {
2193                         skip_level = i + 1;
2194                         continue;
2195                 }
2196                 if (!no_skips && path->keep_locks) {
2197                         u32 nritems;
2198                         t = path->nodes[i];
2199                         nritems = btrfs_header_nritems(t);
2200                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2201                                 skip_level = i + 1;
2202                                 continue;
2203                         }
2204                 }
2205                 if (skip_level < i && i >= lowest_unlock)
2206                         no_skips = 1;
2207
2208                 t = path->nodes[i];
2209                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2210                         btrfs_tree_unlock_rw(t, path->locks[i]);
2211                         path->locks[i] = 0;
2212                         if (write_lock_level &&
2213                             i > min_write_lock_level &&
2214                             i <= *write_lock_level) {
2215                                 *write_lock_level = i - 1;
2216                         }
2217                 }
2218         }
2219 }
2220
2221 /*
2222  * This releases any locks held in the path starting at level and
2223  * going all the way up to the root.
2224  *
2225  * btrfs_search_slot will keep the lock held on higher nodes in a few
2226  * corner cases, such as COW of the block at slot zero in the node.  This
2227  * ignores those rules, and it should only be called when there are no
2228  * more updates to be done higher up in the tree.
2229  */
2230 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2231 {
2232         int i;
2233
2234         if (path->keep_locks)
2235                 return;
2236
2237         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2238                 if (!path->nodes[i])
2239                         continue;
2240                 if (!path->locks[i])
2241                         continue;
2242                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2243                 path->locks[i] = 0;
2244         }
2245 }
2246
2247 /*
2248  * helper function for btrfs_search_slot.  The goal is to find a block
2249  * in cache without setting the path to blocking.  If we find the block
2250  * we return zero and the path is unchanged.
2251  *
2252  * If we can't find the block, we set the path blocking and do some
2253  * reada.  -EAGAIN is returned and the search must be repeated.
2254  */
2255 static int
2256 read_block_for_search(struct btrfs_trans_handle *trans,
2257                        struct btrfs_root *root, struct btrfs_path *p,
2258                        struct extent_buffer **eb_ret, int level, int slot,
2259                        struct btrfs_key *key, u64 time_seq)
2260 {
2261         u64 blocknr;
2262         u64 gen;
2263         u32 blocksize;
2264         struct extent_buffer *b = *eb_ret;
2265         struct extent_buffer *tmp;
2266         int ret;
2267
2268         blocknr = btrfs_node_blockptr(b, slot);
2269         gen = btrfs_node_ptr_generation(b, slot);
2270         blocksize = btrfs_level_size(root, level - 1);
2271
2272         tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2273         if (tmp) {
2274                 /* first we do an atomic uptodate check */
2275                 if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
2276                         if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2277                                 /*
2278                                  * we found an up to date block without
2279                                  * sleeping, return
2280                                  * right away
2281                                  */
2282                                 *eb_ret = tmp;
2283                                 return 0;
2284                         }
2285                         /* the pages were up to date, but we failed
2286                          * the generation number check.  Do a full
2287                          * read for the generation number that is correct.
2288                          * We must do this without dropping locks so
2289                          * we can trust our generation number
2290                          */
2291                         free_extent_buffer(tmp);
2292                         btrfs_set_path_blocking(p);
2293
2294                         /* now we're allowed to do a blocking uptodate check */
2295                         tmp = read_tree_block(root, blocknr, blocksize, gen);
2296                         if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
2297                                 *eb_ret = tmp;
2298                                 return 0;
2299                         }
2300                         free_extent_buffer(tmp);
2301                         btrfs_release_path(p);
2302                         return -EIO;
2303                 }
2304         }
2305
2306         /*
2307          * reduce lock contention at high levels
2308          * of the btree by dropping locks before
2309          * we read.  Don't release the lock on the current
2310          * level because we need to walk this node to figure
2311          * out which blocks to read.
2312          */
2313         btrfs_unlock_up_safe(p, level + 1);
2314         btrfs_set_path_blocking(p);
2315
2316         free_extent_buffer(tmp);
2317         if (p->reada)
2318                 reada_for_search(root, p, level, slot, key->objectid);
2319
2320         btrfs_release_path(p);
2321
2322         ret = -EAGAIN;
2323         tmp = read_tree_block(root, blocknr, blocksize, 0);
2324         if (tmp) {
2325                 /*
2326                  * If the read above didn't mark this buffer up to date,
2327                  * it will never end up being up to date.  Set ret to EIO now
2328                  * and give up so that our caller doesn't loop forever
2329                  * on our EAGAINs.
2330                  */
2331                 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2332                         ret = -EIO;
2333                 free_extent_buffer(tmp);
2334         }
2335         return ret;
2336 }
2337
2338 /*
2339  * helper function for btrfs_search_slot.  This does all of the checks
2340  * for node-level blocks and does any balancing required based on
2341  * the ins_len.
2342  *
2343  * If no extra work was required, zero is returned.  If we had to
2344  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2345  * start over
2346  */
2347 static int
2348 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2349                        struct btrfs_root *root, struct btrfs_path *p,
2350                        struct extent_buffer *b, int level, int ins_len,
2351                        int *write_lock_level)
2352 {
2353         int ret;
2354         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2355             BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2356                 int sret;
2357
2358                 if (*write_lock_level < level + 1) {
2359                         *write_lock_level = level + 1;
2360                         btrfs_release_path(p);
2361                         goto again;
2362                 }
2363
2364                 sret = reada_for_balance(root, p, level);
2365                 if (sret)
2366                         goto again;
2367
2368                 btrfs_set_path_blocking(p);
2369                 sret = split_node(trans, root, p, level);
2370                 btrfs_clear_path_blocking(p, NULL, 0);
2371
2372                 BUG_ON(sret > 0);
2373                 if (sret) {
2374                         ret = sret;
2375                         goto done;
2376                 }
2377                 b = p->nodes[level];
2378         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2379                    BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2380                 int sret;
2381
2382                 if (*write_lock_level < level + 1) {
2383                         *write_lock_level = level + 1;
2384                         btrfs_release_path(p);
2385                         goto again;
2386                 }
2387
2388                 sret = reada_for_balance(root, p, level);
2389                 if (sret)
2390                         goto again;
2391
2392                 btrfs_set_path_blocking(p);
2393                 sret = balance_level(trans, root, p, level);
2394                 btrfs_clear_path_blocking(p, NULL, 0);
2395
2396                 if (sret) {
2397                         ret = sret;
2398                         goto done;
2399                 }
2400                 b = p->nodes[level];
2401                 if (!b) {
2402                         btrfs_release_path(p);
2403                         goto again;
2404                 }
2405                 BUG_ON(btrfs_header_nritems(b) == 1);
2406         }
2407         return 0;
2408
2409 again:
2410         ret = -EAGAIN;
2411 done:
2412         return ret;
2413 }
2414
2415 /*
2416  * look for key in the tree.  path is filled in with nodes along the way
2417  * if key is found, we return zero and you can find the item in the leaf
2418  * level of the path (level 0)
2419  *
2420  * If the key isn't found, the path points to the slot where it should
2421  * be inserted, and 1 is returned.  If there are other errors during the
2422  * search a negative error number is returned.
2423  *
2424  * if ins_len > 0, nodes and leaves will be split as we walk down the
2425  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2426  * possible)
2427  */
2428 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2429                       *root, struct btrfs_key *key, struct btrfs_path *p, int
2430                       ins_len, int cow)
2431 {
2432         struct extent_buffer *b;
2433         int slot;
2434         int ret;
2435         int err;
2436         int level;
2437         int lowest_unlock = 1;
2438         int root_lock;
2439         /* everything at write_lock_level or lower must be write locked */
2440         int write_lock_level = 0;
2441         u8 lowest_level = 0;
2442         int min_write_lock_level;
2443
2444         lowest_level = p->lowest_level;
2445         WARN_ON(lowest_level && ins_len > 0);
2446         WARN_ON(p->nodes[0] != NULL);
2447
2448         if (ins_len < 0) {
2449                 lowest_unlock = 2;
2450
2451                 /* when we are removing items, we might have to go up to level
2452                  * two as we update tree pointers  Make sure we keep write
2453                  * for those levels as well
2454                  */
2455                 write_lock_level = 2;
2456         } else if (ins_len > 0) {
2457                 /*
2458                  * for inserting items, make sure we have a write lock on
2459                  * level 1 so we can update keys
2460                  */
2461                 write_lock_level = 1;
2462         }
2463
2464         if (!cow)
2465                 write_lock_level = -1;
2466
2467         if (cow && (p->keep_locks || p->lowest_level))
2468                 write_lock_level = BTRFS_MAX_LEVEL;
2469
2470         min_write_lock_level = write_lock_level;
2471
2472 again:
2473         /*
2474          * we try very hard to do read locks on the root
2475          */
2476         root_lock = BTRFS_READ_LOCK;
2477         level = 0;
2478         if (p->search_commit_root) {
2479                 /*
2480                  * the commit roots are read only
2481                  * so we always do read locks
2482                  */
2483                 b = root->commit_root;
2484                 extent_buffer_get(b);
2485                 level = btrfs_header_level(b);
2486                 if (!p->skip_locking)
2487                         btrfs_tree_read_lock(b);
2488         } else {
2489                 if (p->skip_locking) {
2490                         b = btrfs_root_node(root);
2491                         level = btrfs_header_level(b);
2492                 } else {
2493                         /* we don't know the level of the root node
2494                          * until we actually have it read locked
2495                          */
2496                         b = btrfs_read_lock_root_node(root);
2497                         level = btrfs_header_level(b);
2498                         if (level <= write_lock_level) {
2499                                 /* whoops, must trade for write lock */
2500                                 btrfs_tree_read_unlock(b);
2501                                 free_extent_buffer(b);
2502                                 b = btrfs_lock_root_node(root);
2503                                 root_lock = BTRFS_WRITE_LOCK;
2504
2505                                 /* the level might have changed, check again */
2506                                 level = btrfs_header_level(b);
2507                         }
2508                 }
2509         }
2510         p->nodes[level] = b;
2511         if (!p->skip_locking)
2512                 p->locks[level] = root_lock;
2513
2514         while (b) {
2515                 level = btrfs_header_level(b);
2516
2517                 /*
2518                  * setup the path here so we can release it under lock
2519                  * contention with the cow code
2520                  */
2521                 if (cow) {
2522                         /*
2523                          * if we don't really need to cow this block
2524                          * then we don't want to set the path blocking,
2525                          * so we test it here
2526                          */
2527                         if (!should_cow_block(trans, root, b))
2528                                 goto cow_done;
2529
2530                         btrfs_set_path_blocking(p);
2531
2532                         /*
2533                          * must have write locks on this node and the
2534                          * parent
2535                          */
2536                         if (level + 1 > write_lock_level) {
2537                                 write_lock_level = level + 1;
2538                                 btrfs_release_path(p);
2539                                 goto again;
2540                         }
2541
2542                         err = btrfs_cow_block(trans, root, b,
2543                                               p->nodes[level + 1],
2544                                               p->slots[level + 1], &b);
2545                         if (err) {
2546                                 ret = err;
2547                                 goto done;
2548                         }
2549                 }
2550 cow_done:
2551                 BUG_ON(!cow && ins_len);
2552
2553                 p->nodes[level] = b;
2554                 btrfs_clear_path_blocking(p, NULL, 0);
2555
2556                 /*
2557                  * we have a lock on b and as long as we aren't changing
2558                  * the tree, there is no way to for the items in b to change.
2559                  * It is safe to drop the lock on our parent before we
2560                  * go through the expensive btree search on b.
2561                  *
2562                  * If cow is true, then we might be changing slot zero,
2563                  * which may require changing the parent.  So, we can't
2564                  * drop the lock until after we know which slot we're
2565                  * operating on.
2566                  */
2567                 if (!cow)
2568                         btrfs_unlock_up_safe(p, level + 1);
2569
2570                 ret = bin_search(b, key, level, &slot);
2571
2572                 if (level != 0) {
2573                         int dec = 0;
2574                         if (ret && slot > 0) {
2575                                 dec = 1;
2576                                 slot -= 1;
2577                         }
2578                         p->slots[level] = slot;
2579                         err = setup_nodes_for_search(trans, root, p, b, level,
2580                                              ins_len, &write_lock_level);
2581                         if (err == -EAGAIN)
2582                                 goto again;
2583                         if (err) {
2584                                 ret = err;
2585                                 goto done;
2586                         }
2587                         b = p->nodes[level];
2588                         slot = p->slots[level];
2589
2590                         /*
2591                          * slot 0 is special, if we change the key
2592                          * we have to update the parent pointer
2593                          * which means we must have a write lock
2594                          * on the parent
2595                          */
2596                         if (slot == 0 && cow &&
2597                             write_lock_level < level + 1) {
2598                                 write_lock_level = level + 1;
2599                                 btrfs_release_path(p);
2600                                 goto again;
2601                         }
2602
2603                         unlock_up(p, level, lowest_unlock,
2604                                   min_write_lock_level, &write_lock_level);
2605
2606                         if (level == lowest_level) {
2607                                 if (dec)
2608                                         p->slots[level]++;
2609                                 goto done;
2610                         }
2611
2612                         err = read_block_for_search(trans, root, p,
2613                                                     &b, level, slot, key, 0);
2614                         if (err == -EAGAIN)
2615                                 goto again;
2616                         if (err) {
2617                                 ret = err;
2618                                 goto done;
2619                         }
2620
2621                         if (!p->skip_locking) {
2622                                 level = btrfs_header_level(b);
2623                                 if (level <= write_lock_level) {
2624                                         err = btrfs_try_tree_write_lock(b);
2625                                         if (!err) {
2626                                                 btrfs_set_path_blocking(p);
2627                                                 btrfs_tree_lock(b);
2628                                                 btrfs_clear_path_blocking(p, b,
2629                                                                   BTRFS_WRITE_LOCK);
2630                                         }
2631                                         p->locks[level] = BTRFS_WRITE_LOCK;
2632                                 } else {
2633                                         err = btrfs_try_tree_read_lock(b);
2634                                         if (!err) {
2635                                                 btrfs_set_path_blocking(p);
2636                                                 btrfs_tree_read_lock(b);
2637                                                 btrfs_clear_path_blocking(p, b,
2638                                                                   BTRFS_READ_LOCK);
2639                                         }
2640                                         p->locks[level] = BTRFS_READ_LOCK;
2641                                 }
2642                                 p->nodes[level] = b;
2643                         }
2644                 } else {
2645                         p->slots[level] = slot;
2646                         if (ins_len > 0 &&
2647                             btrfs_leaf_free_space(root, b) < ins_len) {
2648                                 if (write_lock_level < 1) {
2649                                         write_lock_level = 1;
2650                                         btrfs_release_path(p);
2651                                         goto again;
2652                                 }
2653
2654                                 btrfs_set_path_blocking(p);
2655                                 err = split_leaf(trans, root, key,
2656                                                  p, ins_len, ret == 0);
2657                                 btrfs_clear_path_blocking(p, NULL, 0);
2658
2659                                 BUG_ON(err > 0);
2660                                 if (err) {
2661                                         ret = err;
2662                                         goto done;
2663                                 }
2664                         }
2665                         if (!p->search_for_split)
2666                                 unlock_up(p, level, lowest_unlock,
2667                                           min_write_lock_level, &write_lock_level);
2668                         goto done;
2669                 }
2670         }
2671         ret = 1;
2672 done:
2673         /*
2674          * we don't really know what they plan on doing with the path
2675          * from here on, so for now just mark it as blocking
2676          */
2677         if (!p->leave_spinning)
2678                 btrfs_set_path_blocking(p);
2679         if (ret < 0)
2680                 btrfs_release_path(p);
2681         return ret;
2682 }
2683
2684 /*
2685  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2686  * current state of the tree together with the operations recorded in the tree
2687  * modification log to search for the key in a previous version of this tree, as
2688  * denoted by the time_seq parameter.
2689  *
2690  * Naturally, there is no support for insert, delete or cow operations.
2691  *
2692  * The resulting path and return value will be set up as if we called
2693  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2694  */
2695 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2696                           struct btrfs_path *p, u64 time_seq)
2697 {
2698         struct extent_buffer *b;
2699         int slot;
2700         int ret;
2701         int err;
2702         int level;
2703         int lowest_unlock = 1;
2704         u8 lowest_level = 0;
2705
2706         lowest_level = p->lowest_level;
2707         WARN_ON(p->nodes[0] != NULL);
2708
2709         if (p->search_commit_root) {
2710                 BUG_ON(time_seq);
2711                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2712         }
2713
2714 again:
2715         b = get_old_root(root, time_seq);
2716         level = btrfs_header_level(b);
2717         p->locks[level] = BTRFS_READ_LOCK;
2718
2719         while (b) {
2720                 level = btrfs_header_level(b);
2721                 p->nodes[level] = b;
2722                 btrfs_clear_path_blocking(p, NULL, 0);
2723
2724                 /*
2725                  * we have a lock on b and as long as we aren't changing
2726                  * the tree, there is no way to for the items in b to change.
2727                  * It is safe to drop the lock on our parent before we
2728                  * go through the expensive btree search on b.
2729                  */
2730                 btrfs_unlock_up_safe(p, level + 1);
2731
2732                 ret = bin_search(b, key, level, &slot);
2733
2734                 if (level != 0) {
2735                         int dec = 0;
2736                         if (ret && slot > 0) {
2737                                 dec = 1;
2738                                 slot -= 1;
2739                         }
2740                         p->slots[level] = slot;
2741                         unlock_up(p, level, lowest_unlock, 0, NULL);
2742
2743                         if (level == lowest_level) {
2744                                 if (dec)
2745                                         p->slots[level]++;
2746                                 goto done;
2747                         }
2748
2749                         err = read_block_for_search(NULL, root, p, &b, level,
2750                                                     slot, key, time_seq);
2751                         if (err == -EAGAIN)
2752                                 goto again;
2753                         if (err) {
2754                                 ret = err;
2755                                 goto done;
2756                         }
2757
2758                         level = btrfs_header_level(b);
2759                         err = btrfs_try_tree_read_lock(b);
2760                         if (!err) {
2761                                 btrfs_set_path_blocking(p);
2762                                 btrfs_tree_read_lock(b);
2763                                 btrfs_clear_path_blocking(p, b,
2764                                                           BTRFS_READ_LOCK);
2765                         }
2766                         p->locks[level] = BTRFS_READ_LOCK;
2767                         p->nodes[level] = b;
2768                         b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2769                         if (b != p->nodes[level]) {
2770                                 btrfs_tree_unlock_rw(p->nodes[level],
2771                                                      p->locks[level]);
2772                                 p->locks[level] = 0;
2773                                 p->nodes[level] = b;
2774                         }
2775                 } else {
2776                         p->slots[level] = slot;
2777                         unlock_up(p, level, lowest_unlock, 0, NULL);
2778                         goto done;
2779                 }
2780         }
2781         ret = 1;
2782 done:
2783         if (!p->leave_spinning)
2784                 btrfs_set_path_blocking(p);
2785         if (ret < 0)
2786                 btrfs_release_path(p);
2787
2788         return ret;
2789 }
2790
2791 /*
2792  * helper to use instead of search slot if no exact match is needed but
2793  * instead the next or previous item should be returned.
2794  * When find_higher is true, the next higher item is returned, the next lower
2795  * otherwise.
2796  * When return_any and find_higher are both true, and no higher item is found,
2797  * return the next lower instead.
2798  * When return_any is true and find_higher is false, and no lower item is found,
2799  * return the next higher instead.
2800  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2801  * < 0 on error
2802  */
2803 int btrfs_search_slot_for_read(struct btrfs_root *root,
2804                                struct btrfs_key *key, struct btrfs_path *p,
2805                                int find_higher, int return_any)
2806 {
2807         int ret;
2808         struct extent_buffer *leaf;
2809
2810 again:
2811         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2812         if (ret <= 0)
2813                 return ret;
2814         /*
2815          * a return value of 1 means the path is at the position where the
2816          * item should be inserted. Normally this is the next bigger item,
2817          * but in case the previous item is the last in a leaf, path points
2818          * to the first free slot in the previous leaf, i.e. at an invalid
2819          * item.
2820          */
2821         leaf = p->nodes[0];
2822
2823         if (find_higher) {
2824                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2825                         ret = btrfs_next_leaf(root, p);
2826                         if (ret <= 0)
2827                                 return ret;
2828                         if (!return_any)
2829                                 return 1;
2830                         /*
2831                          * no higher item found, return the next
2832                          * lower instead
2833                          */
2834                         return_any = 0;
2835                         find_higher = 0;
2836                         btrfs_release_path(p);
2837                         goto again;
2838                 }
2839         } else {
2840                 if (p->slots[0] == 0) {
2841                         ret = btrfs_prev_leaf(root, p);
2842                         if (ret < 0)
2843                                 return ret;
2844                         if (!ret) {
2845                                 p->slots[0] = btrfs_header_nritems(leaf) - 1;
2846                                 return 0;
2847                         }
2848                         if (!return_any)
2849                                 return 1;
2850                         /*
2851                          * no lower item found, return the next
2852                          * higher instead
2853                          */
2854                         return_any = 0;
2855                         find_higher = 1;
2856                         btrfs_release_path(p);
2857                         goto again;
2858                 } else {
2859                         --p->slots[0];
2860                 }
2861         }
2862         return 0;
2863 }
2864
2865 /*
2866  * adjust the pointers going up the tree, starting at level
2867  * making sure the right key of each node is points to 'key'.
2868  * This is used after shifting pointers to the left, so it stops
2869  * fixing up pointers when a given leaf/node is not in slot 0 of the
2870  * higher levels
2871  *
2872  */
2873 static void fixup_low_keys(struct btrfs_trans_handle *trans,
2874                            struct btrfs_root *root, struct btrfs_path *path,
2875                            struct btrfs_disk_key *key, int level)
2876 {
2877         int i;
2878         struct extent_buffer *t;
2879
2880         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2881                 int tslot = path->slots[i];
2882                 if (!path->nodes[i])
2883                         break;
2884                 t = path->nodes[i];
2885                 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
2886                 btrfs_set_node_key(t, key, tslot);
2887                 btrfs_mark_buffer_dirty(path->nodes[i]);
2888                 if (tslot != 0)
2889                         break;
2890         }
2891 }
2892
2893 /*
2894  * update item key.
2895  *
2896  * This function isn't completely safe. It's the caller's responsibility
2897  * that the new key won't break the order
2898  */
2899 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2900                              struct btrfs_root *root, struct btrfs_path *path,
2901                              struct btrfs_key *new_key)
2902 {
2903         struct btrfs_disk_key disk_key;
2904         struct extent_buffer *eb;
2905         int slot;
2906
2907         eb = path->nodes[0];
2908         slot = path->slots[0];
2909         if (slot > 0) {
2910                 btrfs_item_key(eb, &disk_key, slot - 1);
2911                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2912         }
2913         if (slot < btrfs_header_nritems(eb) - 1) {
2914                 btrfs_item_key(eb, &disk_key, slot + 1);
2915                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
2916         }
2917
2918         btrfs_cpu_key_to_disk(&disk_key, new_key);
2919         btrfs_set_item_key(eb, &disk_key, slot);
2920         btrfs_mark_buffer_dirty(eb);
2921         if (slot == 0)
2922                 fixup_low_keys(trans, root, path, &disk_key, 1);
2923 }
2924
2925 /*
2926  * try to push data from one node into the next node left in the
2927  * tree.
2928  *
2929  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2930  * error, and > 0 if there was no room in the left hand block.
2931  */
2932 static int push_node_left(struct btrfs_trans_handle *trans,
2933                           struct btrfs_root *root, struct extent_buffer *dst,
2934                           struct extent_buffer *src, int empty)
2935 {
2936         int push_items = 0;
2937         int src_nritems;
2938         int dst_nritems;
2939         int ret = 0;
2940
2941         src_nritems = btrfs_header_nritems(src);
2942         dst_nritems = btrfs_header_nritems(dst);
2943         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
2944         WARN_ON(btrfs_header_generation(src) != trans->transid);
2945         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2946
2947         if (!empty && src_nritems <= 8)
2948                 return 1;
2949
2950         if (push_items <= 0)
2951                 return 1;
2952
2953         if (empty) {
2954                 push_items = min(src_nritems, push_items);
2955                 if (push_items < src_nritems) {
2956                         /* leave at least 8 pointers in the node if
2957                          * we aren't going to empty it
2958                          */
2959                         if (src_nritems - push_items < 8) {
2960                                 if (push_items <= 8)
2961                                         return 1;
2962                                 push_items -= 8;
2963                         }
2964                 }
2965         } else
2966                 push_items = min(src_nritems - 8, push_items);
2967
2968         tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
2969                              push_items);
2970         copy_extent_buffer(dst, src,
2971                            btrfs_node_key_ptr_offset(dst_nritems),
2972                            btrfs_node_key_ptr_offset(0),
2973                            push_items * sizeof(struct btrfs_key_ptr));
2974
2975         if (push_items < src_nritems) {
2976                 tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
2977                                      src_nritems - push_items);
2978                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2979                                       btrfs_node_key_ptr_offset(push_items),
2980                                       (src_nritems - push_items) *
2981                                       sizeof(struct btrfs_key_ptr));
2982         }
2983         btrfs_set_header_nritems(src, src_nritems - push_items);
2984         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2985         btrfs_mark_buffer_dirty(src);
2986         btrfs_mark_buffer_dirty(dst);
2987
2988         return ret;
2989 }
2990
2991 /*
2992  * try to push data from one node into the next node right in the
2993  * tree.
2994  *
2995  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2996  * error, and > 0 if there was no room in the right hand block.
2997  *
2998  * this will  only push up to 1/2 the contents of the left node over
2999  */
3000 static int balance_node_right(struct btrfs_trans_handle *trans,
3001                               struct btrfs_root *root,
3002                               struct extent_buffer *dst,
3003                               struct extent_buffer *src)
3004 {
3005         int push_items = 0;
3006         int max_push;
3007         int src_nritems;
3008         int dst_nritems;
3009         int ret = 0;
3010
3011         WARN_ON(btrfs_header_generation(src) != trans->transid);
3012         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3013
3014         src_nritems = btrfs_header_nritems(src);
3015         dst_nritems = btrfs_header_nritems(dst);
3016         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3017         if (push_items <= 0)
3018                 return 1;
3019
3020         if (src_nritems < 4)
3021                 return 1;
3022
3023         max_push = src_nritems / 2 + 1;
3024         /* don't try to empty the node */
3025         if (max_push >= src_nritems)
3026                 return 1;
3027
3028         if (max_push < push_items)
3029                 push_items = max_push;
3030
3031         tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3032         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3033                                       btrfs_node_key_ptr_offset(0),
3034                                       (dst_nritems) *
3035                                       sizeof(struct btrfs_key_ptr));
3036
3037         tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3038                              src_nritems - push_items, push_items);
3039         copy_extent_buffer(dst, src,
3040                            btrfs_node_key_ptr_offset(0),
3041                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3042                            push_items * sizeof(struct btrfs_key_ptr));
3043
3044         btrfs_set_header_nritems(src, src_nritems - push_items);
3045         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3046
3047         btrfs_mark_buffer_dirty(src);
3048         btrfs_mark_buffer_dirty(dst);
3049
3050         return ret;
3051 }
3052
3053 /*
3054  * helper function to insert a new root level in the tree.
3055  * A new node is allocated, and a single item is inserted to
3056  * point to the existing root
3057  *
3058  * returns zero on success or < 0 on failure.
3059  */
3060 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3061                            struct btrfs_root *root,
3062                            struct btrfs_path *path, int level)
3063 {
3064         u64 lower_gen;
3065         struct extent_buffer *lower;
3066         struct extent_buffer *c;
3067         struct extent_buffer *old;
3068         struct btrfs_disk_key lower_key;
3069
3070         BUG_ON(path->nodes[level]);
3071         BUG_ON(path->nodes[level-1] != root->node);
3072
3073         lower = path->nodes[level-1];
3074         if (level == 1)
3075                 btrfs_item_key(lower, &lower_key, 0);
3076         else
3077                 btrfs_node_key(lower, &lower_key, 0);
3078
3079         c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3080                                    root->root_key.objectid, &lower_key,
3081                                    level, root->node->start, 0);
3082         if (IS_ERR(c))
3083                 return PTR_ERR(c);
3084
3085         root_add_used(root, root->nodesize);
3086
3087         memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3088         btrfs_set_header_nritems(c, 1);
3089         btrfs_set_header_level(c, level);
3090         btrfs_set_header_bytenr(c, c->start);
3091         btrfs_set_header_generation(c, trans->transid);
3092         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3093         btrfs_set_header_owner(c, root->root_key.objectid);
3094
3095         write_extent_buffer(c, root->fs_info->fsid,
3096                             (unsigned long)btrfs_header_fsid(c),
3097                             BTRFS_FSID_SIZE);
3098
3099         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3100                             (unsigned long)btrfs_header_chunk_tree_uuid(c),
3101                             BTRFS_UUID_SIZE);
3102
3103         btrfs_set_node_key(c, &lower_key, 0);
3104         btrfs_set_node_blockptr(c, 0, lower->start);
3105         lower_gen = btrfs_header_generation(lower);
3106         WARN_ON(lower_gen != trans->transid);
3107
3108         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3109
3110         btrfs_mark_buffer_dirty(c);
3111
3112         old = root->node;
3113         tree_mod_log_set_root_pointer(root, c);
3114         rcu_assign_pointer(root->node, c);
3115
3116         /* the super has an extra ref to root->node */
3117         free_extent_buffer(old);
3118
3119         add_root_to_dirty_list(root);
3120         extent_buffer_get(c);
3121         path->nodes[level] = c;
3122         path->locks[level] = BTRFS_WRITE_LOCK;
3123         path->slots[level] = 0;
3124         return 0;
3125 }
3126
3127 /*
3128  * worker function to insert a single pointer in a node.
3129  * the node should have enough room for the pointer already
3130  *
3131  * slot and level indicate where you want the key to go, and
3132  * blocknr is the block the key points to.
3133  */
3134 static void insert_ptr(struct btrfs_trans_handle *trans,
3135                        struct btrfs_root *root, struct btrfs_path *path,
3136                        struct btrfs_disk_key *key, u64 bytenr,
3137                        int slot, int level)
3138 {
3139         struct extent_buffer *lower;
3140         int nritems;
3141         int ret;
3142
3143         BUG_ON(!path->nodes[level]);
3144         btrfs_assert_tree_locked(path->nodes[level]);
3145         lower = path->nodes[level];
3146         nritems = btrfs_header_nritems(lower);
3147         BUG_ON(slot > nritems);
3148         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3149         if (slot != nritems) {
3150                 if (level)
3151                         tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3152                                              slot, nritems - slot);
3153                 memmove_extent_buffer(lower,
3154                               btrfs_node_key_ptr_offset(slot + 1),
3155                               btrfs_node_key_ptr_offset(slot),
3156                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3157         }
3158         if (level) {
3159                 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3160                                               MOD_LOG_KEY_ADD);
3161                 BUG_ON(ret < 0);
3162         }
3163         btrfs_set_node_key(lower, key, slot);
3164         btrfs_set_node_blockptr(lower, slot, bytenr);
3165         WARN_ON(trans->transid == 0);
3166         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3167         btrfs_set_header_nritems(lower, nritems + 1);
3168         btrfs_mark_buffer_dirty(lower);
3169 }
3170
3171 /*
3172  * split the node at the specified level in path in two.
3173  * The path is corrected to point to the appropriate node after the split
3174  *
3175  * Before splitting this tries to make some room in the node by pushing
3176  * left and right, if either one works, it returns right away.
3177  *
3178  * returns 0 on success and < 0 on failure
3179  */
3180 static noinline int split_node(struct btrfs_trans_handle *trans,
3181                                struct btrfs_root *root,
3182                                struct btrfs_path *path, int level)
3183 {
3184         struct extent_buffer *c;
3185         struct extent_buffer *split;
3186         struct btrfs_disk_key disk_key;
3187         int mid;
3188         int ret;
3189         u32 c_nritems;
3190
3191         c = path->nodes[level];
3192         WARN_ON(btrfs_header_generation(c) != trans->transid);
3193         if (c == root->node) {
3194                 /* trying to split the root, lets make a new one */
3195                 ret = insert_new_root(trans, root, path, level + 1);
3196                 if (ret)
3197                         return ret;
3198         } else {
3199                 ret = push_nodes_for_insert(trans, root, path, level);
3200                 c = path->nodes[level];
3201                 if (!ret && btrfs_header_nritems(c) <
3202                     BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3203                         return 0;
3204                 if (ret < 0)
3205                         return ret;
3206         }
3207
3208         c_nritems = btrfs_header_nritems(c);
3209         mid = (c_nritems + 1) / 2;
3210         btrfs_node_key(c, &disk_key, mid);
3211
3212         split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3213                                         root->root_key.objectid,
3214                                         &disk_key, level, c->start, 0);
3215         if (IS_ERR(split))
3216                 return PTR_ERR(split);
3217
3218         root_add_used(root, root->nodesize);
3219
3220         memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3221         btrfs_set_header_level(split, btrfs_header_level(c));
3222         btrfs_set_header_bytenr(split, split->start);
3223         btrfs_set_header_generation(split, trans->transid);
3224         btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3225         btrfs_set_header_owner(split, root->root_key.objectid);
3226         write_extent_buffer(split, root->fs_info->fsid,
3227                             (unsigned long)btrfs_header_fsid(split),
3228                             BTRFS_FSID_SIZE);
3229         write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3230                             (unsigned long)btrfs_header_chunk_tree_uuid(split),
3231                             BTRFS_UUID_SIZE);
3232
3233         tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
3234         copy_extent_buffer(split, c,
3235                            btrfs_node_key_ptr_offset(0),
3236                            btrfs_node_key_ptr_offset(mid),
3237                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3238         btrfs_set_header_nritems(split, c_nritems - mid);
3239         btrfs_set_header_nritems(c, mid);
3240         ret = 0;
3241
3242         btrfs_mark_buffer_dirty(c);
3243         btrfs_mark_buffer_dirty(split);
3244
3245         insert_ptr(trans, root, path, &disk_key, split->start,
3246                    path->slots[level + 1] + 1, level + 1);
3247
3248         if (path->slots[level] >= mid) {
3249                 path->slots[level] -= mid;
3250                 btrfs_tree_unlock(c);
3251                 free_extent_buffer(c);
3252                 path->nodes[level] = split;
3253                 path->slots[level + 1] += 1;
3254         } else {
3255                 btrfs_tree_unlock(split);
3256                 free_extent_buffer(split);
3257         }
3258         return ret;
3259 }
3260
3261 /*
3262  * how many bytes are required to store the items in a leaf.  start
3263  * and nr indicate which items in the leaf to check.  This totals up the
3264  * space used both by the item structs and the item data
3265  */
3266 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3267 {
3268         int data_len;
3269         int nritems = btrfs_header_nritems(l);
3270         int end = min(nritems, start + nr) - 1;
3271
3272         if (!nr)
3273                 return 0;
3274         data_len = btrfs_item_end_nr(l, start);
3275         data_len = data_len - btrfs_item_offset_nr(l, end);
3276         data_len += sizeof(struct btrfs_item) * nr;
3277         WARN_ON(data_len < 0);
3278         return data_len;
3279 }
3280
3281 /*
3282  * The space between the end of the leaf items and
3283  * the start of the leaf data.  IOW, how much room
3284  * the leaf has left for both items and data
3285  */
3286 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3287                                    struct extent_buffer *leaf)
3288 {
3289         int nritems = btrfs_header_nritems(leaf);
3290         int ret;
3291         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3292         if (ret < 0) {
3293                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3294                        "used %d nritems %d\n",
3295                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3296                        leaf_space_used(leaf, 0, nritems), nritems);
3297         }
3298         return ret;
3299 }
3300
3301 /*
3302  * min slot controls the lowest index we're willing to push to the
3303  * right.  We'll push up to and including min_slot, but no lower
3304  */
3305 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3306                                       struct btrfs_root *root,
3307                                       struct btrfs_path *path,
3308                                       int data_size, int empty,
3309                                       struct extent_buffer *right,
3310                                       int free_space, u32 left_nritems,
3311                                       u32 min_slot)
3312 {
3313         struct extent_buffer *left = path->nodes[0];
3314         struct extent_buffer *upper = path->nodes[1];
3315         struct btrfs_map_token token;
3316         struct btrfs_disk_key disk_key;
3317         int slot;
3318         u32 i;
3319         int push_space = 0;
3320         int push_items = 0;
3321         struct btrfs_item *item;
3322         u32 nr;
3323         u32 right_nritems;
3324         u32 data_end;
3325         u32 this_item_size;
3326
3327         btrfs_init_map_token(&token);
3328
3329         if (empty)
3330                 nr = 0;
3331         else
3332                 nr = max_t(u32, 1, min_slot);
3333
3334         if (path->slots[0] >= left_nritems)
3335                 push_space += data_size;
3336
3337         slot = path->slots[1];
3338         i = left_nritems - 1;
3339         while (i >= nr) {
3340                 item = btrfs_item_nr(left, i);
3341
3342                 if (!empty && push_items > 0) {
3343                         if (path->slots[0] > i)
3344                                 break;
3345                         if (path->slots[0] == i) {
3346                                 int space = btrfs_leaf_free_space(root, left);
3347                                 if (space + push_space * 2 > free_space)
3348                                         break;
3349                         }
3350                 }
3351
3352                 if (path->slots[0] == i)
3353                         push_space += data_size;
3354
3355                 this_item_size = btrfs_item_size(left, item);
3356                 if (this_item_size + sizeof(*item) + push_space > free_space)
3357                         break;
3358
3359                 push_items++;
3360                 push_space += this_item_size + sizeof(*item);
3361                 if (i == 0)
3362                         break;
3363                 i--;
3364         }
3365
3366         if (push_items == 0)
3367                 goto out_unlock;
3368
3369         if (!empty && push_items == left_nritems)
3370                 WARN_ON(1);
3371
3372         /* push left to right */
3373         right_nritems = btrfs_header_nritems(right);
3374
3375         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3376         push_space -= leaf_data_end(root, left);
3377
3378         /* make room in the right data area */
3379         data_end = leaf_data_end(root, right);
3380         memmove_extent_buffer(right,
3381                               btrfs_leaf_data(right) + data_end - push_space,
3382                               btrfs_leaf_data(right) + data_end,
3383                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
3384
3385         /* copy from the left data area */
3386         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3387                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3388                      btrfs_leaf_data(left) + leaf_data_end(root, left),
3389                      push_space);
3390
3391         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3392                               btrfs_item_nr_offset(0),
3393                               right_nritems * sizeof(struct btrfs_item));
3394
3395         /* copy the items from left to right */
3396         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3397                    btrfs_item_nr_offset(left_nritems - push_items),
3398                    push_items * sizeof(struct btrfs_item));
3399
3400         /* update the item pointers */
3401         right_nritems += push_items;
3402         btrfs_set_header_nritems(right, right_nritems);
3403         push_space = BTRFS_LEAF_DATA_SIZE(root);
3404         for (i = 0; i < right_nritems; i++) {
3405                 item = btrfs_item_nr(right, i);
3406                 push_space -= btrfs_token_item_size(right, item, &token);
3407                 btrfs_set_token_item_offset(right, item, push_space, &token);
3408         }
3409
3410         left_nritems -= push_items;
3411         btrfs_set_header_nritems(left, left_nritems);
3412
3413         if (left_nritems)
3414                 btrfs_mark_buffer_dirty(left);
3415         else
3416                 clean_tree_block(trans, root, left);
3417
3418         btrfs_mark_buffer_dirty(right);
3419
3420         btrfs_item_key(right, &disk_key, 0);
3421         btrfs_set_node_key(upper, &disk_key, slot + 1);
3422         btrfs_mark_buffer_dirty(upper);
3423
3424         /* then fixup the leaf pointer in the path */
3425         if (path->slots[0] >= left_nritems) {
3426                 path->slots[0] -= left_nritems;
3427                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3428                         clean_tree_block(trans, root, path->nodes[0]);
3429                 btrfs_tree_unlock(path->nodes[0]);
3430                 free_extent_buffer(path->nodes[0]);
3431                 path->nodes[0] = right;
3432                 path->slots[1] += 1;
3433         } else {
3434                 btrfs_tree_unlock(right);
3435                 free_extent_buffer(right);
3436         }
3437         return 0;
3438
3439 out_unlock:
3440         btrfs_tree_unlock(right);
3441         free_extent_buffer(right);
3442         return 1;
3443 }
3444
3445 /*
3446  * push some data in the path leaf to the right, trying to free up at
3447  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3448  *
3449  * returns 1 if the push failed because the other node didn't have enough
3450  * room, 0 if everything worked out and < 0 if there were major errors.
3451  *
3452  * this will push starting from min_slot to the end of the leaf.  It won't
3453  * push any slot lower than min_slot
3454  */
3455 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3456                            *root, struct btrfs_path *path,
3457                            int min_data_size, int data_size,
3458                            int empty, u32 min_slot)
3459 {
3460         struct extent_buffer *left = path->nodes[0];
3461         struct extent_buffer *right;
3462         struct extent_buffer *upper;
3463         int slot;
3464         int free_space;
3465         u32 left_nritems;
3466         int ret;
3467
3468         if (!path->nodes[1])
3469                 return 1;
3470
3471         slot = path->slots[1];
3472         upper = path->nodes[1];
3473         if (slot >= btrfs_header_nritems(upper) - 1)
3474                 return 1;
3475
3476         btrfs_assert_tree_locked(path->nodes[1]);
3477
3478         right = read_node_slot(root, upper, slot + 1);
3479         if (right == NULL)
3480                 return 1;
3481
3482         btrfs_tree_lock(right);
3483         btrfs_set_lock_blocking(right);
3484
3485         free_space = btrfs_leaf_free_space(root, right);
3486         if (free_space < data_size)
3487                 goto out_unlock;
3488
3489         /* cow and double check */
3490         ret = btrfs_cow_block(trans, root, right, upper,
3491                               slot + 1, &right);
3492         if (ret)
3493                 goto out_unlock;
3494
3495         free_space = btrfs_leaf_free_space(root, right);
3496         if (free_space < data_size)
3497                 goto out_unlock;
3498
3499         left_nritems = btrfs_header_nritems(left);
3500         if (left_nritems == 0)
3501                 goto out_unlock;
3502
3503         return __push_leaf_right(trans, root, path, min_data_size, empty,
3504                                 right, free_space, left_nritems, min_slot);
3505 out_unlock:
3506         btrfs_tree_unlock(right);
3507         free_extent_buffer(right);
3508         return 1;
3509 }
3510
3511 /*
3512  * push some data in the path leaf to the left, trying to free up at
3513  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3514  *
3515  * max_slot can put a limit on how far into the leaf we'll push items.  The
3516  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3517  * items
3518  */
3519 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3520                                      struct btrfs_root *root,
3521                                      struct btrfs_path *path, int data_size,
3522                                      int empty, struct extent_buffer *left,
3523                                      int free_space, u32 right_nritems,
3524                                      u32 max_slot)
3525 {
3526         struct btrfs_disk_key disk_key;
3527         struct extent_buffer *right = path->nodes[0];
3528         int i;
3529         int push_space = 0;
3530         int push_items = 0;
3531         struct btrfs_item *item;
3532         u32 old_left_nritems;
3533         u32 nr;
3534         int ret = 0;
3535         u32 this_item_size;
3536         u32 old_left_item_size;
3537         struct btrfs_map_token token;
3538
3539         btrfs_init_map_token(&token);
3540
3541         if (empty)
3542                 nr = min(right_nritems, max_slot);
3543         else
3544                 nr = min(right_nritems - 1, max_slot);
3545
3546         for (i = 0; i < nr; i++) {
3547                 item = btrfs_item_nr(right, i);
3548
3549                 if (!empty && push_items > 0) {
3550                         if (path->slots[0] < i)
3551                                 break;
3552                         if (path->slots[0] == i) {
3553                                 int space = btrfs_leaf_free_space(root, right);
3554                                 if (space + push_space * 2 > free_space)
3555                                         break;
3556                         }
3557                 }
3558
3559                 if (path->slots[0] == i)
3560                         push_space += data_size;
3561
3562                 this_item_size = btrfs_item_size(right, item);
3563                 if (this_item_size + sizeof(*item) + push_space > free_space)
3564                         break;
3565
3566                 push_items++;
3567                 push_space += this_item_size + sizeof(*item);
3568         }
3569
3570         if (push_items == 0) {
3571                 ret = 1;
3572                 goto out;
3573         }
3574         if (!empty && push_items == btrfs_header_nritems(right))
3575                 WARN_ON(1);
3576
3577         /* push data from right to left */
3578         copy_extent_buffer(left, right,
3579                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3580                            btrfs_item_nr_offset(0),
3581                            push_items * sizeof(struct btrfs_item));
3582
3583         push_space = BTRFS_LEAF_DATA_SIZE(root) -
3584                      btrfs_item_offset_nr(right, push_items - 1);
3585
3586         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3587                      leaf_data_end(root, left) - push_space,
3588                      btrfs_leaf_data(right) +
3589                      btrfs_item_offset_nr(right, push_items - 1),
3590                      push_space);
3591         old_left_nritems = btrfs_header_nritems(left);
3592         BUG_ON(old_left_nritems <= 0);
3593
3594         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3595         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3596                 u32 ioff;
3597
3598                 item = btrfs_item_nr(left, i);
3599
3600                 ioff = btrfs_token_item_offset(left, item, &token);
3601                 btrfs_set_token_item_offset(left, item,
3602                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3603                       &token);
3604         }
3605         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3606
3607         /* fixup right node */
3608         if (push_items > right_nritems) {
3609                 printk(KERN_CRIT "push items %d nr %u\n", push_items,
3610                        right_nritems);
3611                 WARN_ON(1);
3612         }
3613
3614         if (push_items < right_nritems) {
3615                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3616                                                   leaf_data_end(root, right);
3617                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
3618                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
3619                                       btrfs_leaf_data(right) +
3620                                       leaf_data_end(root, right), push_space);
3621
3622                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3623                               btrfs_item_nr_offset(push_items),
3624                              (btrfs_header_nritems(right) - push_items) *
3625                              sizeof(struct btrfs_item));
3626         }
3627         right_nritems -= push_items;
3628         btrfs_set_header_nritems(right, right_nritems);
3629         push_space = BTRFS_LEAF_DATA_SIZE(root);
3630         for (i = 0; i < right_nritems; i++) {
3631                 item = btrfs_item_nr(right, i);
3632
3633                 push_space = push_space - btrfs_token_item_size(right,
3634                                                                 item, &token);
3635                 btrfs_set_token_item_offset(right, item, push_space, &token);
3636         }
3637
3638         btrfs_mark_buffer_dirty(left);
3639         if (right_nritems)
3640                 btrfs_mark_buffer_dirty(right);
3641         else
3642                 clean_tree_block(trans, root, right);
3643
3644         btrfs_item_key(right, &disk_key, 0);
3645         fixup_low_keys(trans, root, path, &disk_key, 1);
3646
3647         /* then fixup the leaf pointer in the path */
3648         if (path->slots[0] < push_items) {
3649                 path->slots[0] += old_left_nritems;
3650                 btrfs_tree_unlock(path->nodes[0]);
3651                 free_extent_buffer(path->nodes[0]);
3652                 path->nodes[0] = left;
3653                 path->slots[1] -= 1;
3654         } else {
3655                 btrfs_tree_unlock(left);
3656                 free_extent_buffer(left);
3657                 path->slots[0] -= push_items;
3658         }
3659         BUG_ON(path->slots[0] < 0);
3660         return ret;
3661 out:
3662         btrfs_tree_unlock(left);
3663         free_extent_buffer(left);
3664         return ret;
3665 }
3666
3667 /*
3668  * push some data in the path leaf to the left, trying to free up at
3669  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3670  *
3671  * max_slot can put a limit on how far into the leaf we'll push items.  The
3672  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3673  * items
3674  */
3675 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3676                           *root, struct btrfs_path *path, int min_data_size,
3677                           int data_size, int empty, u32 max_slot)
3678 {
3679         struct extent_buffer *right = path->nodes[0];
3680         struct extent_buffer *left;
3681         int slot;
3682         int free_space;
3683         u32 right_nritems;
3684         int ret = 0;
3685
3686         slot = path->slots[1];
3687         if (slot == 0)
3688                 return 1;
3689         if (!path->nodes[1])
3690                 return 1;
3691
3692         right_nritems = btrfs_header_nritems(right);
3693         if (right_nritems == 0)
3694                 return 1;
3695
3696         btrfs_assert_tree_locked(path->nodes[1]);
3697
3698         left = read_node_slot(root, path->nodes[1], slot - 1);
3699         if (left == NULL)
3700                 return 1;
3701
3702         btrfs_tree_lock(left);
3703         btrfs_set_lock_blocking(left);
3704
3705         free_space = btrfs_leaf_free_space(root, left);
3706         if (free_space < data_size) {
3707                 ret = 1;
3708                 goto out;
3709         }
3710
3711         /* cow and double check */
3712         ret = btrfs_cow_block(trans, root, left,
3713                               path->nodes[1], slot - 1, &left);
3714         if (ret) {
3715                 /* we hit -ENOSPC, but it isn't fatal here */
3716                 if (ret == -ENOSPC)
3717                         ret = 1;
3718                 goto out;
3719         }
3720
3721         free_space = btrfs_leaf_free_space(root, left);
3722         if (free_space < data_size) {
3723                 ret = 1;
3724                 goto out;
3725         }
3726
3727         return __push_leaf_left(trans, root, path, min_data_size,
3728                                empty, left, free_space, right_nritems,
3729                                max_slot);
3730 out:
3731         btrfs_tree_unlock(left);
3732         free_extent_buffer(left);
3733         return ret;
3734 }
3735
3736 /*
3737  * split the path's leaf in two, making sure there is at least data_size
3738  * available for the resulting leaf level of the path.
3739  */
3740 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3741                                     struct btrfs_root *root,
3742                                     struct btrfs_path *path,
3743                                     struct extent_buffer *l,
3744                                     struct extent_buffer *right,
3745                                     int slot, int mid, int nritems)
3746 {
3747         int data_copy_size;
3748         int rt_data_off;
3749         int i;
3750         struct btrfs_disk_key disk_key;
3751         struct btrfs_map_token token;
3752
3753         btrfs_init_map_token(&token);
3754
3755         nritems = nritems - mid;
3756         btrfs_set_header_nritems(right, nritems);
3757         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
3758
3759         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3760                            btrfs_item_nr_offset(mid),
3761                            nritems * sizeof(struct btrfs_item));
3762
3763         copy_extent_buffer(right, l,
3764                      btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
3765                      data_copy_size, btrfs_leaf_data(l) +
3766                      leaf_data_end(root, l), data_copy_size);
3767
3768         rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
3769                       btrfs_item_end_nr(l, mid);
3770
3771         for (i = 0; i < nritems; i++) {
3772                 struct btrfs_item *item = btrfs_item_nr(right, i);
3773                 u32 ioff;
3774
3775                 ioff = btrfs_token_item_offset(right, item, &token);
3776                 btrfs_set_token_item_offset(right, item,
3777                                             ioff + rt_data_off, &token);
3778         }
3779
3780         btrfs_set_header_nritems(l, mid);
3781         btrfs_item_key(right, &disk_key, 0);
3782         insert_ptr(trans, root, path, &disk_key, right->start,
3783                    path->slots[1] + 1, 1);
3784
3785         btrfs_mark_buffer_dirty(right);
3786         btrfs_mark_buffer_dirty(l);
3787         BUG_ON(path->slots[0] != slot);
3788
3789         if (mid <= slot) {
3790                 btrfs_tree_unlock(path->nodes[0]);
3791                 free_extent_buffer(path->nodes[0]);
3792                 path->nodes[0] = right;
3793                 path->slots[0] -= mid;
3794                 path->slots[1] += 1;
3795         } else {
3796                 btrfs_tree_unlock(right);
3797                 free_extent_buffer(right);
3798         }
3799
3800         BUG_ON(path->slots[0] < 0);
3801 }
3802
3803 /*
3804  * double splits happen when we need to insert a big item in the middle
3805  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3806  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3807  *          A                 B                 C
3808  *
3809  * We avoid this by trying to push the items on either side of our target
3810  * into the adjacent leaves.  If all goes well we can avoid the double split
3811  * completely.
3812  */
3813 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3814                                           struct btrfs_root *root,
3815                                           struct btrfs_path *path,
3816                                           int data_size)
3817 {
3818         int ret;
3819         int progress = 0;
3820         int slot;
3821         u32 nritems;
3822
3823         slot = path->slots[0];
3824
3825         /*
3826          * try to push all the items after our slot into the
3827          * right leaf
3828          */
3829         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
3830         if (ret < 0)
3831                 return ret;
3832
3833         if (ret == 0)
3834                 progress++;
3835
3836         nritems = btrfs_header_nritems(path->nodes[0]);
3837         /*
3838          * our goal is to get our slot at the start or end of a leaf.  If
3839          * we've done so we're done
3840          */
3841         if (path->slots[0] == 0 || path->slots[0] == nritems)
3842                 return 0;
3843
3844         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3845                 return 0;
3846
3847         /* try to push all the items before our slot into the next leaf */
3848         slot = path->slots[0];
3849         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
3850         if (ret < 0)
3851                 return ret;
3852
3853         if (ret == 0)
3854                 progress++;
3855
3856         if (progress)
3857                 return 0;
3858         return 1;
3859 }
3860
3861 /*
3862  * split the path's leaf in two, making sure there is at least data_size
3863  * available for the resulting leaf level of the path.
3864  *
3865  * returns 0 if all went well and < 0 on failure.
3866  */
3867 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3868                                struct btrfs_root *root,
3869                                struct btrfs_key *ins_key,
3870                                struct btrfs_path *path, int data_size,
3871                                int extend)
3872 {
3873         struct btrfs_disk_key disk_key;
3874         struct extent_buffer *l;
3875         u32 nritems;
3876         int mid;
3877         int slot;
3878         struct extent_buffer *right;
3879         int ret = 0;
3880         int wret;
3881         int split;
3882         int num_doubles = 0;
3883         int tried_avoid_double = 0;
3884
3885         l = path->nodes[0];
3886         slot = path->slots[0];
3887         if (extend && data_size + btrfs_item_size_nr(l, slot) +
3888             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
3889                 return -EOVERFLOW;
3890
3891         /* first try to make some room by pushing left and right */
3892         if (data_size) {
3893                 wret = push_leaf_right(trans, root, path, data_size,
3894                                        data_size, 0, 0);
3895                 if (wret < 0)
3896                         return wret;
3897                 if (wret) {
3898                         wret = push_leaf_left(trans, root, path, data_size,
3899                                               data_size, 0, (u32)-1);
3900                         if (wret < 0)
3901                                 return wret;
3902                 }
3903                 l = path->nodes[0];
3904
3905                 /* did the pushes work? */
3906                 if (btrfs_leaf_free_space(root, l) >= data_size)
3907                         return 0;
3908         }
3909
3910         if (!path->nodes[1]) {
3911                 ret = insert_new_root(trans, root, path, 1);
3912                 if (ret)
3913                         return ret;
3914         }
3915 again:
3916         split = 1;
3917         l = path->nodes[0];
3918         slot = path->slots[0];
3919         nritems = btrfs_header_nritems(l);
3920         mid = (nritems + 1) / 2;
3921
3922         if (mid <= slot) {
3923                 if (nritems == 1 ||
3924                     leaf_space_used(l, mid, nritems - mid) + data_size >
3925                         BTRFS_LEAF_DATA_SIZE(root)) {
3926                         if (slot >= nritems) {
3927                                 split = 0;
3928                         } else {
3929                                 mid = slot;
3930                                 if (mid != nritems &&
3931                                     leaf_space_used(l, mid, nritems - mid) +
3932                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3933                                         if (data_size && !tried_avoid_double)
3934                                                 goto push_for_double;
3935                                         split = 2;
3936                                 }
3937                         }
3938                 }
3939         } else {
3940                 if (leaf_space_used(l, 0, mid) + data_size >
3941                         BTRFS_LEAF_DATA_SIZE(root)) {
3942                         if (!extend && data_size && slot == 0) {
3943                                 split = 0;
3944                         } else if ((extend || !data_size) && slot == 0) {
3945                                 mid = 1;
3946                         } else {
3947                                 mid = slot;
3948                                 if (mid != nritems &&
3949                                     leaf_space_used(l, mid, nritems - mid) +
3950                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3951                                         if (data_size && !tried_avoid_double)
3952                                                 goto push_for_double;
3953                                         split = 2 ;
3954                                 }
3955                         }
3956                 }
3957         }
3958
3959         if (split == 0)
3960                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3961         else
3962                 btrfs_item_key(l, &disk_key, mid);
3963
3964         right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3965                                         root->root_key.objectid,
3966                                         &disk_key, 0, l->start, 0);
3967         if (IS_ERR(right))
3968                 return PTR_ERR(right);
3969
3970         root_add_used(root, root->leafsize);
3971
3972         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
3973         btrfs_set_header_bytenr(right, right->start);
3974         btrfs_set_header_generation(right, trans->transid);
3975         btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
3976         btrfs_set_header_owner(right, root->root_key.objectid);
3977         btrfs_set_header_level(right, 0);
3978         write_extent_buffer(right, root->fs_info->fsid,
3979                             (unsigned long)btrfs_header_fsid(right),
3980                             BTRFS_FSID_SIZE);
3981
3982         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
3983                             (unsigned long)btrfs_header_chunk_tree_uuid(right),
3984                             BTRFS_UUID_SIZE);
3985
3986         if (split == 0) {
3987                 if (mid <= slot) {
3988                         btrfs_set_header_nritems(right, 0);
3989                         insert_ptr(trans, root, path, &disk_key, right->start,
3990                                    path->slots[1] + 1, 1);
3991                         btrfs_tree_unlock(path->nodes[0]);
3992                         free_extent_buffer(path->nodes[0]);
3993                         path->nodes[0] = right;
3994                         path->slots[0] = 0;
3995                         path->slots[1] += 1;
3996                 } else {
3997                         btrfs_set_header_nritems(right, 0);
3998                         insert_ptr(trans, root, path, &disk_key, right->start,
3999                                           path->slots[1], 1);
4000                         btrfs_tree_unlock(path->nodes[0]);
4001                         free_extent_buffer(path->nodes[0]);
4002                         path->nodes[0] = right;
4003                         path->slots[0] = 0;
4004                         if (path->slots[1] == 0)
4005                                 fixup_low_keys(trans, root, path,
4006                                                &disk_key, 1);
4007                 }
4008                 btrfs_mark_buffer_dirty(right);
4009                 return ret;
4010         }
4011
4012         copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4013
4014         if (split == 2) {
4015                 BUG_ON(num_doubles != 0);
4016                 num_doubles++;
4017                 goto again;
4018         }
4019
4020         return 0;
4021
4022 push_for_double:
4023         push_for_double_split(trans, root, path, data_size);
4024         tried_avoid_double = 1;
4025         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4026                 return 0;
4027         goto again;
4028 }
4029
4030 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4031                                          struct btrfs_root *root,
4032                                          struct btrfs_path *path, int ins_len)
4033 {
4034         struct btrfs_key key;
4035         struct extent_buffer *leaf;
4036         struct btrfs_file_extent_item *fi;
4037         u64 extent_len = 0;
4038         u32 item_size;
4039         int ret;
4040
4041         leaf = path->nodes[0];
4042         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4043
4044         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4045                key.type != BTRFS_EXTENT_CSUM_KEY);
4046
4047         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4048                 return 0;
4049
4050         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4051         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4052                 fi = btrfs_item_ptr(leaf, path->slots[0],
4053                                     struct btrfs_file_extent_item);
4054                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4055         }
4056         btrfs_release_path(path);
4057
4058         path->keep_locks = 1;
4059         path->search_for_split = 1;
4060         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4061         path->search_for_split = 0;
4062         if (ret < 0)
4063                 goto err;
4064
4065         ret = -EAGAIN;
4066         leaf = path->nodes[0];
4067         /* if our item isn't there or got smaller, return now */
4068         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4069                 goto err;
4070
4071         /* the leaf has  changed, it now has room.  return now */
4072         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4073                 goto err;
4074
4075         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4076                 fi = btrfs_item_ptr(leaf, path->slots[0],
4077                                     struct btrfs_file_extent_item);
4078                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4079                         goto err;
4080         }
4081
4082         btrfs_set_path_blocking(path);
4083         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4084         if (ret)
4085                 goto err;
4086
4087         path->keep_locks = 0;
4088         btrfs_unlock_up_safe(path, 1);
4089         return 0;
4090 err:
4091         path->keep_locks = 0;
4092         return ret;
4093 }
4094
4095 static noinline int split_item(struct btrfs_trans_handle *trans,
4096                                struct btrfs_root *root,
4097                                struct btrfs_path *path,
4098                                struct btrfs_key *new_key,
4099                                unsigned long split_offset)
4100 {
4101         struct extent_buffer *leaf;
4102         struct btrfs_item *item;
4103         struct btrfs_item *new_item;
4104         int slot;
4105         char *buf;
4106         u32 nritems;
4107         u32 item_size;
4108         u32 orig_offset;
4109         struct btrfs_disk_key disk_key;
4110
4111         leaf = path->nodes[0];
4112         BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4113
4114         btrfs_set_path_blocking(path);
4115
4116         item = btrfs_item_nr(leaf, path->slots[0]);
4117         orig_offset = btrfs_item_offset(leaf, item);
4118         item_size = btrfs_item_size(leaf, item);
4119
4120         buf = kmalloc(item_size, GFP_NOFS);
4121         if (!buf)
4122                 return -ENOMEM;
4123
4124         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4125                             path->slots[0]), item_size);
4126
4127         slot = path->slots[0] + 1;
4128         nritems = btrfs_header_nritems(leaf);
4129         if (slot != nritems) {
4130                 /* shift the items */
4131                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4132                                 btrfs_item_nr_offset(slot),
4133                                 (nritems - slot) * sizeof(struct btrfs_item));
4134         }
4135
4136         btrfs_cpu_key_to_disk(&disk_key, new_key);
4137         btrfs_set_item_key(leaf, &disk_key, slot);
4138
4139         new_item = btrfs_item_nr(leaf, slot);
4140
4141         btrfs_set_item_offset(leaf, new_item, orig_offset);
4142         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4143
4144         btrfs_set_item_offset(leaf, item,
4145                               orig_offset + item_size - split_offset);
4146         btrfs_set_item_size(leaf, item, split_offset);
4147
4148         btrfs_set_header_nritems(leaf, nritems + 1);
4149
4150         /* write the data for the start of the original item */
4151         write_extent_buffer(leaf, buf,
4152                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4153                             split_offset);
4154
4155         /* write the data for the new item */
4156         write_extent_buffer(leaf, buf + split_offset,
4157                             btrfs_item_ptr_offset(leaf, slot),
4158                             item_size - split_offset);
4159         btrfs_mark_buffer_dirty(leaf);
4160
4161         BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4162         kfree(buf);
4163         return 0;
4164 }
4165
4166 /*
4167  * This function splits a single item into two items,
4168  * giving 'new_key' to the new item and splitting the
4169  * old one at split_offset (from the start of the item).
4170  *
4171  * The path may be released by this operation.  After
4172  * the split, the path is pointing to the old item.  The
4173  * new item is going to be in the same node as the old one.
4174  *
4175  * Note, the item being split must be smaller enough to live alone on
4176  * a tree block with room for one extra struct btrfs_item
4177  *
4178  * This allows us to split the item in place, keeping a lock on the
4179  * leaf the entire time.
4180  */
4181 int btrfs_split_item(struct btrfs_trans_handle *trans,
4182                      struct btrfs_root *root,
4183                      struct btrfs_path *path,
4184                      struct btrfs_key *new_key,
4185                      unsigned long split_offset)
4186 {
4187         int ret;
4188         ret = setup_leaf_for_split(trans, root, path,
4189                                    sizeof(struct btrfs_item));
4190         if (ret)
4191                 return ret;
4192
4193         ret = split_item(trans, root, path, new_key, split_offset);
4194         return ret;
4195 }
4196
4197 /*
4198  * This function duplicate a item, giving 'new_key' to the new item.
4199  * It guarantees both items live in the same tree leaf and the new item
4200  * is contiguous with the original item.
4201  *
4202  * This allows us to split file extent in place, keeping a lock on the
4203  * leaf the entire time.
4204  */
4205 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4206                          struct btrfs_root *root,
4207                          struct btrfs_path *path,
4208                          struct btrfs_key *new_key)
4209 {
4210         struct extent_buffer *leaf;
4211         int ret;
4212         u32 item_size;
4213
4214         leaf = path->nodes[0];
4215         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4216         ret = setup_leaf_for_split(trans, root, path,
4217                                    item_size + sizeof(struct btrfs_item));
4218         if (ret)
4219                 return ret;
4220
4221         path->slots[0]++;
4222         setup_items_for_insert(trans, root, path, new_key, &item_size,
4223                                item_size, item_size +
4224                                sizeof(struct btrfs_item), 1);
4225         leaf = path->nodes[0];
4226         memcpy_extent_buffer(leaf,
4227                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4228                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4229                              item_size);
4230         return 0;
4231 }
4232
4233 /*
4234  * make the item pointed to by the path smaller.  new_size indicates
4235  * how small to make it, and from_end tells us if we just chop bytes
4236  * off the end of the item or if we shift the item to chop bytes off
4237  * the front.
4238  */
4239 void btrfs_truncate_item(struct btrfs_trans_handle *trans,
4240                          struct btrfs_root *root,
4241                          struct btrfs_path *path,
4242                          u32 new_size, int from_end)
4243 {
4244         int slot;
4245         struct extent_buffer *leaf;
4246         struct btrfs_item *item;
4247         u32 nritems;
4248         unsigned int data_end;
4249         unsigned int old_data_start;
4250         unsigned int old_size;
4251         unsigned int size_diff;
4252         int i;
4253         struct btrfs_map_token token;
4254
4255         btrfs_init_map_token(&token);
4256
4257         leaf = path->nodes[0];
4258         slot = path->slots[0];
4259
4260         old_size = btrfs_item_size_nr(leaf, slot);
4261         if (old_size == new_size)
4262                 return;
4263
4264         nritems = btrfs_header_nritems(leaf);
4265         data_end = leaf_data_end(root, leaf);
4266
4267         old_data_start = btrfs_item_offset_nr(leaf, slot);
4268
4269         size_diff = old_size - new_size;
4270
4271         BUG_ON(slot < 0);
4272         BUG_ON(slot >= nritems);
4273
4274         /*
4275          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4276          */
4277         /* first correct the data pointers */
4278         for (i = slot; i < nritems; i++) {
4279                 u32 ioff;
4280                 item = btrfs_item_nr(leaf, i);
4281
4282                 ioff = btrfs_token_item_offset(leaf, item, &token);
4283                 btrfs_set_token_item_offset(leaf, item,
4284                                             ioff + size_diff, &token);
4285         }
4286
4287         /* shift the data */
4288         if (from_end) {
4289                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4290                               data_end + size_diff, btrfs_leaf_data(leaf) +
4291                               data_end, old_data_start + new_size - data_end);
4292         } else {
4293                 struct btrfs_disk_key disk_key;
4294                 u64 offset;
4295
4296                 btrfs_item_key(leaf, &disk_key, slot);
4297
4298                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4299                         unsigned long ptr;
4300                         struct btrfs_file_extent_item *fi;
4301
4302                         fi = btrfs_item_ptr(leaf, slot,
4303                                             struct btrfs_file_extent_item);
4304                         fi = (struct btrfs_file_extent_item *)(
4305                              (unsigned long)fi - size_diff);
4306
4307                         if (btrfs_file_extent_type(leaf, fi) ==
4308                             BTRFS_FILE_EXTENT_INLINE) {
4309                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4310                                 memmove_extent_buffer(leaf, ptr,
4311                                       (unsigned long)fi,
4312                                       offsetof(struct btrfs_file_extent_item,
4313                                                  disk_bytenr));
4314                         }
4315                 }
4316
4317                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4318                               data_end + size_diff, btrfs_leaf_data(leaf) +
4319                               data_end, old_data_start - data_end);
4320
4321                 offset = btrfs_disk_key_offset(&disk_key);
4322                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4323                 btrfs_set_item_key(leaf, &disk_key, slot);
4324                 if (slot == 0)
4325                         fixup_low_keys(trans, root, path, &disk_key, 1);
4326         }
4327
4328         item = btrfs_item_nr(leaf, slot);
4329         btrfs_set_item_size(leaf, item, new_size);
4330         btrfs_mark_buffer_dirty(leaf);
4331
4332         if (btrfs_leaf_free_space(root, leaf) < 0) {
4333                 btrfs_print_leaf(root, leaf);
4334                 BUG();
4335         }
4336 }
4337
4338 /*
4339  * make the item pointed to by the path bigger, data_size is the new size.
4340  */
4341 void btrfs_extend_item(struct btrfs_trans_handle *trans,
4342                        struct btrfs_root *root, struct btrfs_path *path,
4343                        u32 data_size)
4344 {
4345         int slot;
4346         struct extent_buffer *leaf;
4347         struct btrfs_item *item;
4348         u32 nritems;
4349         unsigned int data_end;
4350         unsigned int old_data;
4351         unsigned int old_size;
4352         int i;
4353         struct btrfs_map_token token;
4354
4355         btrfs_init_map_token(&token);
4356
4357         leaf = path->nodes[0];
4358
4359         nritems = btrfs_header_nritems(leaf);
4360         data_end = leaf_data_end(root, leaf);
4361
4362         if (btrfs_leaf_free_space(root, leaf) < data_size) {
4363                 btrfs_print_leaf(root, leaf);
4364                 BUG();
4365         }
4366         slot = path->slots[0];
4367         old_data = btrfs_item_end_nr(leaf, slot);
4368
4369         BUG_ON(slot < 0);
4370         if (slot >= nritems) {
4371                 btrfs_print_leaf(root, leaf);
4372                 printk(KERN_CRIT "slot %d too large, nritems %d\n",
4373                        slot, nritems);
4374                 BUG_ON(1);
4375         }
4376
4377         /*
4378          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4379          */
4380         /* first correct the data pointers */
4381         for (i = slot; i < nritems; i++) {
4382                 u32 ioff;
4383                 item = btrfs_item_nr(leaf, i);
4384
4385                 ioff = btrfs_token_item_offset(leaf, item, &token);
4386                 btrfs_set_token_item_offset(leaf, item,
4387                                             ioff - data_size, &token);
4388         }
4389
4390         /* shift the data */
4391         memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4392                       data_end - data_size, btrfs_leaf_data(leaf) +
4393                       data_end, old_data - data_end);
4394
4395         data_end = old_data;
4396         old_size = btrfs_item_size_nr(leaf, slot);
4397         item = btrfs_item_nr(leaf, slot);
4398         btrfs_set_item_size(leaf, item, old_size + data_size);
4399         btrfs_mark_buffer_dirty(leaf);
4400
4401         if (btrfs_leaf_free_space(root, leaf) < 0) {
4402                 btrfs_print_leaf(root, leaf);
4403                 BUG();
4404         }
4405 }
4406
4407 /*
4408  * Given a key and some data, insert items into the tree.
4409  * This does all the path init required, making room in the tree if needed.
4410  * Returns the number of keys that were inserted.
4411  */
4412 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
4413                             struct btrfs_root *root,
4414                             struct btrfs_path *path,
4415                             struct btrfs_key *cpu_key, u32 *data_size,
4416                             int nr)
4417 {
4418         struct extent_buffer *leaf;
4419         struct btrfs_item *item;
4420         int ret = 0;
4421         int slot;
4422         int i;
4423         u32 nritems;
4424         u32 total_data = 0;
4425         u32 total_size = 0;
4426         unsigned int data_end;
4427         struct btrfs_disk_key disk_key;
4428         struct btrfs_key found_key;
4429         struct btrfs_map_token token;
4430
4431         btrfs_init_map_token(&token);
4432
4433         for (i = 0; i < nr; i++) {
4434                 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
4435                     BTRFS_LEAF_DATA_SIZE(root)) {
4436                         break;
4437                         nr = i;
4438                 }
4439                 total_data += data_size[i];
4440                 total_size += data_size[i] + sizeof(struct btrfs_item);
4441         }
4442         BUG_ON(nr == 0);
4443
4444         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4445         if (ret == 0)
4446                 return -EEXIST;
4447         if (ret < 0)
4448                 goto out;
4449
4450         leaf = path->nodes[0];
4451
4452         nritems = btrfs_header_nritems(leaf);
4453         data_end = leaf_data_end(root, leaf);
4454
4455         if (btrfs_leaf_free_space(root, leaf) < total_size) {
4456                 for (i = nr; i >= 0; i--) {
4457                         total_data -= data_size[i];
4458                         total_size -= data_size[i] + sizeof(struct btrfs_item);
4459                         if (total_size < btrfs_leaf_free_space(root, leaf))
4460                                 break;
4461                 }
4462                 nr = i;
4463         }
4464
4465         slot = path->slots[0];
4466         BUG_ON(slot < 0);
4467
4468         if (slot != nritems) {
4469                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4470
4471                 item = btrfs_item_nr(leaf, slot);
4472                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4473
4474                 /* figure out how many keys we can insert in here */
4475                 total_data = data_size[0];
4476                 for (i = 1; i < nr; i++) {
4477                         if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
4478                                 break;
4479                         total_data += data_size[i];
4480                 }
4481                 nr = i;
4482
4483                 if (old_data < data_end) {
4484                         btrfs_print_leaf(root, leaf);
4485                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4486                                slot, old_data, data_end);
4487                         BUG_ON(1);
4488                 }
4489                 /*
4490                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4491                  */
4492                 /* first correct the data pointers */
4493                 for (i = slot; i < nritems; i++) {
4494                         u32 ioff;
4495
4496                         item = btrfs_item_nr(leaf, i);
4497                         ioff = btrfs_token_item_offset(leaf, item, &token);
4498                         btrfs_set_token_item_offset(leaf, item,
4499                                                     ioff - total_data, &token);
4500                 }
4501                 /* shift the items */
4502                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4503                               btrfs_item_nr_offset(slot),
4504                               (nritems - slot) * sizeof(struct btrfs_item));
4505
4506                 /* shift the data */
4507                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4508                               data_end - total_data, btrfs_leaf_data(leaf) +
4509                               data_end, old_data - data_end);
4510                 data_end = old_data;
4511         } else {
4512                 /*
4513                  * this sucks but it has to be done, if we are inserting at
4514                  * the end of the leaf only insert 1 of the items, since we
4515                  * have no way of knowing whats on the next leaf and we'd have
4516                  * to drop our current locks to figure it out
4517                  */
4518                 nr = 1;
4519         }
4520
4521         /* setup the item for the new data */
4522         for (i = 0; i < nr; i++) {
4523                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4524                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4525                 item = btrfs_item_nr(leaf, slot + i);
4526                 btrfs_set_token_item_offset(leaf, item,
4527                                             data_end - data_size[i], &token);
4528                 data_end -= data_size[i];
4529                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4530         }
4531         btrfs_set_header_nritems(leaf, nritems + nr);
4532         btrfs_mark_buffer_dirty(leaf);
4533
4534         ret = 0;
4535         if (slot == 0) {
4536                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4537                 fixup_low_keys(trans, root, path, &disk_key, 1);
4538         }
4539
4540         if (btrfs_leaf_free_space(root, leaf) < 0) {
4541                 btrfs_print_leaf(root, leaf);
4542                 BUG();
4543         }
4544 out:
4545         if (!ret)
4546                 ret = nr;
4547         return ret;
4548 }
4549
4550 /*
4551  * this is a helper for btrfs_insert_empty_items, the main goal here is
4552  * to save stack depth by doing the bulk of the work in a function
4553  * that doesn't call btrfs_search_slot
4554  */
4555 void setup_items_for_insert(struct btrfs_trans_handle *trans,
4556                             struct btrfs_root *root, struct btrfs_path *path,
4557                             struct btrfs_key *cpu_key, u32 *data_size,
4558                             u32 total_data, u32 total_size, int nr)
4559 {
4560         struct btrfs_item *item;
4561         int i;
4562         u32 nritems;
4563         unsigned int data_end;
4564         struct btrfs_disk_key disk_key;
4565         struct extent_buffer *leaf;
4566         int slot;
4567         struct btrfs_map_token token;
4568
4569         btrfs_init_map_token(&token);
4570
4571         leaf = path->nodes[0];
4572         slot = path->slots[0];
4573
4574         nritems = btrfs_header_nritems(leaf);
4575         data_end = leaf_data_end(root, leaf);
4576
4577         if (btrfs_leaf_free_space(root, leaf) < total_size) {
4578                 btrfs_print_leaf(root, leaf);
4579                 printk(KERN_CRIT "not enough freespace need %u have %d\n",
4580                        total_size, btrfs_leaf_free_space(root, leaf));
4581                 BUG();
4582         }
4583
4584         if (slot != nritems) {
4585                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4586
4587                 if (old_data < data_end) {
4588                         btrfs_print_leaf(root, leaf);
4589                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4590                                slot, old_data, data_end);
4591                         BUG_ON(1);
4592                 }
4593                 /*
4594                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4595                  */
4596                 /* first correct the data pointers */
4597                 for (i = slot; i < nritems; i++) {
4598                         u32 ioff;
4599
4600                         item = btrfs_item_nr(leaf, i);
4601                         ioff = btrfs_token_item_offset(leaf, item, &token);
4602                         btrfs_set_token_item_offset(leaf, item,
4603                                                     ioff - total_data, &token);
4604                 }
4605                 /* shift the items */
4606                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4607                               btrfs_item_nr_offset(slot),
4608                               (nritems - slot) * sizeof(struct btrfs_item));
4609
4610                 /* shift the data */
4611                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4612                               data_end - total_data, btrfs_leaf_data(leaf) +
4613                               data_end, old_data - data_end);
4614                 data_end = old_data;
4615         }
4616
4617         /* setup the item for the new data */
4618         for (i = 0; i < nr; i++) {
4619                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4620                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4621                 item = btrfs_item_nr(leaf, slot + i);
4622                 btrfs_set_token_item_offset(leaf, item,
4623                                             data_end - data_size[i], &token);
4624                 data_end -= data_size[i];
4625                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4626         }
4627
4628         btrfs_set_header_nritems(leaf, nritems + nr);
4629
4630         if (slot == 0) {
4631                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4632                 fixup_low_keys(trans, root, path, &disk_key, 1);
4633         }
4634         btrfs_unlock_up_safe(path, 1);
4635         btrfs_mark_buffer_dirty(leaf);
4636
4637         if (btrfs_leaf_free_space(root, leaf) < 0) {
4638                 btrfs_print_leaf(root, leaf);
4639                 BUG();
4640         }
4641 }
4642
4643 /*
4644  * Given a key and some data, insert items into the tree.
4645  * This does all the path init required, making room in the tree if needed.
4646  */
4647 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4648                             struct btrfs_root *root,
4649                             struct btrfs_path *path,
4650                             struct btrfs_key *cpu_key, u32 *data_size,
4651                             int nr)
4652 {
4653         int ret = 0;
4654         int slot;
4655         int i;
4656         u32 total_size = 0;
4657         u32 total_data = 0;
4658
4659         for (i = 0; i < nr; i++)
4660                 total_data += data_size[i];
4661
4662         total_size = total_data + (nr * sizeof(struct btrfs_item));
4663         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4664         if (ret == 0)
4665                 return -EEXIST;
4666         if (ret < 0)
4667                 return ret;
4668
4669         slot = path->slots[0];
4670         BUG_ON(slot < 0);
4671
4672         setup_items_for_insert(trans, root, path, cpu_key, data_size,
4673                                total_data, total_size, nr);
4674         return 0;
4675 }
4676
4677 /*
4678  * Given a key and some data, insert an item into the tree.
4679  * This does all the path init required, making room in the tree if needed.
4680  */
4681 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4682                       *root, struct btrfs_key *cpu_key, void *data, u32
4683                       data_size)
4684 {
4685         int ret = 0;
4686         struct btrfs_path *path;
4687         struct extent_buffer *leaf;
4688         unsigned long ptr;
4689
4690         path = btrfs_alloc_path();
4691         if (!path)
4692                 return -ENOMEM;
4693         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4694         if (!ret) {
4695                 leaf = path->nodes[0];
4696                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4697                 write_extent_buffer(leaf, data, ptr, data_size);
4698                 btrfs_mark_buffer_dirty(leaf);
4699         }
4700         btrfs_free_path(path);
4701         return ret;
4702 }
4703
4704 /*
4705  * delete the pointer from a given node.
4706  *
4707  * the tree should have been previously balanced so the deletion does not
4708  * empty a node.
4709  */
4710 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4711                     struct btrfs_path *path, int level, int slot,
4712                     int tree_mod_log)
4713 {
4714         struct extent_buffer *parent = path->nodes[level];
4715         u32 nritems;
4716         int ret;
4717
4718         nritems = btrfs_header_nritems(parent);
4719         if (slot != nritems - 1) {
4720                 if (tree_mod_log && level)
4721                         tree_mod_log_eb_move(root->fs_info, parent, slot,
4722                                              slot + 1, nritems - slot - 1);
4723                 memmove_extent_buffer(parent,
4724                               btrfs_node_key_ptr_offset(slot),
4725                               btrfs_node_key_ptr_offset(slot + 1),
4726                               sizeof(struct btrfs_key_ptr) *
4727                               (nritems - slot - 1));
4728         } else if (tree_mod_log && level) {
4729                 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4730                                               MOD_LOG_KEY_REMOVE);
4731                 BUG_ON(ret < 0);
4732         }
4733
4734         nritems--;
4735         btrfs_set_header_nritems(parent, nritems);
4736         if (nritems == 0 && parent == root->node) {
4737                 BUG_ON(btrfs_header_level(root->node) != 1);
4738                 /* just turn the root into a leaf and break */
4739                 btrfs_set_header_level(root->node, 0);
4740         } else if (slot == 0) {
4741                 struct btrfs_disk_key disk_key;
4742
4743                 btrfs_node_key(parent, &disk_key, 0);
4744                 fixup_low_keys(trans, root, path, &disk_key, level + 1);
4745         }
4746         btrfs_mark_buffer_dirty(parent);
4747 }
4748
4749 /*
4750  * a helper function to delete the leaf pointed to by path->slots[1] and
4751  * path->nodes[1].
4752  *
4753  * This deletes the pointer in path->nodes[1] and frees the leaf
4754  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4755  *
4756  * The path must have already been setup for deleting the leaf, including
4757  * all the proper balancing.  path->nodes[1] must be locked.
4758  */
4759 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4760                                     struct btrfs_root *root,
4761                                     struct btrfs_path *path,
4762                                     struct extent_buffer *leaf)
4763 {
4764         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4765         del_ptr(trans, root, path, 1, path->slots[1], 1);
4766
4767         /*
4768          * btrfs_free_extent is expensive, we want to make sure we
4769          * aren't holding any locks when we call it
4770          */
4771         btrfs_unlock_up_safe(path, 0);
4772
4773         root_sub_used(root, leaf->len);
4774
4775         extent_buffer_get(leaf);
4776         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4777         free_extent_buffer_stale(leaf);
4778 }
4779 /*
4780  * delete the item at the leaf level in path.  If that empties
4781  * the leaf, remove it from the tree
4782  */
4783 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4784                     struct btrfs_path *path, int slot, int nr)
4785 {
4786         struct extent_buffer *leaf;
4787         struct btrfs_item *item;
4788         int last_off;
4789         int dsize = 0;
4790         int ret = 0;
4791         int wret;
4792         int i;
4793         u32 nritems;
4794         struct btrfs_map_token token;
4795
4796         btrfs_init_map_token(&token);
4797
4798         leaf = path->nodes[0];
4799         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4800
4801         for (i = 0; i < nr; i++)
4802                 dsize += btrfs_item_size_nr(leaf, slot + i);
4803
4804         nritems = btrfs_header_nritems(leaf);
4805
4806         if (slot + nr != nritems) {
4807                 int data_end = leaf_data_end(root, leaf);
4808
4809                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4810                               data_end + dsize,
4811                               btrfs_leaf_data(leaf) + data_end,
4812                               last_off - data_end);
4813
4814                 for (i = slot + nr; i < nritems; i++) {
4815                         u32 ioff;
4816
4817                         item = btrfs_item_nr(leaf, i);
4818                         ioff = btrfs_token_item_offset(leaf, item, &token);
4819                         btrfs_set_token_item_offset(leaf, item,
4820                                                     ioff + dsize, &token);
4821                 }
4822
4823                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4824                               btrfs_item_nr_offset(slot + nr),
4825                               sizeof(struct btrfs_item) *
4826                               (nritems - slot - nr));
4827         }
4828         btrfs_set_header_nritems(leaf, nritems - nr);
4829         nritems -= nr;
4830
4831         /* delete the leaf if we've emptied it */
4832         if (nritems == 0) {
4833                 if (leaf == root->node) {
4834                         btrfs_set_header_level(leaf, 0);
4835                 } else {
4836                         btrfs_set_path_blocking(path);
4837                         clean_tree_block(trans, root, leaf);
4838                         btrfs_del_leaf(trans, root, path, leaf);
4839                 }
4840         } else {
4841                 int used = leaf_space_used(leaf, 0, nritems);
4842                 if (slot == 0) {
4843                         struct btrfs_disk_key disk_key;
4844
4845                         btrfs_item_key(leaf, &disk_key, 0);
4846                         fixup_low_keys(trans, root, path, &disk_key, 1);
4847                 }
4848
4849                 /* delete the leaf if it is mostly empty */
4850                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4851                         /* push_leaf_left fixes the path.
4852                          * make sure the path still points to our leaf
4853                          * for possible call to del_ptr below
4854                          */
4855                         slot = path->slots[1];
4856                         extent_buffer_get(leaf);
4857
4858                         btrfs_set_path_blocking(path);
4859                         wret = push_leaf_left(trans, root, path, 1, 1,
4860                                               1, (u32)-1);
4861                         if (wret < 0 && wret != -ENOSPC)
4862                                 ret = wret;
4863
4864                         if (path->nodes[0] == leaf &&
4865                             btrfs_header_nritems(leaf)) {
4866                                 wret = push_leaf_right(trans, root, path, 1,
4867                                                        1, 1, 0);
4868                                 if (wret < 0 && wret != -ENOSPC)
4869                                         ret = wret;
4870                         }
4871
4872                         if (btrfs_header_nritems(leaf) == 0) {
4873                                 path->slots[1] = slot;
4874                                 btrfs_del_leaf(trans, root, path, leaf);
4875                                 free_extent_buffer(leaf);
4876                                 ret = 0;
4877                         } else {
4878                                 /* if we're still in the path, make sure
4879                                  * we're dirty.  Otherwise, one of the
4880                                  * push_leaf functions must have already
4881                                  * dirtied this buffer
4882                                  */
4883                                 if (path->nodes[0] == leaf)
4884                                         btrfs_mark_buffer_dirty(leaf);
4885                                 free_extent_buffer(leaf);
4886                         }
4887                 } else {
4888                         btrfs_mark_buffer_dirty(leaf);
4889                 }
4890         }
4891         return ret;
4892 }
4893
4894 /*
4895  * search the tree again to find a leaf with lesser keys
4896  * returns 0 if it found something or 1 if there are no lesser leaves.
4897  * returns < 0 on io errors.
4898  *
4899  * This may release the path, and so you may lose any locks held at the
4900  * time you call it.
4901  */
4902 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4903 {
4904         struct btrfs_key key;
4905         struct btrfs_disk_key found_key;
4906         int ret;
4907
4908         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4909
4910         if (key.offset > 0)
4911                 key.offset--;
4912         else if (key.type > 0)
4913                 key.type--;
4914         else if (key.objectid > 0)
4915                 key.objectid--;
4916         else
4917                 return 1;
4918
4919         btrfs_release_path(path);
4920         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4921         if (ret < 0)
4922                 return ret;
4923         btrfs_item_key(path->nodes[0], &found_key, 0);
4924         ret = comp_keys(&found_key, &key);
4925         if (ret < 0)
4926                 return 0;
4927         return 1;
4928 }
4929
4930 /*
4931  * A helper function to walk down the tree starting at min_key, and looking
4932  * for nodes or leaves that are either in cache or have a minimum
4933  * transaction id.  This is used by the btree defrag code, and tree logging
4934  *
4935  * This does not cow, but it does stuff the starting key it finds back
4936  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4937  * key and get a writable path.
4938  *
4939  * This does lock as it descends, and path->keep_locks should be set
4940  * to 1 by the caller.
4941  *
4942  * This honors path->lowest_level to prevent descent past a given level
4943  * of the tree.
4944  *
4945  * min_trans indicates the oldest transaction that you are interested
4946  * in walking through.  Any nodes or leaves older than min_trans are
4947  * skipped over (without reading them).
4948  *
4949  * returns zero if something useful was found, < 0 on error and 1 if there
4950  * was nothing in the tree that matched the search criteria.
4951  */
4952 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4953                          struct btrfs_key *max_key,
4954                          struct btrfs_path *path, int cache_only,
4955                          u64 min_trans)
4956 {
4957         struct extent_buffer *cur;
4958         struct btrfs_key found_key;
4959         int slot;
4960         int sret;
4961         u32 nritems;
4962         int level;
4963         int ret = 1;
4964
4965         WARN_ON(!path->keep_locks);
4966 again:
4967         cur = btrfs_read_lock_root_node(root);
4968         level = btrfs_header_level(cur);
4969         WARN_ON(path->nodes[level]);
4970         path->nodes[level] = cur;
4971         path->locks[level] = BTRFS_READ_LOCK;
4972
4973         if (btrfs_header_generation(cur) < min_trans) {
4974                 ret = 1;
4975                 goto out;
4976         }
4977         while (1) {
4978                 nritems = btrfs_header_nritems(cur);
4979                 level = btrfs_header_level(cur);
4980                 sret = bin_search(cur, min_key, level, &slot);
4981
4982                 /* at the lowest level, we're done, setup the path and exit */
4983                 if (level == path->lowest_level) {
4984                         if (slot >= nritems)
4985                                 goto find_next_key;
4986                         ret = 0;
4987                         path->slots[level] = slot;
4988                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4989                         goto out;
4990                 }
4991                 if (sret && slot > 0)
4992                         slot--;
4993                 /*
4994                  * check this node pointer against the cache_only and
4995                  * min_trans parameters.  If it isn't in cache or is too
4996                  * old, skip to the next one.
4997                  */
4998                 while (slot < nritems) {
4999                         u64 blockptr;
5000                         u64 gen;
5001                         struct extent_buffer *tmp;
5002                         struct btrfs_disk_key disk_key;
5003
5004                         blockptr = btrfs_node_blockptr(cur, slot);
5005                         gen = btrfs_node_ptr_generation(cur, slot);
5006                         if (gen < min_trans) {
5007                                 slot++;
5008                                 continue;
5009                         }
5010                         if (!cache_only)
5011                                 break;
5012
5013                         if (max_key) {
5014                                 btrfs_node_key(cur, &disk_key, slot);
5015                                 if (comp_keys(&disk_key, max_key) >= 0) {
5016                                         ret = 1;
5017                                         goto out;
5018                                 }
5019                         }
5020
5021                         tmp = btrfs_find_tree_block(root, blockptr,
5022                                             btrfs_level_size(root, level - 1));
5023
5024                         if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
5025                                 free_extent_buffer(tmp);
5026                                 break;
5027                         }
5028                         if (tmp)
5029                                 free_extent_buffer(tmp);
5030                         slot++;
5031                 }
5032 find_next_key:
5033                 /*
5034                  * we didn't find a candidate key in this node, walk forward
5035                  * and find another one
5036                  */
5037                 if (slot >= nritems) {
5038                         path->slots[level] = slot;
5039                         btrfs_set_path_blocking(path);
5040                         sret = btrfs_find_next_key(root, path, min_key, level,
5041                                                   cache_only, min_trans);
5042                         if (sret == 0) {
5043                                 btrfs_release_path(path);
5044                                 goto again;
5045                         } else {
5046                                 goto out;
5047                         }
5048                 }
5049                 /* save our key for returning back */
5050                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5051                 path->slots[level] = slot;
5052                 if (level == path->lowest_level) {
5053                         ret = 0;
5054                         unlock_up(path, level, 1, 0, NULL);
5055                         goto out;
5056                 }
5057                 btrfs_set_path_blocking(path);
5058                 cur = read_node_slot(root, cur, slot);
5059                 BUG_ON(!cur); /* -ENOMEM */
5060
5061                 btrfs_tree_read_lock(cur);
5062
5063                 path->locks[level - 1] = BTRFS_READ_LOCK;
5064                 path->nodes[level - 1] = cur;
5065                 unlock_up(path, level, 1, 0, NULL);
5066                 btrfs_clear_path_blocking(path, NULL, 0);
5067         }
5068 out:
5069         if (ret == 0)
5070                 memcpy(min_key, &found_key, sizeof(found_key));
5071         btrfs_set_path_blocking(path);
5072         return ret;
5073 }
5074
5075 static void tree_move_down(struct btrfs_root *root,
5076                            struct btrfs_path *path,
5077                            int *level, int root_level)
5078 {
5079         path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5080                                         path->slots[*level]);
5081         path->slots[*level - 1] = 0;
5082         (*level)--;
5083 }
5084
5085 static int tree_move_next_or_upnext(struct btrfs_root *root,
5086                                     struct btrfs_path *path,
5087                                     int *level, int root_level)
5088 {
5089         int ret = 0;
5090         int nritems;
5091         nritems = btrfs_header_nritems(path->nodes[*level]);
5092
5093         path->slots[*level]++;
5094
5095         while (path->slots[*level] == nritems) {
5096                 if (*level == root_level)
5097                         return -1;
5098
5099                 /* move upnext */
5100                 path->slots[*level] = 0;
5101                 free_extent_buffer(path->nodes[*level]);
5102                 path->nodes[*level] = NULL;
5103                 (*level)++;
5104                 path->slots[*level]++;
5105
5106                 nritems = btrfs_header_nritems(path->nodes[*level]);
5107                 ret = 1;
5108         }
5109         return ret;
5110 }
5111
5112 /*
5113  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5114  * or down.
5115  */
5116 static int tree_advance(struct btrfs_root *root,
5117                         struct btrfs_path *path,
5118                         int *level, int root_level,
5119                         int allow_down,
5120                         struct btrfs_key *key)
5121 {
5122         int ret;
5123
5124         if (*level == 0 || !allow_down) {
5125                 ret = tree_move_next_or_upnext(root, path, level, root_level);
5126         } else {
5127                 tree_move_down(root, path, level, root_level);
5128                 ret = 0;
5129         }
5130         if (ret >= 0) {
5131                 if (*level == 0)
5132                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5133                                         path->slots[*level]);
5134                 else
5135                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5136                                         path->slots[*level]);
5137         }
5138         return ret;
5139 }
5140
5141 static int tree_compare_item(struct btrfs_root *left_root,
5142                              struct btrfs_path *left_path,
5143                              struct btrfs_path *right_path,
5144                              char *tmp_buf)
5145 {
5146         int cmp;
5147         int len1, len2;
5148         unsigned long off1, off2;
5149
5150         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5151         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5152         if (len1 != len2)
5153                 return 1;
5154
5155         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5156         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5157                                 right_path->slots[0]);
5158
5159         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5160
5161         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5162         if (cmp)
5163                 return 1;
5164         return 0;
5165 }
5166
5167 #define ADVANCE 1
5168 #define ADVANCE_ONLY_NEXT -1
5169
5170 /*
5171  * This function compares two trees and calls the provided callback for
5172  * every changed/new/deleted item it finds.
5173  * If shared tree blocks are encountered, whole subtrees are skipped, making
5174  * the compare pretty fast on snapshotted subvolumes.
5175  *
5176  * This currently works on commit roots only. As commit roots are read only,
5177  * we don't do any locking. The commit roots are protected with transactions.
5178  * Transactions are ended and rejoined when a commit is tried in between.
5179  *
5180  * This function checks for modifications done to the trees while comparing.
5181  * If it detects a change, it aborts immediately.
5182  */
5183 int btrfs_compare_trees(struct btrfs_root *left_root,
5184                         struct btrfs_root *right_root,
5185                         btrfs_changed_cb_t changed_cb, void *ctx)
5186 {
5187         int ret;
5188         int cmp;
5189         struct btrfs_trans_handle *trans = NULL;
5190         struct btrfs_path *left_path = NULL;
5191         struct btrfs_path *right_path = NULL;
5192         struct btrfs_key left_key;
5193         struct btrfs_key right_key;
5194         char *tmp_buf = NULL;
5195         int left_root_level;
5196         int right_root_level;
5197         int left_level;
5198         int right_level;
5199         int left_end_reached;
5200         int right_end_reached;
5201         int advance_left;
5202         int advance_right;
5203         u64 left_blockptr;
5204         u64 right_blockptr;
5205         u64 left_start_ctransid;
5206         u64 right_start_ctransid;
5207         u64 ctransid;
5208
5209         left_path = btrfs_alloc_path();
5210         if (!left_path) {
5211                 ret = -ENOMEM;
5212                 goto out;
5213         }
5214         right_path = btrfs_alloc_path();
5215         if (!right_path) {
5216                 ret = -ENOMEM;
5217                 goto out;
5218         }
5219
5220         tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5221         if (!tmp_buf) {
5222                 ret = -ENOMEM;
5223                 goto out;
5224         }
5225
5226         left_path->search_commit_root = 1;
5227         left_path->skip_locking = 1;
5228         right_path->search_commit_root = 1;
5229         right_path->skip_locking = 1;
5230
5231         spin_lock(&left_root->root_times_lock);
5232         left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5233         spin_unlock(&left_root->root_times_lock);
5234
5235         spin_lock(&right_root->root_times_lock);
5236         right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5237         spin_unlock(&right_root->root_times_lock);
5238
5239         trans = btrfs_join_transaction(left_root);
5240         if (IS_ERR(trans)) {
5241                 ret = PTR_ERR(trans);
5242                 trans = NULL;
5243                 goto out;
5244         }
5245
5246         /*
5247          * Strategy: Go to the first items of both trees. Then do
5248          *
5249          * If both trees are at level 0
5250          *   Compare keys of current items
5251          *     If left < right treat left item as new, advance left tree
5252          *       and repeat
5253          *     If left > right treat right item as deleted, advance right tree
5254          *       and repeat
5255          *     If left == right do deep compare of items, treat as changed if
5256          *       needed, advance both trees and repeat
5257          * If both trees are at the same level but not at level 0
5258          *   Compare keys of current nodes/leafs
5259          *     If left < right advance left tree and repeat
5260          *     If left > right advance right tree and repeat
5261          *     If left == right compare blockptrs of the next nodes/leafs
5262          *       If they match advance both trees but stay at the same level
5263          *         and repeat
5264          *       If they don't match advance both trees while allowing to go
5265          *         deeper and repeat
5266          * If tree levels are different
5267          *   Advance the tree that needs it and repeat
5268          *
5269          * Advancing a tree means:
5270          *   If we are at level 0, try to go to the next slot. If that's not
5271          *   possible, go one level up and repeat. Stop when we found a level
5272          *   where we could go to the next slot. We may at this point be on a
5273          *   node or a leaf.
5274          *
5275          *   If we are not at level 0 and not on shared tree blocks, go one
5276          *   level deeper.
5277          *
5278          *   If we are not at level 0 and on shared tree blocks, go one slot to
5279          *   the right if possible or go up and right.
5280          */
5281
5282         left_level = btrfs_header_level(left_root->commit_root);
5283         left_root_level = left_level;
5284         left_path->nodes[left_level] = left_root->commit_root;
5285         extent_buffer_get(left_path->nodes[left_level]);
5286
5287         right_level = btrfs_header_level(right_root->commit_root);
5288         right_root_level = right_level;
5289         right_path->nodes[right_level] = right_root->commit_root;
5290         extent_buffer_get(right_path->nodes[right_level]);
5291
5292         if (left_level == 0)
5293                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5294                                 &left_key, left_path->slots[left_level]);
5295         else
5296                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5297                                 &left_key, left_path->slots[left_level]);
5298         if (right_level == 0)
5299                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5300                                 &right_key, right_path->slots[right_level]);
5301         else
5302                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5303                                 &right_key, right_path->slots[right_level]);
5304
5305         left_end_reached = right_end_reached = 0;
5306         advance_left = advance_right = 0;
5307
5308         while (1) {
5309                 /*
5310                  * We need to make sure the transaction does not get committed
5311                  * while we do anything on commit roots. This means, we need to
5312                  * join and leave transactions for every item that we process.
5313                  */
5314                 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5315                         btrfs_release_path(left_path);
5316                         btrfs_release_path(right_path);
5317
5318                         ret = btrfs_end_transaction(trans, left_root);
5319                         trans = NULL;
5320                         if (ret < 0)
5321                                 goto out;
5322                 }
5323                 /* now rejoin the transaction */
5324                 if (!trans) {
5325                         trans = btrfs_join_transaction(left_root);
5326                         if (IS_ERR(trans)) {
5327                                 ret = PTR_ERR(trans);
5328                                 trans = NULL;
5329                                 goto out;
5330                         }
5331
5332                         spin_lock(&left_root->root_times_lock);
5333                         ctransid = btrfs_root_ctransid(&left_root->root_item);
5334                         spin_unlock(&left_root->root_times_lock);
5335                         if (ctransid != left_start_ctransid)
5336                                 left_start_ctransid = 0;
5337
5338                         spin_lock(&right_root->root_times_lock);
5339                         ctransid = btrfs_root_ctransid(&right_root->root_item);
5340                         spin_unlock(&right_root->root_times_lock);
5341                         if (ctransid != right_start_ctransid)
5342                                 right_start_ctransid = 0;
5343
5344                         if (!left_start_ctransid || !right_start_ctransid) {
5345                                 WARN(1, KERN_WARNING
5346                                         "btrfs: btrfs_compare_tree detected "
5347                                         "a change in one of the trees while "
5348                                         "iterating. This is probably a "
5349                                         "bug.\n");
5350                                 ret = -EIO;
5351                                 goto out;
5352                         }
5353
5354                         /*
5355                          * the commit root may have changed, so start again
5356                          * where we stopped
5357                          */
5358                         left_path->lowest_level = left_level;
5359                         right_path->lowest_level = right_level;
5360                         ret = btrfs_search_slot(NULL, left_root,
5361                                         &left_key, left_path, 0, 0);
5362                         if (ret < 0)
5363                                 goto out;
5364                         ret = btrfs_search_slot(NULL, right_root,
5365                                         &right_key, right_path, 0, 0);
5366                         if (ret < 0)
5367                                 goto out;
5368                 }
5369
5370                 if (advance_left && !left_end_reached) {
5371                         ret = tree_advance(left_root, left_path, &left_level,
5372                                         left_root_level,
5373                                         advance_left != ADVANCE_ONLY_NEXT,
5374                                         &left_key);
5375                         if (ret < 0)
5376                                 left_end_reached = ADVANCE;
5377                         advance_left = 0;
5378                 }
5379                 if (advance_right && !right_end_reached) {
5380                         ret = tree_advance(right_root, right_path, &right_level,
5381                                         right_root_level,
5382                                         advance_right != ADVANCE_ONLY_NEXT,
5383                                         &right_key);
5384                         if (ret < 0)
5385                                 right_end_reached = ADVANCE;
5386                         advance_right = 0;
5387                 }
5388
5389                 if (left_end_reached && right_end_reached) {
5390                         ret = 0;
5391                         goto out;
5392                 } else if (left_end_reached) {
5393                         if (right_level == 0) {
5394                                 ret = changed_cb(left_root, right_root,
5395                                                 left_path, right_path,
5396                                                 &right_key,
5397                                                 BTRFS_COMPARE_TREE_DELETED,
5398                                                 ctx);
5399                                 if (ret < 0)
5400                                         goto out;
5401                         }
5402                         advance_right = ADVANCE;
5403                         continue;
5404                 } else if (right_end_reached) {
5405                         if (left_level == 0) {
5406                                 ret = changed_cb(left_root, right_root,
5407                                                 left_path, right_path,
5408                                                 &left_key,
5409                                                 BTRFS_COMPARE_TREE_NEW,
5410                                                 ctx);
5411                                 if (ret < 0)
5412                                         goto out;
5413                         }
5414                         advance_left = ADVANCE;
5415                         continue;
5416                 }
5417
5418                 if (left_level == 0 && right_level == 0) {
5419                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5420                         if (cmp < 0) {
5421                                 ret = changed_cb(left_root, right_root,
5422                                                 left_path, right_path,
5423                                                 &left_key,
5424                                                 BTRFS_COMPARE_TREE_NEW,
5425                                                 ctx);
5426                                 if (ret < 0)
5427                                         goto out;
5428                                 advance_left = ADVANCE;
5429                         } else if (cmp > 0) {
5430                                 ret = changed_cb(left_root, right_root,
5431                                                 left_path, right_path,
5432                                                 &right_key,
5433                                                 BTRFS_COMPARE_TREE_DELETED,
5434                                                 ctx);
5435                                 if (ret < 0)
5436                                         goto out;
5437                                 advance_right = ADVANCE;
5438                         } else {
5439                                 ret = tree_compare_item(left_root, left_path,
5440                                                 right_path, tmp_buf);
5441                                 if (ret) {
5442                                         ret = changed_cb(left_root, right_root,
5443                                                 left_path, right_path,
5444                                                 &left_key,
5445                                                 BTRFS_COMPARE_TREE_CHANGED,
5446                                                 ctx);
5447                                         if (ret < 0)
5448                                                 goto out;
5449                                 }
5450                                 advance_left = ADVANCE;
5451                                 advance_right = ADVANCE;
5452                         }
5453                 } else if (left_level == right_level) {
5454                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5455                         if (cmp < 0) {
5456                                 advance_left = ADVANCE;
5457                         } else if (cmp > 0) {
5458                                 advance_right = ADVANCE;
5459                         } else {
5460                                 left_blockptr = btrfs_node_blockptr(
5461                                                 left_path->nodes[left_level],
5462                                                 left_path->slots[left_level]);
5463                                 right_blockptr = btrfs_node_blockptr(
5464                                                 right_path->nodes[right_level],
5465                                                 right_path->slots[right_level]);
5466                                 if (left_blockptr == right_blockptr) {
5467                                         /*
5468                                          * As we're on a shared block, don't
5469                                          * allow to go deeper.
5470                                          */
5471                                         advance_left = ADVANCE_ONLY_NEXT;
5472                                         advance_right = ADVANCE_ONLY_NEXT;
5473                                 } else {
5474                                         advance_left = ADVANCE;
5475                                         advance_right = ADVANCE;
5476                                 }
5477                         }
5478                 } else if (left_level < right_level) {
5479                         advance_right = ADVANCE;
5480                 } else {
5481                         advance_left = ADVANCE;
5482                 }
5483         }
5484
5485 out:
5486         btrfs_free_path(left_path);
5487         btrfs_free_path(right_path);
5488         kfree(tmp_buf);
5489
5490         if (trans) {
5491                 if (!ret)
5492                         ret = btrfs_end_transaction(trans, left_root);
5493                 else
5494                         btrfs_end_transaction(trans, left_root);
5495         }
5496
5497         return ret;
5498 }
5499
5500 /*
5501  * this is similar to btrfs_next_leaf, but does not try to preserve
5502  * and fixup the path.  It looks for and returns the next key in the
5503  * tree based on the current path and the cache_only and min_trans
5504  * parameters.
5505  *
5506  * 0 is returned if another key is found, < 0 if there are any errors
5507  * and 1 is returned if there are no higher keys in the tree
5508  *
5509  * path->keep_locks should be set to 1 on the search made before
5510  * calling this function.
5511  */
5512 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5513                         struct btrfs_key *key, int level,
5514                         int cache_only, u64 min_trans)
5515 {
5516         int slot;
5517         struct extent_buffer *c;
5518
5519         WARN_ON(!path->keep_locks);
5520         while (level < BTRFS_MAX_LEVEL) {
5521                 if (!path->nodes[level])
5522                         return 1;
5523
5524                 slot = path->slots[level] + 1;
5525                 c = path->nodes[level];
5526 next:
5527                 if (slot >= btrfs_header_nritems(c)) {
5528                         int ret;
5529                         int orig_lowest;
5530                         struct btrfs_key cur_key;
5531                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5532                             !path->nodes[level + 1])
5533                                 return 1;
5534
5535                         if (path->locks[level + 1]) {
5536                                 level++;
5537                                 continue;
5538                         }
5539
5540                         slot = btrfs_header_nritems(c) - 1;
5541                         if (level == 0)
5542                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5543                         else
5544                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5545
5546                         orig_lowest = path->lowest_level;
5547                         btrfs_release_path(path);
5548                         path->lowest_level = level;
5549                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5550                                                 0, 0);
5551                         path->lowest_level = orig_lowest;
5552                         if (ret < 0)
5553                                 return ret;
5554
5555                         c = path->nodes[level];
5556                         slot = path->slots[level];
5557                         if (ret == 0)
5558                                 slot++;
5559                         goto next;
5560                 }
5561
5562                 if (level == 0)
5563                         btrfs_item_key_to_cpu(c, key, slot);
5564                 else {
5565                         u64 blockptr = btrfs_node_blockptr(c, slot);
5566                         u64 gen = btrfs_node_ptr_generation(c, slot);
5567
5568                         if (cache_only) {
5569                                 struct extent_buffer *cur;
5570                                 cur = btrfs_find_tree_block(root, blockptr,
5571                                             btrfs_level_size(root, level - 1));
5572                                 if (!cur ||
5573                                     btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
5574                                         slot++;
5575                                         if (cur)
5576                                                 free_extent_buffer(cur);
5577                                         goto next;
5578                                 }
5579                                 free_extent_buffer(cur);
5580                         }
5581                         if (gen < min_trans) {
5582                                 slot++;
5583                                 goto next;
5584                         }
5585                         btrfs_node_key_to_cpu(c, key, slot);
5586                 }
5587                 return 0;
5588         }
5589         return 1;
5590 }
5591
5592 /*
5593  * search the tree again to find a leaf with greater keys
5594  * returns 0 if it found something or 1 if there are no greater leaves.
5595  * returns < 0 on io errors.
5596  */
5597 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5598 {
5599         return btrfs_next_old_leaf(root, path, 0);
5600 }
5601
5602 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5603                         u64 time_seq)
5604 {
5605         int slot;
5606         int level;
5607         struct extent_buffer *c;
5608         struct extent_buffer *next;
5609         struct btrfs_key key;
5610         u32 nritems;
5611         int ret;
5612         int old_spinning = path->leave_spinning;
5613         int next_rw_lock = 0;
5614
5615         nritems = btrfs_header_nritems(path->nodes[0]);
5616         if (nritems == 0)
5617                 return 1;
5618
5619         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5620 again:
5621         level = 1;
5622         next = NULL;
5623         next_rw_lock = 0;
5624         btrfs_release_path(path);
5625
5626         path->keep_locks = 1;
5627         path->leave_spinning = 1;
5628
5629         if (time_seq)
5630                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5631         else
5632                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5633         path->keep_locks = 0;
5634
5635         if (ret < 0)
5636                 return ret;
5637
5638         nritems = btrfs_header_nritems(path->nodes[0]);
5639         /*
5640          * by releasing the path above we dropped all our locks.  A balance
5641          * could have added more items next to the key that used to be
5642          * at the very end of the block.  So, check again here and
5643          * advance the path if there are now more items available.
5644          */
5645         if (nritems > 0 && path->slots[0] < nritems - 1) {
5646                 if (ret == 0)
5647                         path->slots[0]++;
5648                 ret = 0;
5649                 goto done;
5650         }
5651
5652         while (level < BTRFS_MAX_LEVEL) {
5653                 if (!path->nodes[level]) {
5654                         ret = 1;
5655                         goto done;
5656                 }
5657
5658                 slot = path->slots[level] + 1;
5659                 c = path->nodes[level];
5660                 if (slot >= btrfs_header_nritems(c)) {
5661                         level++;
5662                         if (level == BTRFS_MAX_LEVEL) {
5663                                 ret = 1;
5664                                 goto done;
5665                         }
5666                         continue;
5667                 }
5668
5669                 if (next) {
5670                         btrfs_tree_unlock_rw(next, next_rw_lock);
5671                         free_extent_buffer(next);
5672                 }
5673
5674                 next = c;
5675                 next_rw_lock = path->locks[level];
5676                 ret = read_block_for_search(NULL, root, path, &next, level,
5677                                             slot, &key, 0);
5678                 if (ret == -EAGAIN)
5679                         goto again;
5680
5681                 if (ret < 0) {
5682                         btrfs_release_path(path);
5683                         goto done;
5684                 }
5685
5686                 if (!path->skip_locking) {
5687                         ret = btrfs_try_tree_read_lock(next);
5688                         if (!ret && time_seq) {
5689                                 /*
5690                                  * If we don't get the lock, we may be racing
5691                                  * with push_leaf_left, holding that lock while
5692                                  * itself waiting for the leaf we've currently
5693                                  * locked. To solve this situation, we give up
5694                                  * on our lock and cycle.
5695                                  */
5696                                 free_extent_buffer(next);
5697                                 btrfs_release_path(path);
5698                                 cond_resched();
5699                                 goto again;
5700                         }
5701                         if (!ret) {
5702                                 btrfs_set_path_blocking(path);
5703                                 btrfs_tree_read_lock(next);
5704                                 btrfs_clear_path_blocking(path, next,
5705                                                           BTRFS_READ_LOCK);
5706                         }
5707                         next_rw_lock = BTRFS_READ_LOCK;
5708                 }
5709                 break;
5710         }
5711         path->slots[level] = slot;
5712         while (1) {
5713                 level--;
5714                 c = path->nodes[level];
5715                 if (path->locks[level])
5716                         btrfs_tree_unlock_rw(c, path->locks[level]);
5717
5718                 free_extent_buffer(c);
5719                 path->nodes[level] = next;
5720                 path->slots[level] = 0;
5721                 if (!path->skip_locking)
5722                         path->locks[level] = next_rw_lock;
5723                 if (!level)
5724                         break;
5725
5726                 ret = read_block_for_search(NULL, root, path, &next, level,
5727                                             0, &key, 0);
5728                 if (ret == -EAGAIN)
5729                         goto again;
5730
5731                 if (ret < 0) {
5732                         btrfs_release_path(path);
5733                         goto done;
5734                 }
5735
5736                 if (!path->skip_locking) {
5737                         ret = btrfs_try_tree_read_lock(next);
5738                         if (!ret) {
5739                                 btrfs_set_path_blocking(path);
5740                                 btrfs_tree_read_lock(next);
5741                                 btrfs_clear_path_blocking(path, next,
5742                                                           BTRFS_READ_LOCK);
5743                         }
5744                         next_rw_lock = BTRFS_READ_LOCK;
5745                 }
5746         }
5747         ret = 0;
5748 done:
5749         unlock_up(path, 0, 1, 0, NULL);
5750         path->leave_spinning = old_spinning;
5751         if (!old_spinning)
5752                 btrfs_set_path_blocking(path);
5753
5754         return ret;
5755 }
5756
5757 /*
5758  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5759  * searching until it gets past min_objectid or finds an item of 'type'
5760  *
5761  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5762  */
5763 int btrfs_previous_item(struct btrfs_root *root,
5764                         struct btrfs_path *path, u64 min_objectid,
5765                         int type)
5766 {
5767         struct btrfs_key found_key;
5768         struct extent_buffer *leaf;
5769         u32 nritems;
5770         int ret;
5771
5772         while (1) {
5773                 if (path->slots[0] == 0) {
5774                         btrfs_set_path_blocking(path);
5775                         ret = btrfs_prev_leaf(root, path);
5776                         if (ret != 0)
5777                                 return ret;
5778                 } else {
5779                         path->slots[0]--;
5780                 }
5781                 leaf = path->nodes[0];
5782                 nritems = btrfs_header_nritems(leaf);
5783                 if (nritems == 0)
5784                         return 1;
5785                 if (path->slots[0] == nritems)
5786                         path->slots[0]--;
5787
5788                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5789                 if (found_key.objectid < min_objectid)
5790                         break;
5791                 if (found_key.type == type)
5792                         return 0;
5793                 if (found_key.objectid == min_objectid &&
5794                     found_key.type < type)
5795                         break;
5796         }
5797         return 1;
5798 }