/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}
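
/*
 * For example, given a node whose keys are {10, 20, 30}, lower_bound()
 * returns the index of the largest key that is <= the search key:
 *
 *	lower_bound(n, 25) == 1    (key 20)
 *	lower_bound(n, 10) == 0    (exact match)
 *	lower_bound(n, 5)  == -1   (below every key in the node)
 */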

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		      uint64_t key, void *value)
		      __dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for
 * repeated insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}
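
/*
 * Worked example (numbers are hypothetical): with a 4096 byte block,
 * 8 byte values and a 32 byte node header, elt_size is 16, so
 * (4096 - 32) / 16 = 254 entries fit; 254 / 3 rounds down to n = 84,
 * giving max_entries = 252.
 */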

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
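
/*
 * Minimal usage sketch (hypothetical caller, not part of this file):
 *
 *	dm_block_t root;
 *	int r = dm_btree_empty(info, &root);
 *	if (r)
 *		return r;
 *
 * On success, root names a fresh leaf with zero entries; callers
 * typically record it in their superblock.
 */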

/*----------------------------------------------------------------*/

/*
 * Deletion is conceptually recursive, but since kernel stack space is
 * limited we manage an explicit stack on the heap instead.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference count and leave the children alone.
		 */
		dm_tm_dec(s->tm, b);

	else {
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			f->current_child = f->nr_children;
		}
	}

out:
	kfree(s);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
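
/*
 * Note that thanks to the reference counting in push_frame() above,
 * deleting a tree that shares subtrees with another tree (e.g. after a
 * copy-on-write clone) only walks and frees the blocks this tree owns
 * exclusively; shared nodes just have their reference counts dropped.
 */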

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
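
/*
 * Usage sketch (hypothetical caller): looking up one key in a single
 * level tree whose values are __le64 block numbers.
 *
 *	uint64_t key = 123;
 *	__le64 value_le;
 *
 *	r = dm_btree_lookup(info, root, &key, &value_le);
 *	if (r == -ENODATA)
 *		... key is not in the tree ...
 *
 * For a tree with info->levels == N, keys must point to an array of N
 * keys, one per level; the value is copied out in little-endian form.
 */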

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and that it has
 * room for another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *           |
 *           v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |     |
 *                v     +------+
 *          +---------+        |
 *          | A* +++  |        v
 *          +---------+   +-------+
 *                        | B +++ |
 *                        +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
			       unsigned parent_index, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r)
		return r;

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |   |
 *   +------+   +----+
 *   |               |
 *   v               v
 * +-------+     +-------+
 * | B +++ |     | C +++ |
 * +-------+     +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		/* FIXME: put left */
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * Rejig the spine.  This is ugly, since it knows too much
	 * about the spine's internals.
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node.  It's ugly, but I
		 * don't see a way to do this automatically as part of the
		 * spine op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, root, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	le64_type.context = NULL;
	le64_type.size = sizeof(__le64);
	le64_type.inc = NULL;
	le64_type.dec = NULL;
	le64_type.equal = NULL;

	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
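
/*
 * Usage sketch (hypothetical caller; key and data_block are assumed to
 * exist): the tree is copy-on-write, so every insert yields a new root
 * that must replace the old one.
 *
 *	__le64 value_le = cpu_to_le64(data_block);
 *
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert(info, root, &key, &value_le, &root);
 *
 * dm_btree_insert_notify() additionally reports via *inserted whether
 * the key was newly added (1) or an existing entry was overwritten (0).
 */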

/*----------------------------------------------------------------*/

static int find_highest_key(struct ro_spine *s, dm_block_t block,
			    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_highest_key(&spine, root, result_keys + level,
				     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
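
/*
 * For example, with a single level tree this fills in result_keys[0]
 * with the largest key present and returns 1; an empty tree returns 0
 * without touching result_keys.
 */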

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct ro_spine *s, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct btree_node *n;
	uint64_t keys;

	r = ro_step(s, block);
	if (r)
		return r;
	n = ro_node(s);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(s, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	ro_pop(s);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	int r;
	struct ro_spine spine;

	BUG_ON(info->levels > 1);

	init_ro_spine(&spine, info);
	r = walk_node(&spine, root, fn, context);
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
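
/*
 * Usage sketch (hypothetical callback, not part of this file): count
 * every entry in a single level tree.
 *
 *	static int count_fn(void *context, uint64_t *keys, void *leaf)
 *	{
 *		unsigned *count = context;
 *
 *		(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *	r = dm_btree_walk(info, root, count_fn, &count);
 *
 * A non-zero return from the callback aborts the walk and is passed
 * back to the caller.
 */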