/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
        __dm_written_to_disk(src)
{
        memcpy(dest, src, len);
        __dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
                         unsigned index, void *elt)
        __dm_written_to_disk(elt)
{
        if (index < nr_elts)
                memmove(base + (elt_size * (index + 1)),
                        base + (elt_size * index),
                        (nr_elts - index) * elt_size);

        memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
        int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

        while (hi - lo > 1) {
                int mid = lo + ((hi - lo) / 2);
                uint64_t mid_key = le64_to_cpu(n->keys[mid]);

                if (mid_key == key)
                        return mid;

                if (mid_key < key)
                        lo = mid;
                else
                        hi = mid;
        }

        return want_hi ? hi : lo;
}

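/*
 * lower_bound() returns the index of the largest key that is less than
 * or equal to the given key, or -1 if every key in the node is larger.
 * For example, with keys {10, 20, 30}, looking up 25 returns index 1
 * and looking up 5 returns -1.
 */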
int lower_bound(struct btree_node *n, uint64_t key)
{
        return bsearch(n, key, 0);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
                  struct dm_btree_value_type *vt)
{
        unsigned i;
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

        if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
                for (i = 0; i < nr_entries; i++)
                        dm_tm_inc(tm, value64(n, i));
        else if (vt->inc)
                for (i = 0; i < nr_entries; i++)
                        vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
                      uint64_t key, void *value)
                      __dm_written_to_disk(value)
{
        uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
        __le64 key_le = cpu_to_le64(key);

        if (index > nr_entries ||
            index >= le32_to_cpu(node->header.max_entries)) {
                DMERR("too many entries in btree node for insert");
                __dm_unbless_for_disk(value);
                return -ENOMEM;
        }

        __dm_bless_for_disk(&key_le);

        array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
        array_insert(value_base(node), value_size, nr_entries, index, value);
        node->header.nr_entries = cpu_to_le32(nr_entries + 1);

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
        uint32_t total, n;
        size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

        block_size -= sizeof(struct node_header);
        total = block_size / elt_size;
        n = total / 3;          /* rounds down */

        return 3 * n;
}
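
/*
 * Worked example (illustrative figures; assumes a 32 byte node_header):
 * with a 4096 byte block and an 8 byte value, elt_size = 16,
 * total = 4064 / 16 = 254, n = 254 / 3 = 84, so max_entries = 252.
 */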

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
        int r;
        struct dm_block *b;
        struct btree_node *n;
        size_t block_size;
        uint32_t max_entries;

        r = new_block(info, &b);
        if (r < 0)
                return r;

        block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
        max_entries = calc_max_entries(info->value_type.size, block_size);

        n = dm_block_data(b);
        memset(n, 0, block_size);
        n->header.flags = cpu_to_le32(LEAF_NODE);
        n->header.nr_entries = cpu_to_le32(0);
        n->header.max_entries = cpu_to_le32(max_entries);
        n->header.value_size = cpu_to_le32(info->value_type.size);

        *root = dm_block_location(b);
        unlock_block(info, b);

        return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm.  Since we have limited stack
 * space, we explicitly manage our own stack of frames on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
        struct dm_block *b;
        struct btree_node *n;
        unsigned level;
        unsigned nr_children;
        unsigned current_child;
};

struct del_stack {
        struct dm_btree_info *info;
        struct dm_transaction_manager *tm;
        int top;
        struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
        if (s->top < 0) {
                DMERR("btree deletion stack empty");
                return -EINVAL;
        }

        *f = s->spine + s->top;

        return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
        return s->top >= 0;
}

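/*
 * Issue reads for all of a frame's children up front, so the I/O
 * overlaps with processing of the current node during deletion.
 */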
static void prefetch_children(struct del_stack *s, struct frame *f)
{
        unsigned i;
        struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

        for (i = 0; i < f->nr_children; i++)
                dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
        return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
        int r;
        uint32_t ref_count;

        if (s->top >= MAX_SPINE_DEPTH - 1) {
                DMERR("btree deletion stack out of memory");
                return -ENOMEM;
        }

        r = dm_tm_ref(s->tm, b, &ref_count);
        if (r)
                return r;

        if (ref_count > 1)
                /*
                 * This is a shared node, so we can just decrement its
                 * reference counter and leave the children.
                 */
                dm_tm_dec(s->tm, b);

        else {
                uint32_t flags;
                struct frame *f = s->spine + ++s->top;

                r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
                if (r) {
                        s->top--;
                        return r;
                }

                f->n = dm_block_data(f->b);
                f->level = level;
                f->nr_children = le32_to_cpu(f->n->header.nr_entries);
                f->current_child = 0;

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
                        prefetch_children(s, f);
        }

        return 0;
}

static void pop_frame(struct del_stack *s)
{
        struct frame *f = s->spine + s->top--;

        dm_tm_dec(s->tm, dm_block_location(f->b));
        dm_tm_unlock(s->tm, f->b);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
        int r;
        struct del_stack *s;

        s = kmalloc(sizeof(*s), GFP_NOIO);
        if (!s)
                return -ENOMEM;
        s->info = info;
        s->tm = info->tm;
        s->top = -1;

        r = push_frame(s, root, 0);
        if (r)
                goto out;

        while (unprocessed_frames(s)) {
                uint32_t flags;
                struct frame *f;
                dm_block_t b;

                r = top_frame(s, &f);
                if (r)
                        goto out;

                if (f->current_child >= f->nr_children) {
                        pop_frame(s);
                        continue;
                }

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level);
                        if (r)
                                goto out;

                } else if (is_internal_level(info, f)) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level + 1);
                        if (r)
                                goto out;

                } else {
                        if (info->value_type.dec) {
                                unsigned i;

                                for (i = 0; i < f->nr_children; i++)
                                        info->value_type.dec(info->value_type.context,
                                                             value_ptr(f->n, i));
                        }
                        pop_frame(s);
                }
        }

out:
        kfree(s);
        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
                            int (*search_fn)(struct btree_node *, uint64_t),
                            uint64_t *result_key, void *v, size_t value_size)
{
        int i, r;
        uint32_t flags, nr_entries;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                i = search_fn(ro_node(s), key);

                flags = le32_to_cpu(ro_node(s)->header.flags);
                nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (i < 0 || i >= nr_entries)
                        return -ENODATA;

                if (flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (!(flags & LEAF_NODE));

        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
        memcpy(v, value_ptr(ro_node(s), i), value_size);

        return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value_le)
{
        unsigned level, last_level = info->levels - 1;
        int r = -ENODATA;
        uint64_t rkey;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                size_t size;
                void *value_p;

                if (level == last_level) {
                        value_p = value_le;
                        size = info->value_type.size;

                } else {
                        value_p = &internal_value_le;
                        size = sizeof(uint64_t);
                }

                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, &rkey,
                                     value_p, size);

                if (!r) {
                        if (rkey != keys[level]) {
                                exit_ro_spine(&spine);
                                return -ENODATA;
                        }
                } else {
                        exit_ro_spine(&spine);
                        return r;
                }

                root = le64_to_cpu(internal_value_le);
        }
        exit_ro_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
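
/*
 * Illustrative caller sketch (not taken from this file; 'info' and
 * 'root' are assumed to be set up elsewhere): looking up key 123 in a
 * single level tree whose values are __le64s.
 *
 *	uint64_t keys[1] = { 123 };
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_lookup(info, root, keys, &value_le);
 *	if (r == -ENODATA)
 *		... key not present ...
 */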

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *           |
 *           v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |     |
 *                v     +------+
 *          +---------+        |
 *          | A* +++  |        v
 *          +---------+   +-------+
 *                        | B +++ |
 *                        +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
                               uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *parent;
        struct btree_node *ln, *rn, *pn;
        __le64 location;

        left = shadow_current(s);

        r = new_block(s->info, &right);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
        nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

        ln->header.nr_entries = cpu_to_le32(nr_left);

        rn->header.flags = ln->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = ln->header.max_entries;
        rn->header.value_size = ln->header.value_size;
        memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

        size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
                sizeof(uint64_t) : s->info->value_type.size;
        memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
               size * nr_right);

        /*
         * Patch up the parent
         */
        parent = shadow_parent(s);

        pn = dm_block_data(parent);
        location = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&location);
        memcpy_disk(value_ptr(pn, parent_index),
                    &location, sizeof(__le64));

        location = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&location);

        r = insert_at(sizeof(__le64), pn, parent_index + 1,
                      le64_to_cpu(rn->keys[0]), &location);
        if (r)
                return r;

        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }

        return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |   |
 *   +------+   +----+
 *   |               |
 *   v               v
 * +-------+     +-------+
 * | B +++ |     | C +++ |
 * +-------+     +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *new_parent;
        struct btree_node *pn, *ln, *rn;
        __le64 val;

        new_parent = shadow_current(s);

        r = new_block(s->info, &left);
        if (r < 0)
                return r;

        r = new_block(s->info, &right);
        if (r < 0) {
                unlock_block(s->info, left);
                return r;
        }

        pn = dm_block_data(new_parent);
        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
        nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

        ln->header.flags = pn->header.flags;
        ln->header.nr_entries = cpu_to_le32(nr_left);
        ln->header.max_entries = pn->header.max_entries;
        ln->header.value_size = pn->header.value_size;

        rn->header.flags = pn->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = pn->header.max_entries;
        rn->header.value_size = pn->header.value_size;

        memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
        memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

        size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
                sizeof(__le64) : s->info->value_type.size;
        memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
        memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
               nr_right * size);

        /* new_parent should just point to l and r now */
        pn->header.flags = cpu_to_le32(INTERNAL_NODE);
        pn->header.nr_entries = cpu_to_le32(2);
        pn->header.max_entries = cpu_to_le32(
                calc_max_entries(sizeof(__le64),
                                 dm_bm_block_size(
                                         dm_tm_get_bm(s->info->tm))));
        pn->header.value_size = cpu_to_le32(sizeof(__le64));

        val = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&val);
        pn->keys[0] = ln->keys[0];
        memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

        val = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&val);
        pn->keys[1] = rn->keys[0];
        memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

        /*
         * Rejig the spine.  This is ugly, since it knows too
         * much about the spine.
         */
        if (s->nodes[0] != new_parent) {
                unlock_block(s->info, s->nodes[0]);
                s->nodes[0] = new_parent;
        }
        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }
        s->count = 2;

        return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
                            struct dm_btree_value_type *vt,
                            uint64_t key, unsigned *index)
{
        int r, i = *index, top = 1;
        struct btree_node *node;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        return r;

                node = dm_block_data(shadow_current(s));

                /*
                 * We have to patch up the parent node.  It's ugly, but I
                 * don't see a way to do this automatically as part of the
                 * spine op.
                 */
                if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        __dm_bless_for_disk(&location);
                        memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
                                    &location, sizeof(__le64));
                }

                node = dm_block_data(shadow_current(s));

                if (node->header.nr_entries == node->header.max_entries) {
                        if (top)
                                r = btree_split_beneath(s, key);
                        else
                                r = btree_split_sibling(s, i, key);

                        if (r < 0)
                                return r;
                }

                node = dm_block_data(shadow_current(s));

                i = lower_bound(node, key);

                if (le32_to_cpu(node->header.flags) & LEAF_NODE)
                        break;

                if (i < 0) {
                        /* change the bounds on the lowest key */
                        node->keys[0] = cpu_to_le64(key);
                        i = 0;
                }

                root = value64(node, i);
                top = 0;
        }

        if (i < 0 || le64_to_cpu(node->keys[i]) != key)
                i++;

        *index = i;
        return 0;
}
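
/*
 * On success *index refers either to the entry that already contains
 * the key, or to the slot where a new entry should be inserted; callers
 * compare node->keys[index] with the key to distinguish the two cases.
 */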

static int insert(struct dm_btree_info *info, dm_block_t root,
                  uint64_t *keys, void *value, dm_block_t *new_root,
                  int *inserted)
                  __dm_written_to_disk(value)
{
        int r, need_insert;
        unsigned level, index = -1, last_level = info->levels - 1;
        dm_block_t block = root;
        struct shadow_spine spine;
        struct btree_node *n;
        struct dm_btree_value_type le64_type;

        init_le64_type(info->tm, &le64_type);
        init_shadow_spine(&spine, info);

        for (level = 0; level < (info->levels - 1); level++) {
                r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
                if (r < 0)
                        goto bad;

                n = dm_block_data(shadow_current(&spine));
                need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                               (le64_to_cpu(n->keys[index]) != keys[level]));

                if (need_insert) {
                        dm_block_t new_tree;
                        __le64 new_le;

                        r = dm_btree_empty(info, &new_tree);
                        if (r < 0)
                                goto bad;

                        new_le = cpu_to_le64(new_tree);
                        __dm_bless_for_disk(&new_le);

                        r = insert_at(sizeof(uint64_t), n, index,
                                      keys[level], &new_le);
                        if (r)
                                goto bad;
                }

                if (level < last_level)
                        block = value64(n, index);
        }

        r = btree_insert_raw(&spine, block, &info->value_type,
                             keys[level], &index);
        if (r < 0)
                goto bad;

        n = dm_block_data(shadow_current(&spine));
        need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                       (le64_to_cpu(n->keys[index]) != keys[level]));

        if (need_insert) {
                if (inserted)
                        *inserted = 1;

                r = insert_at(info->value_type.size, n, index,
                              keys[level], value);
                if (r)
                        goto bad_unblessed;
        } else {
                if (inserted)
                        *inserted = 0;

                if (info->value_type.dec &&
                    (!info->value_type.equal ||
                     !info->value_type.equal(
                             info->value_type.context,
                             value_ptr(n, index),
                             value))) {
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index));
                }
                memcpy_disk(value_ptr(n, index),
                            value, info->value_type.size);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return 0;

bad:
        __dm_unbless_for_disk(value);
bad_unblessed:
        exit_shadow_spine(&spine);
        return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value, dm_block_t *new_root)
                    __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *keys, void *value, dm_block_t *new_root,
                           int *inserted)
                           __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
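
/*
 * Illustrative caller sketch (hypothetical values; same assumptions as
 * the lookup example above): inserting the value 456 under key 123.
 * Because dm_btree_insert() is annotated __dm_written_to_disk(value),
 * the caller blesses the value before passing it in.
 *
 *	uint64_t keys[1] = { 123 };
 *	__le64 value_le = cpu_to_le64(456);
 *	dm_block_t new_root;
 *	int r;
 *
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert(info, root, keys, &value_le, &new_root);
 */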

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
                    uint64_t *result_key, dm_block_t *next_block)
{
        int i, r;
        uint32_t flags;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                flags = le32_to_cpu(ro_node(s)->header.flags);
                i = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (!i)
                        return -ENODATA;
                else
                        i--;

                if (find_highest)
                        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
                else
                        *result_key = le64_to_cpu(ro_node(s)->keys[0]);

                if (next_block || flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (flags & INTERNAL_NODE);

        if (next_block)
                *next_block = block;
        return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
                             bool find_highest, uint64_t *result_keys)
{
        int r = 0, count = 0, level;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = find_key(&spine, root, find_highest, result_keys + level,
                             level == info->levels - 1 ? NULL : &root);
                if (r == -ENODATA) {
                        r = 0;
                        break;

                } else if (r)
                        break;

                count++;
        }
        exit_ro_spine(&spine);

        return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
                              uint64_t *result_keys)
{
        return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
                             uint64_t *result_keys)
{
        return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also, this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
                     int (*fn)(void *context, uint64_t *keys, void *leaf),
                     void *context)
{
        int r;
        unsigned i, nr;
        struct dm_block *node;
        struct btree_node *n;
        uint64_t keys;

        r = bn_read_lock(info, block, &node);
        if (r)
                return r;

        n = dm_block_data(node);

        nr = le32_to_cpu(n->header.nr_entries);
        for (i = 0; i < nr; i++) {
                if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
                        r = walk_node(info, value64(n, i), fn, context);
                        if (r)
                                goto out;
                } else {
                        keys = le64_to_cpu(*key_ptr(n, i));
                        r = fn(context, &keys, value_ptr(n, i));
                        if (r)
                                goto out;
                }
        }

out:
        dm_tm_unlock(info->tm, node);
        return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
                  int (*fn)(void *context, uint64_t *keys, void *leaf),
                  void *context)
{
        BUG_ON(info->levels > 1);
        return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
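
/*
 * Illustrative callback sketch (hypothetical name count_leaf; not from
 * a real caller): counting the leaves visited by dm_btree_walk().
 *
 *	static int count_leaf(void *context, uint64_t *keys, void *leaf)
 *	{
 *		(*(unsigned *) context)++;
 *		return 0;
 *	}
 *
 *	unsigned nr = 0;
 *	int r = dm_btree_walk(info, root, count_leaf, &nr);
 */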