]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - fs/xfs/xfs_btree.c
Merge branch 'serge-next-1' of git://git.kernel.org/pub/scm/linux/kernel/git/sergeh...
[karo-tx-linux.git] / fs / xfs / xfs_btree.c
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_sb.h"
26 #include "xfs_ag.h"
27 #include "xfs_mount.h"
28 #include "xfs_inode.h"
29 #include "xfs_trans.h"
30 #include "xfs_inode_item.h"
31 #include "xfs_buf_item.h"
32 #include "xfs_btree.h"
33 #include "xfs_error.h"
34 #include "xfs_trace.h"
35 #include "xfs_cksum.h"
36
37 /*
38  * Cursor allocation zone.
39  */
40 kmem_zone_t     *xfs_btree_cur_zone;
41
42 /*
43  * Btree magic numbers.
44  */
45 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
46         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC },
47         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
48           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC }
49 };
50 #define xfs_btree_magic(cur) \
51         xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
52
53
54 STATIC int                              /* error (0 or EFSCORRUPTED) */
55 xfs_btree_check_lblock(
56         struct xfs_btree_cur    *cur,   /* btree cursor */
57         struct xfs_btree_block  *block, /* btree long form block pointer */
58         int                     level,  /* level of the btree block */
59         struct xfs_buf          *bp)    /* buffer for block, if any */
60 {
61         int                     lblock_ok = 1; /* block passes checks */
62         struct xfs_mount        *mp;    /* file system mount point */
63
64         mp = cur->bc_mp;
65
66         if (xfs_sb_version_hascrc(&mp->m_sb)) {
67                 lblock_ok = lblock_ok &&
68                         uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
69                         block->bb_u.l.bb_blkno == cpu_to_be64(
70                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
71         }
72
73         lblock_ok = lblock_ok &&
74                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
75                 be16_to_cpu(block->bb_level) == level &&
76                 be16_to_cpu(block->bb_numrecs) <=
77                         cur->bc_ops->get_maxrecs(cur, level) &&
78                 block->bb_u.l.bb_leftsib &&
79                 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
80                  XFS_FSB_SANITY_CHECK(mp,
81                         be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
82                 block->bb_u.l.bb_rightsib &&
83                 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
84                  XFS_FSB_SANITY_CHECK(mp,
85                         be64_to_cpu(block->bb_u.l.bb_rightsib)));
86
87         if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
88                         XFS_ERRTAG_BTREE_CHECK_LBLOCK,
89                         XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
90                 if (bp)
91                         trace_xfs_btree_corrupt(bp, _RET_IP_);
92                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
93                 return XFS_ERROR(EFSCORRUPTED);
94         }
95         return 0;
96 }
97
98 STATIC int                              /* error (0 or EFSCORRUPTED) */
99 xfs_btree_check_sblock(
100         struct xfs_btree_cur    *cur,   /* btree cursor */
101         struct xfs_btree_block  *block, /* btree short form block pointer */
102         int                     level,  /* level of the btree block */
103         struct xfs_buf          *bp)    /* buffer containing block */
104 {
105         struct xfs_mount        *mp;    /* file system mount point */
106         struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
107         struct xfs_agf          *agf;   /* ag. freespace structure */
108         xfs_agblock_t           agflen; /* native ag. freespace length */
109         int                     sblock_ok = 1; /* block passes checks */
110
111         mp = cur->bc_mp;
112         agbp = cur->bc_private.a.agbp;
113         agf = XFS_BUF_TO_AGF(agbp);
114         agflen = be32_to_cpu(agf->agf_length);
115
116         if (xfs_sb_version_hascrc(&mp->m_sb)) {
117                 sblock_ok = sblock_ok &&
118                         uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
119                         block->bb_u.s.bb_blkno == cpu_to_be64(
120                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
121         }
122
123         sblock_ok = sblock_ok &&
124                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
125                 be16_to_cpu(block->bb_level) == level &&
126                 be16_to_cpu(block->bb_numrecs) <=
127                         cur->bc_ops->get_maxrecs(cur, level) &&
128                 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
129                  be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
130                 block->bb_u.s.bb_leftsib &&
131                 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
132                  be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
133                 block->bb_u.s.bb_rightsib;
134
135         if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
136                         XFS_ERRTAG_BTREE_CHECK_SBLOCK,
137                         XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
138                 if (bp)
139                         trace_xfs_btree_corrupt(bp, _RET_IP_);
140                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
141                 return XFS_ERROR(EFSCORRUPTED);
142         }
143         return 0;
144 }
145
146 /*
147  * Debug routine: check that block header is ok.
148  */
149 int
150 xfs_btree_check_block(
151         struct xfs_btree_cur    *cur,   /* btree cursor */
152         struct xfs_btree_block  *block, /* generic btree block pointer */
153         int                     level,  /* level of the btree block */
154         struct xfs_buf          *bp)    /* buffer containing block, if any */
155 {
156         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
157                 return xfs_btree_check_lblock(cur, block, level, bp);
158         else
159                 return xfs_btree_check_sblock(cur, block, level, bp);
160 }
161
162 /*
163  * Check that (long) pointer is ok.
164  */
165 int                                     /* error (0 or EFSCORRUPTED) */
166 xfs_btree_check_lptr(
167         struct xfs_btree_cur    *cur,   /* btree cursor */
168         xfs_dfsbno_t            bno,    /* btree block disk address */
169         int                     level)  /* btree block level */
170 {
171         XFS_WANT_CORRUPTED_RETURN(
172                 level > 0 &&
173                 bno != NULLDFSBNO &&
174                 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
175         return 0;
176 }
177
178 #ifdef DEBUG
179 /*
180  * Check that (short) pointer is ok.
181  */
182 STATIC int                              /* error (0 or EFSCORRUPTED) */
183 xfs_btree_check_sptr(
184         struct xfs_btree_cur    *cur,   /* btree cursor */
185         xfs_agblock_t           bno,    /* btree block disk address */
186         int                     level)  /* btree block level */
187 {
188         xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
189
190         XFS_WANT_CORRUPTED_RETURN(
191                 level > 0 &&
192                 bno != NULLAGBLOCK &&
193                 bno != 0 &&
194                 bno < agblocks);
195         return 0;
196 }
197
198 /*
199  * Check that block ptr is ok.
200  */
201 STATIC int                              /* error (0 or EFSCORRUPTED) */
202 xfs_btree_check_ptr(
203         struct xfs_btree_cur    *cur,   /* btree cursor */
204         union xfs_btree_ptr     *ptr,   /* btree block disk address */
205         int                     index,  /* offset from ptr to check */
206         int                     level)  /* btree block level */
207 {
208         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
209                 return xfs_btree_check_lptr(cur,
210                                 be64_to_cpu((&ptr->l)[index]), level);
211         } else {
212                 return xfs_btree_check_sptr(cur,
213                                 be32_to_cpu((&ptr->s)[index]), level);
214         }
215 }
216 #endif
217
218 /*
219  * Calculate CRC on the whole btree block and stuff it into the
220  * long-form btree header.
221  *
222  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
223  * it into the buffer so recovery knows what the last modifcation was that made
224  * it to disk.
225  */
226 void
227 xfs_btree_lblock_calc_crc(
228         struct xfs_buf          *bp)
229 {
230         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
231         struct xfs_buf_log_item *bip = bp->b_fspriv;
232
233         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
234                 return;
235         if (bip)
236                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
237         xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
238 }
239
240 bool
241 xfs_btree_lblock_verify_crc(
242         struct xfs_buf          *bp)
243 {
244         if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
245                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
246
247         return true;
248 }
249
250 /*
251  * Calculate CRC on the whole btree block and stuff it into the
252  * short-form btree header.
253  *
254  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
255  * it into the buffer so recovery knows what the last modifcation was that made
256  * it to disk.
257  */
258 void
259 xfs_btree_sblock_calc_crc(
260         struct xfs_buf          *bp)
261 {
262         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
263         struct xfs_buf_log_item *bip = bp->b_fspriv;
264
265         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
266                 return;
267         if (bip)
268                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
269         xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
270 }
271
272 bool
273 xfs_btree_sblock_verify_crc(
274         struct xfs_buf          *bp)
275 {
276         if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
277                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
278
279         return true;
280 }
281
282 /*
283  * Delete the btree cursor.
284  */
285 void
286 xfs_btree_del_cursor(
287         xfs_btree_cur_t *cur,           /* btree cursor */
288         int             error)          /* del because of error */
289 {
290         int             i;              /* btree level */
291
292         /*
293          * Clear the buffer pointers, and release the buffers.
294          * If we're doing this in the face of an error, we
295          * need to make sure to inspect all of the entries
296          * in the bc_bufs array for buffers to be unlocked.
297          * This is because some of the btree code works from
298          * level n down to 0, and if we get an error along
299          * the way we won't have initialized all the entries
300          * down to 0.
301          */
302         for (i = 0; i < cur->bc_nlevels; i++) {
303                 if (cur->bc_bufs[i])
304                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
305                 else if (!error)
306                         break;
307         }
308         /*
309          * Can't free a bmap cursor without having dealt with the
310          * allocated indirect blocks' accounting.
311          */
312         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
313                cur->bc_private.b.allocated == 0);
314         /*
315          * Free the cursor.
316          */
317         kmem_zone_free(xfs_btree_cur_zone, cur);
318 }
319
320 /*
321  * Duplicate the btree cursor.
322  * Allocate a new one, copy the record, re-get the buffers.
323  */
324 int                                     /* error */
325 xfs_btree_dup_cursor(
326         xfs_btree_cur_t *cur,           /* input cursor */
327         xfs_btree_cur_t **ncur)         /* output cursor */
328 {
329         xfs_buf_t       *bp;            /* btree block's buffer pointer */
330         int             error;          /* error return value */
331         int             i;              /* level number of btree block */
332         xfs_mount_t     *mp;            /* mount structure for filesystem */
333         xfs_btree_cur_t *new;           /* new cursor value */
334         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
335
336         tp = cur->bc_tp;
337         mp = cur->bc_mp;
338
339         /*
340          * Allocate a new cursor like the old one.
341          */
342         new = cur->bc_ops->dup_cursor(cur);
343
344         /*
345          * Copy the record currently in the cursor.
346          */
347         new->bc_rec = cur->bc_rec;
348
349         /*
350          * For each level current, re-get the buffer and copy the ptr value.
351          */
352         for (i = 0; i < new->bc_nlevels; i++) {
353                 new->bc_ptrs[i] = cur->bc_ptrs[i];
354                 new->bc_ra[i] = cur->bc_ra[i];
355                 bp = cur->bc_bufs[i];
356                 if (bp) {
357                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
358                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
359                                                    0, &bp,
360                                                    cur->bc_ops->buf_ops);
361                         if (error) {
362                                 xfs_btree_del_cursor(new, error);
363                                 *ncur = NULL;
364                                 return error;
365                         }
366                 }
367                 new->bc_bufs[i] = bp;
368         }
369         *ncur = new;
370         return 0;
371 }
372
373 /*
374  * XFS btree block layout and addressing:
375  *
376  * There are two types of blocks in the btree: leaf and non-leaf blocks.
377  *
378  * The leaf record start with a header then followed by records containing
379  * the values.  A non-leaf block also starts with the same header, and
380  * then first contains lookup keys followed by an equal number of pointers
381  * to the btree blocks at the previous level.
382  *
383  *              +--------+-------+-------+-------+-------+-------+-------+
384  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
385  *              +--------+-------+-------+-------+-------+-------+-------+
386  *
387  *              +--------+-------+-------+-------+-------+-------+-------+
388  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
389  *              +--------+-------+-------+-------+-------+-------+-------+
390  *
391  * The header is called struct xfs_btree_block for reasons better left unknown
392  * and comes in different versions for short (32bit) and long (64bit) block
393  * pointers.  The record and key structures are defined by the btree instances
394  * and opaque to the btree core.  The block pointers are simple disk endian
395  * integers, available in a short (32bit) and long (64bit) variant.
396  *
397  * The helpers below calculate the offset of a given record, key or pointer
398  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
399  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
400  * inside the btree block is done using indices starting at one, not zero!
401  */
402
403 /*
404  * Return size of the btree block header for this btree instance.
405  */
406 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
407 {
408         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
409                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
410                         return XFS_BTREE_LBLOCK_CRC_LEN;
411                 return XFS_BTREE_LBLOCK_LEN;
412         }
413         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
414                 return XFS_BTREE_SBLOCK_CRC_LEN;
415         return XFS_BTREE_SBLOCK_LEN;
416 }
417
418 /*
419  * Return size of btree block pointers for this btree instance.
420  */
421 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
422 {
423         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
424                 sizeof(__be64) : sizeof(__be32);
425 }
426
427 /*
428  * Calculate offset of the n-th record in a btree block.
429  */
430 STATIC size_t
431 xfs_btree_rec_offset(
432         struct xfs_btree_cur    *cur,
433         int                     n)
434 {
435         return xfs_btree_block_len(cur) +
436                 (n - 1) * cur->bc_ops->rec_len;
437 }
438
439 /*
440  * Calculate offset of the n-th key in a btree block.
441  */
442 STATIC size_t
443 xfs_btree_key_offset(
444         struct xfs_btree_cur    *cur,
445         int                     n)
446 {
447         return xfs_btree_block_len(cur) +
448                 (n - 1) * cur->bc_ops->key_len;
449 }
450
451 /*
452  * Calculate offset of the n-th block pointer in a btree block.
453  */
454 STATIC size_t
455 xfs_btree_ptr_offset(
456         struct xfs_btree_cur    *cur,
457         int                     n,
458         int                     level)
459 {
460         return xfs_btree_block_len(cur) +
461                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
462                 (n - 1) * xfs_btree_ptr_len(cur);
463 }
464
465 /*
466  * Return a pointer to the n-th record in the btree block.
467  */
468 STATIC union xfs_btree_rec *
469 xfs_btree_rec_addr(
470         struct xfs_btree_cur    *cur,
471         int                     n,
472         struct xfs_btree_block  *block)
473 {
474         return (union xfs_btree_rec *)
475                 ((char *)block + xfs_btree_rec_offset(cur, n));
476 }
477
478 /*
479  * Return a pointer to the n-th key in the btree block.
480  */
481 STATIC union xfs_btree_key *
482 xfs_btree_key_addr(
483         struct xfs_btree_cur    *cur,
484         int                     n,
485         struct xfs_btree_block  *block)
486 {
487         return (union xfs_btree_key *)
488                 ((char *)block + xfs_btree_key_offset(cur, n));
489 }
490
491 /*
492  * Return a pointer to the n-th block pointer in the btree block.
493  */
494 STATIC union xfs_btree_ptr *
495 xfs_btree_ptr_addr(
496         struct xfs_btree_cur    *cur,
497         int                     n,
498         struct xfs_btree_block  *block)
499 {
500         int                     level = xfs_btree_get_level(block);
501
502         ASSERT(block->bb_level != 0);
503
504         return (union xfs_btree_ptr *)
505                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
506 }
507
508 /*
509  * Get the root block which is stored in the inode.
510  *
511  * For now this btree implementation assumes the btree root is always
512  * stored in the if_broot field of an inode fork.
513  */
514 STATIC struct xfs_btree_block *
515 xfs_btree_get_iroot(
516        struct xfs_btree_cur    *cur)
517 {
518        struct xfs_ifork        *ifp;
519
520        ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
521        return (struct xfs_btree_block *)ifp->if_broot;
522 }
523
524 /*
525  * Retrieve the block pointer from the cursor at the given level.
526  * This may be an inode btree root or from a buffer.
527  */
528 STATIC struct xfs_btree_block *         /* generic btree block pointer */
529 xfs_btree_get_block(
530         struct xfs_btree_cur    *cur,   /* btree cursor */
531         int                     level,  /* level in btree */
532         struct xfs_buf          **bpp)  /* buffer containing the block */
533 {
534         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
535             (level == cur->bc_nlevels - 1)) {
536                 *bpp = NULL;
537                 return xfs_btree_get_iroot(cur);
538         }
539
540         *bpp = cur->bc_bufs[level];
541         return XFS_BUF_TO_BLOCK(*bpp);
542 }
543
544 /*
545  * Get a buffer for the block, return it with no data read.
546  * Long-form addressing.
547  */
548 xfs_buf_t *                             /* buffer for fsbno */
549 xfs_btree_get_bufl(
550         xfs_mount_t     *mp,            /* file system mount point */
551         xfs_trans_t     *tp,            /* transaction pointer */
552         xfs_fsblock_t   fsbno,          /* file system block number */
553         uint            lock)           /* lock flags for get_buf */
554 {
555         xfs_buf_t       *bp;            /* buffer pointer (return value) */
556         xfs_daddr_t             d;              /* real disk block address */
557
558         ASSERT(fsbno != NULLFSBLOCK);
559         d = XFS_FSB_TO_DADDR(mp, fsbno);
560         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
561         ASSERT(!xfs_buf_geterror(bp));
562         return bp;
563 }
564
565 /*
566  * Get a buffer for the block, return it with no data read.
567  * Short-form addressing.
568  */
569 xfs_buf_t *                             /* buffer for agno/agbno */
570 xfs_btree_get_bufs(
571         xfs_mount_t     *mp,            /* file system mount point */
572         xfs_trans_t     *tp,            /* transaction pointer */
573         xfs_agnumber_t  agno,           /* allocation group number */
574         xfs_agblock_t   agbno,          /* allocation group block number */
575         uint            lock)           /* lock flags for get_buf */
576 {
577         xfs_buf_t       *bp;            /* buffer pointer (return value) */
578         xfs_daddr_t             d;              /* real disk block address */
579
580         ASSERT(agno != NULLAGNUMBER);
581         ASSERT(agbno != NULLAGBLOCK);
582         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
583         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
584         ASSERT(!xfs_buf_geterror(bp));
585         return bp;
586 }
587
588 /*
589  * Check for the cursor referring to the last block at the given level.
590  */
591 int                                     /* 1=is last block, 0=not last block */
592 xfs_btree_islastblock(
593         xfs_btree_cur_t         *cur,   /* btree cursor */
594         int                     level)  /* level to check */
595 {
596         struct xfs_btree_block  *block; /* generic btree block pointer */
597         xfs_buf_t               *bp;    /* buffer containing block */
598
599         block = xfs_btree_get_block(cur, level, &bp);
600         xfs_btree_check_block(cur, block, level, bp);
601         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
602                 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
603         else
604                 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
605 }
606
607 /*
608  * Change the cursor to point to the first record at the given level.
609  * Other levels are unaffected.
610  */
611 STATIC int                              /* success=1, failure=0 */
612 xfs_btree_firstrec(
613         xfs_btree_cur_t         *cur,   /* btree cursor */
614         int                     level)  /* level to change */
615 {
616         struct xfs_btree_block  *block; /* generic btree block pointer */
617         xfs_buf_t               *bp;    /* buffer containing block */
618
619         /*
620          * Get the block pointer for this level.
621          */
622         block = xfs_btree_get_block(cur, level, &bp);
623         xfs_btree_check_block(cur, block, level, bp);
624         /*
625          * It's empty, there is no such record.
626          */
627         if (!block->bb_numrecs)
628                 return 0;
629         /*
630          * Set the ptr value to 1, that's the first record/key.
631          */
632         cur->bc_ptrs[level] = 1;
633         return 1;
634 }
635
636 /*
637  * Change the cursor to point to the last record in the current block
638  * at the given level.  Other levels are unaffected.
639  */
640 STATIC int                              /* success=1, failure=0 */
641 xfs_btree_lastrec(
642         xfs_btree_cur_t         *cur,   /* btree cursor */
643         int                     level)  /* level to change */
644 {
645         struct xfs_btree_block  *block; /* generic btree block pointer */
646         xfs_buf_t               *bp;    /* buffer containing block */
647
648         /*
649          * Get the block pointer for this level.
650          */
651         block = xfs_btree_get_block(cur, level, &bp);
652         xfs_btree_check_block(cur, block, level, bp);
653         /*
654          * It's empty, there is no such record.
655          */
656         if (!block->bb_numrecs)
657                 return 0;
658         /*
659          * Set the ptr value to numrecs, that's the last record/key.
660          */
661         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
662         return 1;
663 }
664
665 /*
666  * Compute first and last byte offsets for the fields given.
667  * Interprets the offsets table, which contains struct field offsets.
668  */
669 void
670 xfs_btree_offsets(
671         __int64_t       fields,         /* bitmask of fields */
672         const short     *offsets,       /* table of field offsets */
673         int             nbits,          /* number of bits to inspect */
674         int             *first,         /* output: first byte offset */
675         int             *last)          /* output: last byte offset */
676 {
677         int             i;              /* current bit number */
678         __int64_t       imask;          /* mask for current bit number */
679
680         ASSERT(fields != 0);
681         /*
682          * Find the lowest bit, so the first byte offset.
683          */
684         for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
685                 if (imask & fields) {
686                         *first = offsets[i];
687                         break;
688                 }
689         }
690         /*
691          * Find the highest bit, so the last byte offset.
692          */
693         for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
694                 if (imask & fields) {
695                         *last = offsets[i + 1] - 1;
696                         break;
697                 }
698         }
699 }
700
701 /*
702  * Get a buffer for the block, return it read in.
703  * Long-form addressing.
704  */
705 int
706 xfs_btree_read_bufl(
707         struct xfs_mount        *mp,            /* file system mount point */
708         struct xfs_trans        *tp,            /* transaction pointer */
709         xfs_fsblock_t           fsbno,          /* file system block number */
710         uint                    lock,           /* lock flags for read_buf */
711         struct xfs_buf          **bpp,          /* buffer for fsbno */
712         int                     refval,         /* ref count value for buffer */
713         const struct xfs_buf_ops *ops)
714 {
715         struct xfs_buf          *bp;            /* return value */
716         xfs_daddr_t             d;              /* real disk block address */
717         int                     error;
718
719         ASSERT(fsbno != NULLFSBLOCK);
720         d = XFS_FSB_TO_DADDR(mp, fsbno);
721         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
722                                    mp->m_bsize, lock, &bp, ops);
723         if (error)
724                 return error;
725         ASSERT(!xfs_buf_geterror(bp));
726         if (bp)
727                 xfs_buf_set_ref(bp, refval);
728         *bpp = bp;
729         return 0;
730 }
731
732 /*
733  * Read-ahead the block, don't wait for it, don't return a buffer.
734  * Long-form addressing.
735  */
736 /* ARGSUSED */
737 void
738 xfs_btree_reada_bufl(
739         struct xfs_mount        *mp,            /* file system mount point */
740         xfs_fsblock_t           fsbno,          /* file system block number */
741         xfs_extlen_t            count,          /* count of filesystem blocks */
742         const struct xfs_buf_ops *ops)
743 {
744         xfs_daddr_t             d;
745
746         ASSERT(fsbno != NULLFSBLOCK);
747         d = XFS_FSB_TO_DADDR(mp, fsbno);
748         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
749 }
750
751 /*
752  * Read-ahead the block, don't wait for it, don't return a buffer.
753  * Short-form addressing.
754  */
755 /* ARGSUSED */
756 void
757 xfs_btree_reada_bufs(
758         struct xfs_mount        *mp,            /* file system mount point */
759         xfs_agnumber_t          agno,           /* allocation group number */
760         xfs_agblock_t           agbno,          /* allocation group block number */
761         xfs_extlen_t            count,          /* count of filesystem blocks */
762         const struct xfs_buf_ops *ops)
763 {
764         xfs_daddr_t             d;
765
766         ASSERT(agno != NULLAGNUMBER);
767         ASSERT(agbno != NULLAGBLOCK);
768         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
769         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
770 }
771
772 STATIC int
773 xfs_btree_readahead_lblock(
774         struct xfs_btree_cur    *cur,
775         int                     lr,
776         struct xfs_btree_block  *block)
777 {
778         int                     rval = 0;
779         xfs_dfsbno_t            left = be64_to_cpu(block->bb_u.l.bb_leftsib);
780         xfs_dfsbno_t            right = be64_to_cpu(block->bb_u.l.bb_rightsib);
781
782         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
783                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
784                                      cur->bc_ops->buf_ops);
785                 rval++;
786         }
787
788         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
789                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
790                                      cur->bc_ops->buf_ops);
791                 rval++;
792         }
793
794         return rval;
795 }
796
797 STATIC int
798 xfs_btree_readahead_sblock(
799         struct xfs_btree_cur    *cur,
800         int                     lr,
801         struct xfs_btree_block *block)
802 {
803         int                     rval = 0;
804         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
805         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
806
807
808         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
809                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
810                                      left, 1, cur->bc_ops->buf_ops);
811                 rval++;
812         }
813
814         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
815                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
816                                      right, 1, cur->bc_ops->buf_ops);
817                 rval++;
818         }
819
820         return rval;
821 }
822
823 /*
824  * Read-ahead btree blocks, at the given level.
825  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
826  */
827 STATIC int
828 xfs_btree_readahead(
829         struct xfs_btree_cur    *cur,           /* btree cursor */
830         int                     lev,            /* level in btree */
831         int                     lr)             /* left/right bits */
832 {
833         struct xfs_btree_block  *block;
834
835         /*
836          * No readahead needed if we are at the root level and the
837          * btree root is stored in the inode.
838          */
839         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
840             (lev == cur->bc_nlevels - 1))
841                 return 0;
842
843         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
844                 return 0;
845
846         cur->bc_ra[lev] |= lr;
847         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
848
849         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
850                 return xfs_btree_readahead_lblock(cur, lr, block);
851         return xfs_btree_readahead_sblock(cur, lr, block);
852 }
853
854 STATIC xfs_daddr_t
855 xfs_btree_ptr_to_daddr(
856         struct xfs_btree_cur    *cur,
857         union xfs_btree_ptr     *ptr)
858 {
859         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
860                 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
861
862                 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
863         } else {
864                 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
865                 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
866
867                 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
868                                         be32_to_cpu(ptr->s));
869         }
870 }
871
872 /*
873  * Readahead @count btree blocks at the given @ptr location.
874  *
875  * We don't need to care about long or short form btrees here as we have a
876  * method of converting the ptr directly to a daddr available to us.
877  */
878 STATIC void
879 xfs_btree_readahead_ptr(
880         struct xfs_btree_cur    *cur,
881         union xfs_btree_ptr     *ptr,
882         xfs_extlen_t            count)
883 {
884         xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
885                           xfs_btree_ptr_to_daddr(cur, ptr),
886                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
887 }
888
889 /*
890  * Set the buffer for level "lev" in the cursor to bp, releasing
891  * any previous buffer.
892  */
893 STATIC void
894 xfs_btree_setbuf(
895         xfs_btree_cur_t         *cur,   /* btree cursor */
896         int                     lev,    /* level in btree */
897         xfs_buf_t               *bp)    /* new buffer to set */
898 {
899         struct xfs_btree_block  *b;     /* btree block */
900
901         if (cur->bc_bufs[lev])
902                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
903         cur->bc_bufs[lev] = bp;
904         cur->bc_ra[lev] = 0;
905
906         b = XFS_BUF_TO_BLOCK(bp);
907         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
908                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO))
909                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
910                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO))
911                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
912         } else {
913                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
914                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
915                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
916                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
917         }
918 }
919
920 STATIC int
921 xfs_btree_ptr_is_null(
922         struct xfs_btree_cur    *cur,
923         union xfs_btree_ptr     *ptr)
924 {
925         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
926                 return ptr->l == cpu_to_be64(NULLDFSBNO);
927         else
928                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
929 }
930
931 STATIC void
932 xfs_btree_set_ptr_null(
933         struct xfs_btree_cur    *cur,
934         union xfs_btree_ptr     *ptr)
935 {
936         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
937                 ptr->l = cpu_to_be64(NULLDFSBNO);
938         else
939                 ptr->s = cpu_to_be32(NULLAGBLOCK);
940 }
941
942 /*
943  * Get/set/init sibling pointers
944  */
945 STATIC void
946 xfs_btree_get_sibling(
947         struct xfs_btree_cur    *cur,
948         struct xfs_btree_block  *block,
949         union xfs_btree_ptr     *ptr,
950         int                     lr)
951 {
952         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
953
954         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
955                 if (lr == XFS_BB_RIGHTSIB)
956                         ptr->l = block->bb_u.l.bb_rightsib;
957                 else
958                         ptr->l = block->bb_u.l.bb_leftsib;
959         } else {
960                 if (lr == XFS_BB_RIGHTSIB)
961                         ptr->s = block->bb_u.s.bb_rightsib;
962                 else
963                         ptr->s = block->bb_u.s.bb_leftsib;
964         }
965 }
966
967 STATIC void
968 xfs_btree_set_sibling(
969         struct xfs_btree_cur    *cur,
970         struct xfs_btree_block  *block,
971         union xfs_btree_ptr     *ptr,
972         int                     lr)
973 {
974         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
975
976         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
977                 if (lr == XFS_BB_RIGHTSIB)
978                         block->bb_u.l.bb_rightsib = ptr->l;
979                 else
980                         block->bb_u.l.bb_leftsib = ptr->l;
981         } else {
982                 if (lr == XFS_BB_RIGHTSIB)
983                         block->bb_u.s.bb_rightsib = ptr->s;
984                 else
985                         block->bb_u.s.bb_leftsib = ptr->s;
986         }
987 }
988
989 void
990 xfs_btree_init_block_int(
991         struct xfs_mount        *mp,
992         struct xfs_btree_block  *buf,
993         xfs_daddr_t             blkno,
994         __u32                   magic,
995         __u16                   level,
996         __u16                   numrecs,
997         __u64                   owner,
998         unsigned int            flags)
999 {
1000         buf->bb_magic = cpu_to_be32(magic);
1001         buf->bb_level = cpu_to_be16(level);
1002         buf->bb_numrecs = cpu_to_be16(numrecs);
1003
1004         if (flags & XFS_BTREE_LONG_PTRS) {
1005                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
1006                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
1007                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1008                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1009                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1010                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
1011                         buf->bb_u.l.bb_pad = 0;
1012                         buf->bb_u.l.bb_lsn = 0;
1013                 }
1014         } else {
1015                 /* owner is a 32 bit value on short blocks */
1016                 __u32 __owner = (__u32)owner;
1017
1018                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1019                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1020                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1021                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1022                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1023                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
1024                         buf->bb_u.s.bb_lsn = 0;
1025                 }
1026         }
1027 }
1028
1029 void
1030 xfs_btree_init_block(
1031         struct xfs_mount *mp,
1032         struct xfs_buf  *bp,
1033         __u32           magic,
1034         __u16           level,
1035         __u16           numrecs,
1036         __u64           owner,
1037         unsigned int    flags)
1038 {
1039         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1040                                  magic, level, numrecs, owner, flags);
1041 }
1042
1043 STATIC void
1044 xfs_btree_init_block_cur(
1045         struct xfs_btree_cur    *cur,
1046         struct xfs_buf          *bp,
1047         int                     level,
1048         int                     numrecs)
1049 {
1050         __u64 owner;
1051
1052         /*
1053          * we can pull the owner from the cursor right now as the different
1054          * owners align directly with the pointer size of the btree. This may
1055          * change in future, but is safe for current users of the generic btree
1056          * code.
1057          */
1058         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1059                 owner = cur->bc_private.b.ip->i_ino;
1060         else
1061                 owner = cur->bc_private.a.agno;
1062
1063         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1064                                  xfs_btree_magic(cur), level, numrecs,
1065                                  owner, cur->bc_flags);
1066 }
1067
1068 /*
1069  * Return true if ptr is the last record in the btree and
1070  * we need to track updates to this record.  The decision
1071  * will be further refined in the update_lastrec method.
1072  */
1073 STATIC int
1074 xfs_btree_is_lastrec(
1075         struct xfs_btree_cur    *cur,
1076         struct xfs_btree_block  *block,
1077         int                     level)
1078 {
1079         union xfs_btree_ptr     ptr;
1080
1081         if (level > 0)
1082                 return 0;
1083         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1084                 return 0;
1085
1086         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1087         if (!xfs_btree_ptr_is_null(cur, &ptr))
1088                 return 0;
1089         return 1;
1090 }
1091
1092 STATIC void
1093 xfs_btree_buf_to_ptr(
1094         struct xfs_btree_cur    *cur,
1095         struct xfs_buf          *bp,
1096         union xfs_btree_ptr     *ptr)
1097 {
1098         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1099                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1100                                         XFS_BUF_ADDR(bp)));
1101         else {
1102                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1103                                         XFS_BUF_ADDR(bp)));
1104         }
1105 }
1106
1107 STATIC void
1108 xfs_btree_set_refs(
1109         struct xfs_btree_cur    *cur,
1110         struct xfs_buf          *bp)
1111 {
1112         switch (cur->bc_btnum) {
1113         case XFS_BTNUM_BNO:
1114         case XFS_BTNUM_CNT:
1115                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1116                 break;
1117         case XFS_BTNUM_INO:
1118                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1119                 break;
1120         case XFS_BTNUM_BMAP:
1121                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1122                 break;
1123         default:
1124                 ASSERT(0);
1125         }
1126 }
1127
1128 STATIC int
1129 xfs_btree_get_buf_block(
1130         struct xfs_btree_cur    *cur,
1131         union xfs_btree_ptr     *ptr,
1132         int                     flags,
1133         struct xfs_btree_block  **block,
1134         struct xfs_buf          **bpp)
1135 {
1136         struct xfs_mount        *mp = cur->bc_mp;
1137         xfs_daddr_t             d;
1138
1139         /* need to sort out how callers deal with failures first */
1140         ASSERT(!(flags & XBF_TRYLOCK));
1141
1142         d = xfs_btree_ptr_to_daddr(cur, ptr);
1143         *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1144                                  mp->m_bsize, flags);
1145
1146         if (!*bpp)
1147                 return ENOMEM;
1148
1149         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1150         *block = XFS_BUF_TO_BLOCK(*bpp);
1151         return 0;
1152 }
1153
1154 /*
1155  * Read in the buffer at the given ptr and return the buffer and
1156  * the block pointer within the buffer.
1157  */
1158 STATIC int
1159 xfs_btree_read_buf_block(
1160         struct xfs_btree_cur    *cur,
1161         union xfs_btree_ptr     *ptr,
1162         int                     level,
1163         int                     flags,
1164         struct xfs_btree_block  **block,
1165         struct xfs_buf          **bpp)
1166 {
1167         struct xfs_mount        *mp = cur->bc_mp;
1168         xfs_daddr_t             d;
1169         int                     error;
1170
1171         /* need to sort out how callers deal with failures first */
1172         ASSERT(!(flags & XBF_TRYLOCK));
1173
1174         d = xfs_btree_ptr_to_daddr(cur, ptr);
1175         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1176                                    mp->m_bsize, flags, bpp,
1177                                    cur->bc_ops->buf_ops);
1178         if (error)
1179                 return error;
1180
1181         ASSERT(!xfs_buf_geterror(*bpp));
1182         xfs_btree_set_refs(cur, *bpp);
1183         *block = XFS_BUF_TO_BLOCK(*bpp);
1184         return 0;
1185 }
1186
1187 /*
1188  * Copy keys from one btree block to another.
1189  */
1190 STATIC void
1191 xfs_btree_copy_keys(
1192         struct xfs_btree_cur    *cur,
1193         union xfs_btree_key     *dst_key,
1194         union xfs_btree_key     *src_key,
1195         int                     numkeys)
1196 {
1197         ASSERT(numkeys >= 0);
1198         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1199 }
1200
1201 /*
1202  * Copy records from one btree block to another.
1203  */
1204 STATIC void
1205 xfs_btree_copy_recs(
1206         struct xfs_btree_cur    *cur,
1207         union xfs_btree_rec     *dst_rec,
1208         union xfs_btree_rec     *src_rec,
1209         int                     numrecs)
1210 {
1211         ASSERT(numrecs >= 0);
1212         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1213 }
1214
1215 /*
1216  * Copy block pointers from one btree block to another.
1217  */
1218 STATIC void
1219 xfs_btree_copy_ptrs(
1220         struct xfs_btree_cur    *cur,
1221         union xfs_btree_ptr     *dst_ptr,
1222         union xfs_btree_ptr     *src_ptr,
1223         int                     numptrs)
1224 {
1225         ASSERT(numptrs >= 0);
1226         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1227 }
1228
1229 /*
1230  * Shift keys one index left/right inside a single btree block.
1231  */
1232 STATIC void
1233 xfs_btree_shift_keys(
1234         struct xfs_btree_cur    *cur,
1235         union xfs_btree_key     *key,
1236         int                     dir,
1237         int                     numkeys)
1238 {
1239         char                    *dst_key;
1240
1241         ASSERT(numkeys >= 0);
1242         ASSERT(dir == 1 || dir == -1);
1243
1244         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1245         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1246 }
1247
1248 /*
1249  * Shift records one index left/right inside a single btree block.
1250  */
1251 STATIC void
1252 xfs_btree_shift_recs(
1253         struct xfs_btree_cur    *cur,
1254         union xfs_btree_rec     *rec,
1255         int                     dir,
1256         int                     numrecs)
1257 {
1258         char                    *dst_rec;
1259
1260         ASSERT(numrecs >= 0);
1261         ASSERT(dir == 1 || dir == -1);
1262
1263         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1264         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1265 }
1266
1267 /*
1268  * Shift block pointers one index left/right inside a single btree block.
1269  */
1270 STATIC void
1271 xfs_btree_shift_ptrs(
1272         struct xfs_btree_cur    *cur,
1273         union xfs_btree_ptr     *ptr,
1274         int                     dir,
1275         int                     numptrs)
1276 {
1277         char                    *dst_ptr;
1278
1279         ASSERT(numptrs >= 0);
1280         ASSERT(dir == 1 || dir == -1);
1281
1282         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1283         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1284 }
1285
1286 /*
1287  * Log key values from the btree block.
1288  */
1289 STATIC void
1290 xfs_btree_log_keys(
1291         struct xfs_btree_cur    *cur,
1292         struct xfs_buf          *bp,
1293         int                     first,
1294         int                     last)
1295 {
1296         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1297         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1298
1299         if (bp) {
1300                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1301                 xfs_trans_log_buf(cur->bc_tp, bp,
1302                                   xfs_btree_key_offset(cur, first),
1303                                   xfs_btree_key_offset(cur, last + 1) - 1);
1304         } else {
1305                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1306                                 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1307         }
1308
1309         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1310 }
1311
1312 /*
1313  * Log record values from the btree block.
1314  */
1315 void
1316 xfs_btree_log_recs(
1317         struct xfs_btree_cur    *cur,
1318         struct xfs_buf          *bp,
1319         int                     first,
1320         int                     last)
1321 {
1322         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1323         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1324
1325         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1326         xfs_trans_log_buf(cur->bc_tp, bp,
1327                           xfs_btree_rec_offset(cur, first),
1328                           xfs_btree_rec_offset(cur, last + 1) - 1);
1329
1330         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1331 }
1332
1333 /*
1334  * Log block pointer fields from a btree block (nonleaf).
1335  */
1336 STATIC void
1337 xfs_btree_log_ptrs(
1338         struct xfs_btree_cur    *cur,   /* btree cursor */
1339         struct xfs_buf          *bp,    /* buffer containing btree block */
1340         int                     first,  /* index of first pointer to log */
1341         int                     last)   /* index of last pointer to log */
1342 {
1343         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1344         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1345
1346         if (bp) {
1347                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1348                 int                     level = xfs_btree_get_level(block);
1349
1350                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1351                 xfs_trans_log_buf(cur->bc_tp, bp,
1352                                 xfs_btree_ptr_offset(cur, first, level),
1353                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1354         } else {
1355                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1356                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1357         }
1358
1359         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1360 }
1361
1362 /*
1363  * Log fields from a btree block header.
1364  */
1365 void
1366 xfs_btree_log_block(
1367         struct xfs_btree_cur    *cur,   /* btree cursor */
1368         struct xfs_buf          *bp,    /* buffer containing btree block */
1369         int                     fields) /* mask of fields: XFS_BB_... */
1370 {
1371         int                     first;  /* first byte offset logged */
1372         int                     last;   /* last byte offset logged */
1373         static const short      soffsets[] = {  /* table of offsets (short) */
1374                 offsetof(struct xfs_btree_block, bb_magic),
1375                 offsetof(struct xfs_btree_block, bb_level),
1376                 offsetof(struct xfs_btree_block, bb_numrecs),
1377                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1378                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1379                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1380                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1381                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1382                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1383                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1384                 XFS_BTREE_SBLOCK_CRC_LEN
1385         };
1386         static const short      loffsets[] = {  /* table of offsets (long) */
1387                 offsetof(struct xfs_btree_block, bb_magic),
1388                 offsetof(struct xfs_btree_block, bb_level),
1389                 offsetof(struct xfs_btree_block, bb_numrecs),
1390                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1391                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1392                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1393                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1394                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1395                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1396                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1397                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1398                 XFS_BTREE_LBLOCK_CRC_LEN
1399         };
1400
1401         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1402         XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1403
1404         if (bp) {
1405                 int nbits;
1406
1407                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1408                         /*
1409                          * We don't log the CRC when updating a btree
1410                          * block but instead recreate it during log
1411                          * recovery.  As the log buffers have checksums
1412                          * of their own this is safe and avoids logging a crc
1413                          * update in a lot of places.
1414                          */
1415                         if (fields == XFS_BB_ALL_BITS)
1416                                 fields = XFS_BB_ALL_BITS_CRC;
1417                         nbits = XFS_BB_NUM_BITS_CRC;
1418                 } else {
1419                         nbits = XFS_BB_NUM_BITS;
1420                 }
1421                 xfs_btree_offsets(fields,
1422                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1423                                         loffsets : soffsets,
1424                                   nbits, &first, &last);
1425                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1426                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1427         } else {
1428                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1429                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1430         }
1431
1432         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1433 }
1434
1435 /*
1436  * Increment cursor by one record at the level.
1437  * For nonzero levels the leaf-ward information is untouched.
1438  */
1439 int                                             /* error */
1440 xfs_btree_increment(
1441         struct xfs_btree_cur    *cur,
1442         int                     level,
1443         int                     *stat)          /* success/failure */
1444 {
1445         struct xfs_btree_block  *block;
1446         union xfs_btree_ptr     ptr;
1447         struct xfs_buf          *bp;
1448         int                     error;          /* error return value */
1449         int                     lev;
1450
1451         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1452         XFS_BTREE_TRACE_ARGI(cur, level);
1453
1454         ASSERT(level < cur->bc_nlevels);
1455
1456         /* Read-ahead to the right at this level. */
1457         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1458
1459         /* Get a pointer to the btree block. */
1460         block = xfs_btree_get_block(cur, level, &bp);
1461
1462 #ifdef DEBUG
1463         error = xfs_btree_check_block(cur, block, level, bp);
1464         if (error)
1465                 goto error0;
1466 #endif
1467
1468         /* We're done if we remain in the block after the increment. */
1469         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1470                 goto out1;
1471
1472         /* Fail if we just went off the right edge of the tree. */
1473         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1474         if (xfs_btree_ptr_is_null(cur, &ptr))
1475                 goto out0;
1476
1477         XFS_BTREE_STATS_INC(cur, increment);
1478
1479         /*
1480          * March up the tree incrementing pointers.
1481          * Stop when we don't go off the right edge of a block.
1482          */
1483         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1484                 block = xfs_btree_get_block(cur, lev, &bp);
1485
1486 #ifdef DEBUG
1487                 error = xfs_btree_check_block(cur, block, lev, bp);
1488                 if (error)
1489                         goto error0;
1490 #endif
1491
1492                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1493                         break;
1494
1495                 /* Read-ahead the right block for the next loop. */
1496                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1497         }
1498
1499         /*
1500          * If we went off the root then we are either seriously
1501          * confused or have the tree root in an inode.
1502          */
1503         if (lev == cur->bc_nlevels) {
1504                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1505                         goto out0;
1506                 ASSERT(0);
1507                 error = EFSCORRUPTED;
1508                 goto error0;
1509         }
1510         ASSERT(lev < cur->bc_nlevels);
1511
1512         /*
1513          * Now walk back down the tree, fixing up the cursor's buffer
1514          * pointers and key numbers.
1515          */
1516         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1517                 union xfs_btree_ptr     *ptrp;
1518
1519                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1520                 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1521                                                         0, &block, &bp);
1522                 if (error)
1523                         goto error0;
1524
1525                 xfs_btree_setbuf(cur, lev, bp);
1526                 cur->bc_ptrs[lev] = 1;
1527         }
1528 out1:
1529         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1530         *stat = 1;
1531         return 0;
1532
1533 out0:
1534         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1535         *stat = 0;
1536         return 0;
1537
1538 error0:
1539         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1540         return error;
1541 }
1542
1543 /*
1544  * Decrement cursor by one record at the level.
1545  * For nonzero levels the leaf-ward information is untouched.
1546  */
1547 int                                             /* error */
1548 xfs_btree_decrement(
1549         struct xfs_btree_cur    *cur,
1550         int                     level,
1551         int                     *stat)          /* success/failure */
1552 {
1553         struct xfs_btree_block  *block;
1554         xfs_buf_t               *bp;
1555         int                     error;          /* error return value */
1556         int                     lev;
1557         union xfs_btree_ptr     ptr;
1558
1559         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1560         XFS_BTREE_TRACE_ARGI(cur, level);
1561
1562         ASSERT(level < cur->bc_nlevels);
1563
1564         /* Read-ahead to the left at this level. */
1565         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1566
1567         /* We're done if we remain in the block after the decrement. */
1568         if (--cur->bc_ptrs[level] > 0)
1569                 goto out1;
1570
1571         /* Get a pointer to the btree block. */
1572         block = xfs_btree_get_block(cur, level, &bp);
1573
1574 #ifdef DEBUG
1575         error = xfs_btree_check_block(cur, block, level, bp);
1576         if (error)
1577                 goto error0;
1578 #endif
1579
1580         /* Fail if we just went off the left edge of the tree. */
1581         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1582         if (xfs_btree_ptr_is_null(cur, &ptr))
1583                 goto out0;
1584
1585         XFS_BTREE_STATS_INC(cur, decrement);
1586
1587         /*
1588          * March up the tree decrementing pointers.
1589          * Stop when we don't go off the left edge of a block.
1590          */
1591         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1592                 if (--cur->bc_ptrs[lev] > 0)
1593                         break;
1594                 /* Read-ahead the left block for the next loop. */
1595                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1596         }
1597
1598         /*
1599          * If we went off the root then we are seriously confused.
1600          * or the root of the tree is in an inode.
1601          */
1602         if (lev == cur->bc_nlevels) {
1603                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1604                         goto out0;
1605                 ASSERT(0);
1606                 error = EFSCORRUPTED;
1607                 goto error0;
1608         }
1609         ASSERT(lev < cur->bc_nlevels);
1610
1611         /*
1612          * Now walk back down the tree, fixing up the cursor's buffer
1613          * pointers and key numbers.
1614          */
1615         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1616                 union xfs_btree_ptr     *ptrp;
1617
1618                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1619                 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1620                                                         0, &block, &bp);
1621                 if (error)
1622                         goto error0;
1623                 xfs_btree_setbuf(cur, lev, bp);
1624                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1625         }
1626 out1:
1627         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1628         *stat = 1;
1629         return 0;
1630
1631 out0:
1632         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1633         *stat = 0;
1634         return 0;
1635
1636 error0:
1637         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1638         return error;
1639 }
1640
1641 STATIC int
1642 xfs_btree_lookup_get_block(
1643         struct xfs_btree_cur    *cur,   /* btree cursor */
1644         int                     level,  /* level in the btree */
1645         union xfs_btree_ptr     *pp,    /* ptr to btree block */
1646         struct xfs_btree_block  **blkp) /* return btree block */
1647 {
1648         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1649         int                     error = 0;
1650
1651         /* special case the root block if in an inode */
1652         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1653             (level == cur->bc_nlevels - 1)) {
1654                 *blkp = xfs_btree_get_iroot(cur);
1655                 return 0;
1656         }
1657
1658         /*
1659          * If the old buffer at this level for the disk address we are
1660          * looking for re-use it.
1661          *
1662          * Otherwise throw it away and get a new one.
1663          */
1664         bp = cur->bc_bufs[level];
1665         if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1666                 *blkp = XFS_BUF_TO_BLOCK(bp);
1667                 return 0;
1668         }
1669
1670         error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
1671         if (error)
1672                 return error;
1673
1674         xfs_btree_setbuf(cur, level, bp);
1675         return 0;
1676 }
1677
1678 /*
1679  * Get current search key.  For level 0 we don't actually have a key
1680  * structure so we make one up from the record.  For all other levels
1681  * we just return the right key.
1682  */
1683 STATIC union xfs_btree_key *
1684 xfs_lookup_get_search_key(
1685         struct xfs_btree_cur    *cur,
1686         int                     level,
1687         int                     keyno,
1688         struct xfs_btree_block  *block,
1689         union xfs_btree_key     *kp)
1690 {
1691         if (level == 0) {
1692                 cur->bc_ops->init_key_from_rec(kp,
1693                                 xfs_btree_rec_addr(cur, keyno, block));
1694                 return kp;
1695         }
1696
1697         return xfs_btree_key_addr(cur, keyno, block);
1698 }
1699
1700 /*
1701  * Lookup the record.  The cursor is made to point to it, based on dir.
1702  * stat is set to 0 if can't find any such record, 1 for success.
1703  */
1704 int                                     /* error */
1705 xfs_btree_lookup(
1706         struct xfs_btree_cur    *cur,   /* btree cursor */
1707         xfs_lookup_t            dir,    /* <=, ==, or >= */
1708         int                     *stat)  /* success/failure */
1709 {
1710         struct xfs_btree_block  *block; /* current btree block */
1711         __int64_t               diff;   /* difference for the current key */
1712         int                     error;  /* error return value */
1713         int                     keyno;  /* current key number */
1714         int                     level;  /* level in the btree */
1715         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1716         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1717
1718         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1719         XFS_BTREE_TRACE_ARGI(cur, dir);
1720
1721         XFS_BTREE_STATS_INC(cur, lookup);
1722
1723         block = NULL;
1724         keyno = 0;
1725
1726         /* initialise start pointer from cursor */
1727         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1728         pp = &ptr;
1729
1730         /*
1731          * Iterate over each level in the btree, starting at the root.
1732          * For each level above the leaves, find the key we need, based
1733          * on the lookup record, then follow the corresponding block
1734          * pointer down to the next level.
1735          */
1736         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1737                 /* Get the block we need to do the lookup on. */
1738                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1739                 if (error)
1740                         goto error0;
1741
1742                 if (diff == 0) {
1743                         /*
1744                          * If we already had a key match at a higher level, we
1745                          * know we need to use the first entry in this block.
1746                          */
1747                         keyno = 1;
1748                 } else {
1749                         /* Otherwise search this block. Do a binary search. */
1750
1751                         int     high;   /* high entry number */
1752                         int     low;    /* low entry number */
1753
1754                         /* Set low and high entry numbers, 1-based. */
1755                         low = 1;
1756                         high = xfs_btree_get_numrecs(block);
1757                         if (!high) {
1758                                 /* Block is empty, must be an empty leaf. */
1759                                 ASSERT(level == 0 && cur->bc_nlevels == 1);
1760
1761                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1762                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1763                                 *stat = 0;
1764                                 return 0;
1765                         }
1766
1767                         /* Binary search the block. */
1768                         while (low <= high) {
1769                                 union xfs_btree_key     key;
1770                                 union xfs_btree_key     *kp;
1771
1772                                 XFS_BTREE_STATS_INC(cur, compare);
1773
1774                                 /* keyno is average of low and high. */
1775                                 keyno = (low + high) >> 1;
1776
1777                                 /* Get current search key */
1778                                 kp = xfs_lookup_get_search_key(cur, level,
1779                                                 keyno, block, &key);
1780
1781                                 /*
1782                                  * Compute difference to get next direction:
1783                                  *  - less than, move right
1784                                  *  - greater than, move left
1785                                  *  - equal, we're done
1786                                  */
1787                                 diff = cur->bc_ops->key_diff(cur, kp);
1788                                 if (diff < 0)
1789                                         low = keyno + 1;
1790                                 else if (diff > 0)
1791                                         high = keyno - 1;
1792                                 else
1793                                         break;
1794                         }
1795                 }
1796
1797                 /*
1798                  * If there are more levels, set up for the next level
1799                  * by getting the block number and filling in the cursor.
1800                  */
1801                 if (level > 0) {
1802                         /*
1803                          * If we moved left, need the previous key number,
1804                          * unless there isn't one.
1805                          */
1806                         if (diff > 0 && --keyno < 1)
1807                                 keyno = 1;
1808                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1809
1810 #ifdef DEBUG
1811                         error = xfs_btree_check_ptr(cur, pp, 0, level);
1812                         if (error)
1813                                 goto error0;
1814 #endif
1815                         cur->bc_ptrs[level] = keyno;
1816                 }
1817         }
1818
1819         /* Done with the search. See if we need to adjust the results. */
1820         if (dir != XFS_LOOKUP_LE && diff < 0) {
1821                 keyno++;
1822                 /*
1823                  * If ge search and we went off the end of the block, but it's
1824                  * not the last block, we're in the wrong block.
1825                  */
1826                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1827                 if (dir == XFS_LOOKUP_GE &&
1828                     keyno > xfs_btree_get_numrecs(block) &&
1829                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1830                         int     i;
1831
1832                         cur->bc_ptrs[0] = keyno;
1833                         error = xfs_btree_increment(cur, 0, &i);
1834                         if (error)
1835                                 goto error0;
1836                         XFS_WANT_CORRUPTED_RETURN(i == 1);
1837                         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1838                         *stat = 1;
1839                         return 0;
1840                 }
1841         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1842                 keyno--;
1843         cur->bc_ptrs[0] = keyno;
1844
1845         /* Return if we succeeded or not. */
1846         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1847                 *stat = 0;
1848         else if (dir != XFS_LOOKUP_EQ || diff == 0)
1849                 *stat = 1;
1850         else
1851                 *stat = 0;
1852         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1853         return 0;
1854
1855 error0:
1856         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1857         return error;
1858 }
1859
1860 /*
1861  * Update keys at all levels from here to the root along the cursor's path.
1862  */
1863 STATIC int
1864 xfs_btree_updkey(
1865         struct xfs_btree_cur    *cur,
1866         union xfs_btree_key     *keyp,
1867         int                     level)
1868 {
1869         struct xfs_btree_block  *block;
1870         struct xfs_buf          *bp;
1871         union xfs_btree_key     *kp;
1872         int                     ptr;
1873
1874         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1875         XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1876
1877         ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1878
1879         /*
1880          * Go up the tree from this level toward the root.
1881          * At each level, update the key value to the value input.
1882          * Stop when we reach a level where the cursor isn't pointing
1883          * at the first entry in the block.
1884          */
1885         for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1886 #ifdef DEBUG
1887                 int             error;
1888 #endif
1889                 block = xfs_btree_get_block(cur, level, &bp);
1890 #ifdef DEBUG
1891                 error = xfs_btree_check_block(cur, block, level, bp);
1892                 if (error) {
1893                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1894                         return error;
1895                 }
1896 #endif
1897                 ptr = cur->bc_ptrs[level];
1898                 kp = xfs_btree_key_addr(cur, ptr, block);
1899                 xfs_btree_copy_keys(cur, kp, keyp, 1);
1900                 xfs_btree_log_keys(cur, bp, ptr, ptr);
1901         }
1902
1903         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1904         return 0;
1905 }
1906
1907 /*
1908  * Update the record referred to by cur to the value in the
1909  * given record. This either works (return 0) or gets an
1910  * EFSCORRUPTED error.
1911  */
1912 int
1913 xfs_btree_update(
1914         struct xfs_btree_cur    *cur,
1915         union xfs_btree_rec     *rec)
1916 {
1917         struct xfs_btree_block  *block;
1918         struct xfs_buf          *bp;
1919         int                     error;
1920         int                     ptr;
1921         union xfs_btree_rec     *rp;
1922
1923         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1924         XFS_BTREE_TRACE_ARGR(cur, rec);
1925
1926         /* Pick up the current block. */
1927         block = xfs_btree_get_block(cur, 0, &bp);
1928
1929 #ifdef DEBUG
1930         error = xfs_btree_check_block(cur, block, 0, bp);
1931         if (error)
1932                 goto error0;
1933 #endif
1934         /* Get the address of the rec to be updated. */
1935         ptr = cur->bc_ptrs[0];
1936         rp = xfs_btree_rec_addr(cur, ptr, block);
1937
1938         /* Fill in the new contents and log them. */
1939         xfs_btree_copy_recs(cur, rp, rec, 1);
1940         xfs_btree_log_recs(cur, bp, ptr, ptr);
1941
1942         /*
1943          * If we are tracking the last record in the tree and
1944          * we are at the far right edge of the tree, update it.
1945          */
1946         if (xfs_btree_is_lastrec(cur, block, 0)) {
1947                 cur->bc_ops->update_lastrec(cur, block, rec,
1948                                             ptr, LASTREC_UPDATE);
1949         }
1950
1951         /* Updating first rec in leaf. Pass new key value up to our parent. */
1952         if (ptr == 1) {
1953                 union xfs_btree_key     key;
1954
1955                 cur->bc_ops->init_key_from_rec(&key, rec);
1956                 error = xfs_btree_updkey(cur, &key, 1);
1957                 if (error)
1958                         goto error0;
1959         }
1960
1961         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1962         return 0;
1963
1964 error0:
1965         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1966         return error;
1967 }
1968
1969 /*
1970  * Move 1 record left from cur/level if possible.
1971  * Update cur to reflect the new path.
1972  */
1973 STATIC int                                      /* error */
1974 xfs_btree_lshift(
1975         struct xfs_btree_cur    *cur,
1976         int                     level,
1977         int                     *stat)          /* success/failure */
1978 {
1979         union xfs_btree_key     key;            /* btree key */
1980         struct xfs_buf          *lbp;           /* left buffer pointer */
1981         struct xfs_btree_block  *left;          /* left btree block */
1982         int                     lrecs;          /* left record count */
1983         struct xfs_buf          *rbp;           /* right buffer pointer */
1984         struct xfs_btree_block  *right;         /* right btree block */
1985         int                     rrecs;          /* right record count */
1986         union xfs_btree_ptr     lptr;           /* left btree pointer */
1987         union xfs_btree_key     *rkp = NULL;    /* right btree key */
1988         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
1989         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
1990         int                     error;          /* error return value */
1991
1992         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1993         XFS_BTREE_TRACE_ARGI(cur, level);
1994
1995         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1996             level == cur->bc_nlevels - 1)
1997                 goto out0;
1998
1999         /* Set up variables for this block as "right". */
2000         right = xfs_btree_get_block(cur, level, &rbp);
2001
2002 #ifdef DEBUG
2003         error = xfs_btree_check_block(cur, right, level, rbp);
2004         if (error)
2005                 goto error0;
2006 #endif
2007
2008         /* If we've got no left sibling then we can't shift an entry left. */
2009         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2010         if (xfs_btree_ptr_is_null(cur, &lptr))
2011                 goto out0;
2012
2013         /*
2014          * If the cursor entry is the one that would be moved, don't
2015          * do it... it's too complicated.
2016          */
2017         if (cur->bc_ptrs[level] <= 1)
2018                 goto out0;
2019
2020         /* Set up the left neighbor as "left". */
2021         error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
2022         if (error)
2023                 goto error0;
2024
2025         /* If it's full, it can't take another entry. */
2026         lrecs = xfs_btree_get_numrecs(left);
2027         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2028                 goto out0;
2029
2030         rrecs = xfs_btree_get_numrecs(right);
2031
2032         /*
2033          * We add one entry to the left side and remove one for the right side.
2034          * Account for it here, the changes will be updated on disk and logged
2035          * later.
2036          */
2037         lrecs++;
2038         rrecs--;
2039
2040         XFS_BTREE_STATS_INC(cur, lshift);
2041         XFS_BTREE_STATS_ADD(cur, moves, 1);
2042
2043         /*
2044          * If non-leaf, copy a key and a ptr to the left block.
2045          * Log the changes to the left block.
2046          */
2047         if (level > 0) {
2048                 /* It's a non-leaf.  Move keys and pointers. */
2049                 union xfs_btree_key     *lkp;   /* left btree key */
2050                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2051
2052                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2053                 rkp = xfs_btree_key_addr(cur, 1, right);
2054
2055                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2056                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2057 #ifdef DEBUG
2058                 error = xfs_btree_check_ptr(cur, rpp, 0, level);
2059                 if (error)
2060                         goto error0;
2061 #endif
2062                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2063                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2064
2065                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2066                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2067
2068                 ASSERT(cur->bc_ops->keys_inorder(cur,
2069                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2070         } else {
2071                 /* It's a leaf.  Move records.  */
2072                 union xfs_btree_rec     *lrp;   /* left record pointer */
2073
2074                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2075                 rrp = xfs_btree_rec_addr(cur, 1, right);
2076
2077                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2078                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2079
2080                 ASSERT(cur->bc_ops->recs_inorder(cur,
2081                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2082         }
2083
2084         xfs_btree_set_numrecs(left, lrecs);
2085         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2086
2087         xfs_btree_set_numrecs(right, rrecs);
2088         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2089
2090         /*
2091          * Slide the contents of right down one entry.
2092          */
2093         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2094         if (level > 0) {
2095                 /* It's a nonleaf. operate on keys and ptrs */
2096 #ifdef DEBUG
2097                 int                     i;              /* loop index */
2098
2099                 for (i = 0; i < rrecs; i++) {
2100                         error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2101                         if (error)
2102                                 goto error0;
2103                 }
2104 #endif
2105                 xfs_btree_shift_keys(cur,
2106                                 xfs_btree_key_addr(cur, 2, right),
2107                                 -1, rrecs);
2108                 xfs_btree_shift_ptrs(cur,
2109                                 xfs_btree_ptr_addr(cur, 2, right),
2110                                 -1, rrecs);
2111
2112                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2113                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2114         } else {
2115                 /* It's a leaf. operate on records */
2116                 xfs_btree_shift_recs(cur,
2117                         xfs_btree_rec_addr(cur, 2, right),
2118                         -1, rrecs);
2119                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2120
2121                 /*
2122                  * If it's the first record in the block, we'll need a key
2123                  * structure to pass up to the next level (updkey).
2124                  */
2125                 cur->bc_ops->init_key_from_rec(&key,
2126                         xfs_btree_rec_addr(cur, 1, right));
2127                 rkp = &key;
2128         }
2129
2130         /* Update the parent key values of right. */
2131         error = xfs_btree_updkey(cur, rkp, level + 1);
2132         if (error)
2133                 goto error0;
2134
2135         /* Slide the cursor value left one. */
2136         cur->bc_ptrs[level]--;
2137
2138         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2139         *stat = 1;
2140         return 0;
2141
2142 out0:
2143         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2144         *stat = 0;
2145         return 0;
2146
2147 error0:
2148         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2149         return error;
2150 }
2151
2152 /*
2153  * Move 1 record right from cur/level if possible.
2154  * Update cur to reflect the new path.
2155  */
2156 STATIC int                                      /* error */
2157 xfs_btree_rshift(
2158         struct xfs_btree_cur    *cur,
2159         int                     level,
2160         int                     *stat)          /* success/failure */
2161 {
2162         union xfs_btree_key     key;            /* btree key */
2163         struct xfs_buf          *lbp;           /* left buffer pointer */
2164         struct xfs_btree_block  *left;          /* left btree block */
2165         struct xfs_buf          *rbp;           /* right buffer pointer */
2166         struct xfs_btree_block  *right;         /* right btree block */
2167         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2168         union xfs_btree_ptr     rptr;           /* right block pointer */
2169         union xfs_btree_key     *rkp;           /* right btree key */
2170         int                     rrecs;          /* right record count */
2171         int                     lrecs;          /* left record count */
2172         int                     error;          /* error return value */
2173         int                     i;              /* loop counter */
2174
2175         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2176         XFS_BTREE_TRACE_ARGI(cur, level);
2177
2178         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2179             (level == cur->bc_nlevels - 1))
2180                 goto out0;
2181
2182         /* Set up variables for this block as "left". */
2183         left = xfs_btree_get_block(cur, level, &lbp);
2184
2185 #ifdef DEBUG
2186         error = xfs_btree_check_block(cur, left, level, lbp);
2187         if (error)
2188                 goto error0;
2189 #endif
2190
2191         /* If we've got no right sibling then we can't shift an entry right. */
2192         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2193         if (xfs_btree_ptr_is_null(cur, &rptr))
2194                 goto out0;
2195
2196         /*
2197          * If the cursor entry is the one that would be moved, don't
2198          * do it... it's too complicated.
2199          */
2200         lrecs = xfs_btree_get_numrecs(left);
2201         if (cur->bc_ptrs[level] >= lrecs)
2202                 goto out0;
2203
2204         /* Set up the right neighbor as "right". */
2205         error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
2206         if (error)
2207                 goto error0;
2208
2209         /* If it's full, it can't take another entry. */
2210         rrecs = xfs_btree_get_numrecs(right);
2211         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2212                 goto out0;
2213
2214         XFS_BTREE_STATS_INC(cur, rshift);
2215         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2216
2217         /*
2218          * Make a hole at the start of the right neighbor block, then
2219          * copy the last left block entry to the hole.
2220          */
2221         if (level > 0) {
2222                 /* It's a nonleaf. make a hole in the keys and ptrs */
2223                 union xfs_btree_key     *lkp;
2224                 union xfs_btree_ptr     *lpp;
2225                 union xfs_btree_ptr     *rpp;
2226
2227                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2228                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2229                 rkp = xfs_btree_key_addr(cur, 1, right);
2230                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2231
2232 #ifdef DEBUG
2233                 for (i = rrecs - 1; i >= 0; i--) {
2234                         error = xfs_btree_check_ptr(cur, rpp, i, level);
2235                         if (error)
2236                                 goto error0;
2237                 }
2238 #endif
2239
2240                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2241                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2242
2243 #ifdef DEBUG
2244                 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2245                 if (error)
2246                         goto error0;
2247 #endif
2248
2249                 /* Now put the new data in, and log it. */
2250                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2251                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2252
2253                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2254                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2255
2256                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2257                         xfs_btree_key_addr(cur, 2, right)));
2258         } else {
2259                 /* It's a leaf. make a hole in the records */
2260                 union xfs_btree_rec     *lrp;
2261                 union xfs_btree_rec     *rrp;
2262
2263                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2264                 rrp = xfs_btree_rec_addr(cur, 1, right);
2265
2266                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2267
2268                 /* Now put the new data in, and log it. */
2269                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2270                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2271
2272                 cur->bc_ops->init_key_from_rec(&key, rrp);
2273                 rkp = &key;
2274
2275                 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2276                         xfs_btree_rec_addr(cur, 2, right)));
2277         }
2278
2279         /*
2280          * Decrement and log left's numrecs, bump and log right's numrecs.
2281          */
2282         xfs_btree_set_numrecs(left, --lrecs);
2283         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2284
2285         xfs_btree_set_numrecs(right, ++rrecs);
2286         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2287
2288         /*
2289          * Using a temporary cursor, update the parent key values of the
2290          * block on the right.
2291          */
2292         error = xfs_btree_dup_cursor(cur, &tcur);
2293         if (error)
2294                 goto error0;
2295         i = xfs_btree_lastrec(tcur, level);
2296         XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2297
2298         error = xfs_btree_increment(tcur, level, &i);
2299         if (error)
2300                 goto error1;
2301
2302         error = xfs_btree_updkey(tcur, rkp, level + 1);
2303         if (error)
2304                 goto error1;
2305
2306         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2307
2308         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2309         *stat = 1;
2310         return 0;
2311
2312 out0:
2313         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2314         *stat = 0;
2315         return 0;
2316
2317 error0:
2318         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2319         return error;
2320
2321 error1:
2322         XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2323         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2324         return error;
2325 }
2326
2327 /*
2328  * Split cur/level block in half.
2329  * Return new block number and the key to its first
2330  * record (to be inserted into parent).
2331  */
2332 STATIC int                                      /* error */
2333 xfs_btree_split(
2334         struct xfs_btree_cur    *cur,
2335         int                     level,
2336         union xfs_btree_ptr     *ptrp,
2337         union xfs_btree_key     *key,
2338         struct xfs_btree_cur    **curp,
2339         int                     *stat)          /* success/failure */
2340 {
2341         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2342         struct xfs_buf          *lbp;           /* left buffer pointer */
2343         struct xfs_btree_block  *left;          /* left btree block */
2344         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2345         struct xfs_buf          *rbp;           /* right buffer pointer */
2346         struct xfs_btree_block  *right;         /* right btree block */
2347         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2348         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2349         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2350         int                     lrecs;
2351         int                     rrecs;
2352         int                     src_index;
2353         int                     error;          /* error return value */
2354 #ifdef DEBUG
2355         int                     i;
2356 #endif
2357
2358         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2359         XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2360
2361         XFS_BTREE_STATS_INC(cur, split);
2362
2363         /* Set up left block (current one). */
2364         left = xfs_btree_get_block(cur, level, &lbp);
2365
2366 #ifdef DEBUG
2367         error = xfs_btree_check_block(cur, left, level, lbp);
2368         if (error)
2369                 goto error0;
2370 #endif
2371
2372         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2373
2374         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2375         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
2376         if (error)
2377                 goto error0;
2378         if (*stat == 0)
2379                 goto out0;
2380         XFS_BTREE_STATS_INC(cur, alloc);
2381
2382         /* Set up the new block as "right". */
2383         error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2384         if (error)
2385                 goto error0;
2386
2387         /* Fill in the btree header for the new right block. */
2388         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2389
2390         /*
2391          * Split the entries between the old and the new block evenly.
2392          * Make sure that if there's an odd number of entries now, that
2393          * each new block will have the same number of entries.
2394          */
2395         lrecs = xfs_btree_get_numrecs(left);
2396         rrecs = lrecs / 2;
2397         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2398                 rrecs++;
2399         src_index = (lrecs - rrecs + 1);
2400
2401         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2402
2403         /*
2404          * Copy btree block entries from the left block over to the
2405          * new block, the right. Update the right block and log the
2406          * changes.
2407          */
2408         if (level > 0) {
2409                 /* It's a non-leaf.  Move keys and pointers. */
2410                 union xfs_btree_key     *lkp;   /* left btree key */
2411                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2412                 union xfs_btree_key     *rkp;   /* right btree key */
2413                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2414
2415                 lkp = xfs_btree_key_addr(cur, src_index, left);
2416                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2417                 rkp = xfs_btree_key_addr(cur, 1, right);
2418                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2419
2420 #ifdef DEBUG
2421                 for (i = src_index; i < rrecs; i++) {
2422                         error = xfs_btree_check_ptr(cur, lpp, i, level);
2423                         if (error)
2424                                 goto error0;
2425                 }
2426 #endif
2427
2428                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2429                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2430
2431                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2432                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2433
2434                 /* Grab the keys to the entries moved to the right block */
2435                 xfs_btree_copy_keys(cur, key, rkp, 1);
2436         } else {
2437                 /* It's a leaf.  Move records.  */
2438                 union xfs_btree_rec     *lrp;   /* left record pointer */
2439                 union xfs_btree_rec     *rrp;   /* right record pointer */
2440
2441                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2442                 rrp = xfs_btree_rec_addr(cur, 1, right);
2443
2444                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2445                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2446
2447                 cur->bc_ops->init_key_from_rec(key,
2448                         xfs_btree_rec_addr(cur, 1, right));
2449         }
2450
2451
2452         /*
2453          * Find the left block number by looking in the buffer.
2454          * Adjust numrecs, sibling pointers.
2455          */
2456         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2457         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2458         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2459         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2460
2461         lrecs -= rrecs;
2462         xfs_btree_set_numrecs(left, lrecs);
2463         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2464
2465         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2466         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2467
2468         /*
2469          * If there's a block to the new block's right, make that block
2470          * point back to right instead of to left.
2471          */
2472         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2473                 error = xfs_btree_read_buf_block(cur, &rrptr, level,
2474                                                         0, &rrblock, &rrbp);
2475                 if (error)
2476                         goto error0;
2477                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2478                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2479         }
2480         /*
2481          * If the cursor is really in the right block, move it there.
2482          * If it's just pointing past the last entry in left, then we'll
2483          * insert there, so don't change anything in that case.
2484          */
2485         if (cur->bc_ptrs[level] > lrecs + 1) {
2486                 xfs_btree_setbuf(cur, level, rbp);
2487                 cur->bc_ptrs[level] -= lrecs;
2488         }
2489         /*
2490          * If there are more levels, we'll need another cursor which refers
2491          * the right block, no matter where this cursor was.
2492          */
2493         if (level + 1 < cur->bc_nlevels) {
2494                 error = xfs_btree_dup_cursor(cur, curp);
2495                 if (error)
2496                         goto error0;
2497                 (*curp)->bc_ptrs[level + 1]++;
2498         }
2499         *ptrp = rptr;
2500         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2501         *stat = 1;
2502         return 0;
2503 out0:
2504         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2505         *stat = 0;
2506         return 0;
2507
2508 error0:
2509         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2510         return error;
2511 }
2512
2513 /*
2514  * Copy the old inode root contents into a real block and make the
2515  * broot point to it.
2516  */
2517 int                                             /* error */
2518 xfs_btree_new_iroot(
2519         struct xfs_btree_cur    *cur,           /* btree cursor */
2520         int                     *logflags,      /* logging flags for inode */
2521         int                     *stat)          /* return status - 0 fail */
2522 {
2523         struct xfs_buf          *cbp;           /* buffer for cblock */
2524         struct xfs_btree_block  *block;         /* btree block */
2525         struct xfs_btree_block  *cblock;        /* child btree block */
2526         union xfs_btree_key     *ckp;           /* child key pointer */
2527         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2528         union xfs_btree_key     *kp;            /* pointer to btree key */
2529         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2530         union xfs_btree_ptr     nptr;           /* new block addr */
2531         int                     level;          /* btree level */
2532         int                     error;          /* error return code */
2533 #ifdef DEBUG
2534         int                     i;              /* loop counter */
2535 #endif
2536
2537         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2538         XFS_BTREE_STATS_INC(cur, newroot);
2539
2540         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2541
2542         level = cur->bc_nlevels - 1;
2543
2544         block = xfs_btree_get_iroot(cur);
2545         pp = xfs_btree_ptr_addr(cur, 1, block);
2546
2547         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2548         error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
2549         if (error)
2550                 goto error0;
2551         if (*stat == 0) {
2552                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2553                 return 0;
2554         }
2555         XFS_BTREE_STATS_INC(cur, alloc);
2556
2557         /* Copy the root into a real block. */
2558         error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2559         if (error)
2560                 goto error0;
2561
2562         /*
2563          * we can't just memcpy() the root in for CRC enabled btree blocks.
2564          * In that case have to also ensure the blkno remains correct
2565          */
2566         memcpy(cblock, block, xfs_btree_block_len(cur));
2567         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2568                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2569                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2570                 else
2571                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2572         }
2573
2574         be16_add_cpu(&block->bb_level, 1);
2575         xfs_btree_set_numrecs(block, 1);
2576         cur->bc_nlevels++;
2577         cur->bc_ptrs[level + 1] = 1;
2578
2579         kp = xfs_btree_key_addr(cur, 1, block);
2580         ckp = xfs_btree_key_addr(cur, 1, cblock);
2581         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2582
2583         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2584 #ifdef DEBUG
2585         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2586                 error = xfs_btree_check_ptr(cur, pp, i, level);
2587                 if (error)
2588                         goto error0;
2589         }
2590 #endif
2591         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2592
2593 #ifdef DEBUG
2594         error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2595         if (error)
2596                 goto error0;
2597 #endif
2598         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2599
2600         xfs_iroot_realloc(cur->bc_private.b.ip,
2601                           1 - xfs_btree_get_numrecs(cblock),
2602                           cur->bc_private.b.whichfork);
2603
2604         xfs_btree_setbuf(cur, level, cbp);
2605
2606         /*
2607          * Do all this logging at the end so that
2608          * the root is at the right level.
2609          */
2610         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2611         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2612         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2613
2614         *logflags |=
2615                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2616         *stat = 1;
2617         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2618         return 0;
2619 error0:
2620         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2621         return error;
2622 }
2623
2624 /*
2625  * Allocate a new root block, fill it in.
2626  */
2627 STATIC int                              /* error */
2628 xfs_btree_new_root(
2629         struct xfs_btree_cur    *cur,   /* btree cursor */
2630         int                     *stat)  /* success/failure */
2631 {
2632         struct xfs_btree_block  *block; /* one half of the old root block */
2633         struct xfs_buf          *bp;    /* buffer containing block */
2634         int                     error;  /* error return value */
2635         struct xfs_buf          *lbp;   /* left buffer pointer */
2636         struct xfs_btree_block  *left;  /* left btree block */
2637         struct xfs_buf          *nbp;   /* new (root) buffer */
2638         struct xfs_btree_block  *new;   /* new (root) btree block */
2639         int                     nptr;   /* new value for key index, 1 or 2 */
2640         struct xfs_buf          *rbp;   /* right buffer pointer */
2641         struct xfs_btree_block  *right; /* right btree block */
2642         union xfs_btree_ptr     rptr;
2643         union xfs_btree_ptr     lptr;
2644
2645         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2646         XFS_BTREE_STATS_INC(cur, newroot);
2647
2648         /* initialise our start point from the cursor */
2649         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2650
2651         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2652         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
2653         if (error)
2654                 goto error0;
2655         if (*stat == 0)
2656                 goto out0;
2657         XFS_BTREE_STATS_INC(cur, alloc);
2658
2659         /* Set up the new block. */
2660         error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2661         if (error)
2662                 goto error0;
2663
2664         /* Set the root in the holding structure  increasing the level by 1. */
2665         cur->bc_ops->set_root(cur, &lptr, 1);
2666
2667         /*
2668          * At the previous root level there are now two blocks: the old root,
2669          * and the new block generated when it was split.  We don't know which
2670          * one the cursor is pointing at, so we set up variables "left" and
2671          * "right" for each case.
2672          */
2673         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2674
2675 #ifdef DEBUG
2676         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2677         if (error)
2678                 goto error0;
2679 #endif
2680
2681         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2682         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2683                 /* Our block is left, pick up the right block. */
2684                 lbp = bp;
2685                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2686                 left = block;
2687                 error = xfs_btree_read_buf_block(cur, &rptr,
2688                                         cur->bc_nlevels - 1, 0, &right, &rbp);
2689                 if (error)
2690                         goto error0;
2691                 bp = rbp;
2692                 nptr = 1;
2693         } else {
2694                 /* Our block is right, pick up the left block. */
2695                 rbp = bp;
2696                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2697                 right = block;
2698                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2699                 error = xfs_btree_read_buf_block(cur, &lptr,
2700                                         cur->bc_nlevels - 1, 0, &left, &lbp);
2701                 if (error)
2702                         goto error0;
2703                 bp = lbp;
2704                 nptr = 2;
2705         }
2706         /* Fill in the new block's btree header and log it. */
2707         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2708         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2709         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2710                         !xfs_btree_ptr_is_null(cur, &rptr));
2711
2712         /* Fill in the key data in the new root. */
2713         if (xfs_btree_get_level(left) > 0) {
2714                 xfs_btree_copy_keys(cur,
2715                                 xfs_btree_key_addr(cur, 1, new),
2716                                 xfs_btree_key_addr(cur, 1, left), 1);
2717                 xfs_btree_copy_keys(cur,
2718                                 xfs_btree_key_addr(cur, 2, new),
2719                                 xfs_btree_key_addr(cur, 1, right), 1);
2720         } else {
2721                 cur->bc_ops->init_key_from_rec(
2722                                 xfs_btree_key_addr(cur, 1, new),
2723                                 xfs_btree_rec_addr(cur, 1, left));
2724                 cur->bc_ops->init_key_from_rec(
2725                                 xfs_btree_key_addr(cur, 2, new),
2726                                 xfs_btree_rec_addr(cur, 1, right));
2727         }
2728         xfs_btree_log_keys(cur, nbp, 1, 2);
2729
2730         /* Fill in the pointer data in the new root. */
2731         xfs_btree_copy_ptrs(cur,
2732                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2733         xfs_btree_copy_ptrs(cur,
2734                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2735         xfs_btree_log_ptrs(cur, nbp, 1, 2);
2736
2737         /* Fix up the cursor. */
2738         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2739         cur->bc_ptrs[cur->bc_nlevels] = nptr;
2740         cur->bc_nlevels++;
2741         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2742         *stat = 1;
2743         return 0;
2744 error0:
2745         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2746         return error;
2747 out0:
2748         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2749         *stat = 0;
2750         return 0;
2751 }
2752
2753 STATIC int
2754 xfs_btree_make_block_unfull(
2755         struct xfs_btree_cur    *cur,   /* btree cursor */
2756         int                     level,  /* btree level */
2757         int                     numrecs,/* # of recs in block */
2758         int                     *oindex,/* old tree index */
2759         int                     *index, /* new tree index */
2760         union xfs_btree_ptr     *nptr,  /* new btree ptr */
2761         struct xfs_btree_cur    **ncur, /* new btree cursor */
2762         union xfs_btree_rec     *nrec,  /* new record */
2763         int                     *stat)
2764 {
2765         union xfs_btree_key     key;    /* new btree key value */
2766         int                     error = 0;
2767
2768         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2769             level == cur->bc_nlevels - 1) {
2770                 struct xfs_inode *ip = cur->bc_private.b.ip;
2771
2772                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2773                         /* A root block that can be made bigger. */
2774                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2775                 } else {
2776                         /* A root block that needs replacing */
2777                         int     logflags = 0;
2778
2779                         error = xfs_btree_new_iroot(cur, &logflags, stat);
2780                         if (error || *stat == 0)
2781                                 return error;
2782
2783                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2784                 }
2785
2786                 return 0;
2787         }
2788
2789         /* First, try shifting an entry to the right neighbor. */
2790         error = xfs_btree_rshift(cur, level, stat);
2791         if (error || *stat)
2792                 return error;
2793
2794         /* Next, try shifting an entry to the left neighbor. */
2795         error = xfs_btree_lshift(cur, level, stat);
2796         if (error)
2797                 return error;
2798
2799         if (*stat) {
2800                 *oindex = *index = cur->bc_ptrs[level];
2801                 return 0;
2802         }
2803
2804         /*
2805          * Next, try splitting the current block in half.
2806          *
2807          * If this works we have to re-set our variables because we
2808          * could be in a different block now.
2809          */
2810         error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2811         if (error || *stat == 0)
2812                 return error;
2813
2814
2815         *index = cur->bc_ptrs[level];
2816         cur->bc_ops->init_rec_from_key(&key, nrec);
2817         return 0;
2818 }
2819
2820 /*
2821  * Insert one record/level.  Return information to the caller
2822  * allowing the next level up to proceed if necessary.
2823  */
2824 STATIC int
2825 xfs_btree_insrec(
2826         struct xfs_btree_cur    *cur,   /* btree cursor */
2827         int                     level,  /* level to insert record at */
2828         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
2829         union xfs_btree_rec     *recp,  /* i/o: record data inserted */
2830         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
2831         int                     *stat)  /* success/failure */
2832 {
2833         struct xfs_btree_block  *block; /* btree block */
2834         struct xfs_buf          *bp;    /* buffer for block */
2835         union xfs_btree_key     key;    /* btree key */
2836         union xfs_btree_ptr     nptr;   /* new block ptr */
2837         struct xfs_btree_cur    *ncur;  /* new btree cursor */
2838         union xfs_btree_rec     nrec;   /* new record count */
2839         int                     optr;   /* old key/record index */
2840         int                     ptr;    /* key/record index */
2841         int                     numrecs;/* number of records */
2842         int                     error;  /* error return value */
2843 #ifdef DEBUG
2844         int                     i;
2845 #endif
2846
2847         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2848         XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2849
2850         ncur = NULL;
2851
2852         /*
2853          * If we have an external root pointer, and we've made it to the
2854          * root level, allocate a new root block and we're done.
2855          */
2856         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2857             (level >= cur->bc_nlevels)) {
2858                 error = xfs_btree_new_root(cur, stat);
2859                 xfs_btree_set_ptr_null(cur, ptrp);
2860
2861                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2862                 return error;
2863         }
2864
2865         /* If we're off the left edge, return failure. */
2866         ptr = cur->bc_ptrs[level];
2867         if (ptr == 0) {
2868                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2869                 *stat = 0;
2870                 return 0;
2871         }
2872
2873         /* Make a key out of the record data to be inserted, and save it. */
2874         cur->bc_ops->init_key_from_rec(&key, recp);
2875
2876         optr = ptr;
2877
2878         XFS_BTREE_STATS_INC(cur, insrec);
2879
2880         /* Get pointers to the btree buffer and block. */
2881         block = xfs_btree_get_block(cur, level, &bp);
2882         numrecs = xfs_btree_get_numrecs(block);
2883
2884 #ifdef DEBUG
2885         error = xfs_btree_check_block(cur, block, level, bp);
2886         if (error)
2887                 goto error0;
2888
2889         /* Check that the new entry is being inserted in the right place. */
2890         if (ptr <= numrecs) {
2891                 if (level == 0) {
2892                         ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2893                                 xfs_btree_rec_addr(cur, ptr, block)));
2894                 } else {
2895                         ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2896                                 xfs_btree_key_addr(cur, ptr, block)));
2897                 }
2898         }
2899 #endif
2900
2901         /*
2902          * If the block is full, we can't insert the new entry until we
2903          * make the block un-full.
2904          */
2905         xfs_btree_set_ptr_null(cur, &nptr);
2906         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2907                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
2908                                         &optr, &ptr, &nptr, &ncur, &nrec, stat);
2909                 if (error || *stat == 0)
2910                         goto error0;
2911         }
2912
2913         /*
2914          * The current block may have changed if the block was
2915          * previously full and we have just made space in it.
2916          */
2917         block = xfs_btree_get_block(cur, level, &bp);
2918         numrecs = xfs_btree_get_numrecs(block);
2919
2920 #ifdef DEBUG
2921         error = xfs_btree_check_block(cur, block, level, bp);
2922         if (error)
2923                 return error;
2924 #endif
2925
2926         /*
2927          * At this point we know there's room for our new entry in the block
2928          * we're pointing at.
2929          */
2930         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
2931
2932         if (level > 0) {
2933                 /* It's a nonleaf. make a hole in the keys and ptrs */
2934                 union xfs_btree_key     *kp;
2935                 union xfs_btree_ptr     *pp;
2936
2937                 kp = xfs_btree_key_addr(cur, ptr, block);
2938                 pp = xfs_btree_ptr_addr(cur, ptr, block);
2939
2940 #ifdef DEBUG
2941                 for (i = numrecs - ptr; i >= 0; i--) {
2942                         error = xfs_btree_check_ptr(cur, pp, i, level);
2943                         if (error)
2944                                 return error;
2945                 }
2946 #endif
2947
2948                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
2949                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
2950
2951 #ifdef DEBUG
2952                 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
2953                 if (error)
2954                         goto error0;
2955 #endif
2956
2957                 /* Now put the new data in, bump numrecs and log it. */
2958                 xfs_btree_copy_keys(cur, kp, &key, 1);
2959                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
2960                 numrecs++;
2961                 xfs_btree_set_numrecs(block, numrecs);
2962                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
2963                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
2964 #ifdef DEBUG
2965                 if (ptr < numrecs) {
2966                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
2967                                 xfs_btree_key_addr(cur, ptr + 1, block)));
2968                 }
2969 #endif
2970         } else {
2971                 /* It's a leaf. make a hole in the records */
2972                 union xfs_btree_rec             *rp;
2973
2974                 rp = xfs_btree_rec_addr(cur, ptr, block);
2975
2976                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
2977
2978                 /* Now put the new data in, bump numrecs and log it. */
2979                 xfs_btree_copy_recs(cur, rp, recp, 1);
2980                 xfs_btree_set_numrecs(block, ++numrecs);
2981                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
2982 #ifdef DEBUG
2983                 if (ptr < numrecs) {
2984                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
2985                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
2986                 }
2987 #endif
2988         }
2989
2990         /* Log the new number of records in the btree header. */
2991         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
2992
2993         /* If we inserted at the start of a block, update the parents' keys. */
2994         if (optr == 1) {
2995                 error = xfs_btree_updkey(cur, &key, level + 1);
2996                 if (error)
2997                         goto error0;
2998         }
2999
3000         /*
3001          * If we are tracking the last record in the tree and
3002          * we are at the far right edge of the tree, update it.
3003          */
3004         if (xfs_btree_is_lastrec(cur, block, level)) {
3005                 cur->bc_ops->update_lastrec(cur, block, recp,
3006                                             ptr, LASTREC_INSREC);
3007         }
3008
3009         /*
3010          * Return the new block number, if any.
3011          * If there is one, give back a record value and a cursor too.
3012          */
3013         *ptrp = nptr;
3014         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3015                 *recp = nrec;
3016                 *curp = ncur;
3017         }
3018
3019         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3020         *stat = 1;
3021         return 0;
3022
3023 error0:
3024         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3025         return error;
3026 }
3027
3028 /*
3029  * Insert the record at the point referenced by cur.
3030  *
3031  * A multi-level split of the tree on insert will invalidate the original
3032  * cursor.  All callers of this function should assume that the cursor is
3033  * no longer valid and revalidate it.
3034  */
3035 int
3036 xfs_btree_insert(
3037         struct xfs_btree_cur    *cur,
3038         int                     *stat)
3039 {
3040         int                     error;  /* error return value */
3041         int                     i;      /* result value, 0 for failure */
3042         int                     level;  /* current level number in btree */
3043         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3044         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3045         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3046         union xfs_btree_rec     rec;    /* record to insert */
3047
3048         level = 0;
3049         ncur = NULL;
3050         pcur = cur;
3051
3052         xfs_btree_set_ptr_null(cur, &nptr);
3053         cur->bc_ops->init_rec_from_cur(cur, &rec);
3054
3055         /*
3056          * Loop going up the tree, starting at the leaf level.
3057          * Stop when we don't get a split block, that must mean that
3058          * the insert is finished with this level.
3059          */
3060         do {
3061                 /*
3062                  * Insert nrec/nptr into this level of the tree.
3063                  * Note if we fail, nptr will be null.
3064                  */
3065                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3066                 if (error) {
3067                         if (pcur != cur)
3068                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3069                         goto error0;
3070                 }
3071
3072                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3073                 level++;
3074
3075                 /*
3076                  * See if the cursor we just used is trash.
3077                  * Can't trash the caller's cursor, but otherwise we should
3078                  * if ncur is a new cursor or we're about to be done.
3079                  */
3080                 if (pcur != cur &&
3081                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3082                         /* Save the state from the cursor before we trash it */
3083                         if (cur->bc_ops->update_cursor)
3084                                 cur->bc_ops->update_cursor(pcur, cur);
3085                         cur->bc_nlevels = pcur->bc_nlevels;
3086                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3087                 }
3088                 /* If we got a new cursor, switch to it. */
3089                 if (ncur) {
3090                         pcur = ncur;
3091                         ncur = NULL;
3092                 }
3093         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3094
3095         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3096         *stat = i;
3097         return 0;
3098 error0:
3099         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3100         return error;
3101 }
3102
3103 /*
3104  * Try to merge a non-leaf block back into the inode root.
3105  *
3106  * Note: the killroot names comes from the fact that we're effectively
3107  * killing the old root block.  But because we can't just delete the
3108  * inode we have to copy the single block it was pointing to into the
3109  * inode.
3110  */
3111 STATIC int
3112 xfs_btree_kill_iroot(
3113         struct xfs_btree_cur    *cur)
3114 {
3115         int                     whichfork = cur->bc_private.b.whichfork;
3116         struct xfs_inode        *ip = cur->bc_private.b.ip;
3117         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3118         struct xfs_btree_block  *block;
3119         struct xfs_btree_block  *cblock;
3120         union xfs_btree_key     *kp;
3121         union xfs_btree_key     *ckp;
3122         union xfs_btree_ptr     *pp;
3123         union xfs_btree_ptr     *cpp;
3124         struct xfs_buf          *cbp;
3125         int                     level;
3126         int                     index;
3127         int                     numrecs;
3128 #ifdef DEBUG
3129         union xfs_btree_ptr     ptr;
3130         int                     i;
3131 #endif
3132
3133         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3134
3135         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3136         ASSERT(cur->bc_nlevels > 1);
3137
3138         /*
3139          * Don't deal with the root block needs to be a leaf case.
3140          * We're just going to turn the thing back into extents anyway.
3141          */
3142         level = cur->bc_nlevels - 1;
3143         if (level == 1)
3144                 goto out0;
3145
3146         /*
3147          * Give up if the root has multiple children.
3148          */
3149         block = xfs_btree_get_iroot(cur);
3150         if (xfs_btree_get_numrecs(block) != 1)
3151                 goto out0;
3152
3153         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3154         numrecs = xfs_btree_get_numrecs(cblock);
3155
3156         /*
3157          * Only do this if the next level will fit.
3158          * Then the data must be copied up to the inode,
3159          * instead of freeing the root you free the next level.
3160          */
3161         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3162                 goto out0;
3163
3164         XFS_BTREE_STATS_INC(cur, killroot);
3165
3166 #ifdef DEBUG
3167         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3168         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3169         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3170         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3171 #endif
3172
3173         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3174         if (index) {
3175                 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3176                                   cur->bc_private.b.whichfork);
3177                 block = ifp->if_broot;
3178         }
3179
3180         be16_add_cpu(&block->bb_numrecs, index);
3181         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3182
3183         kp = xfs_btree_key_addr(cur, 1, block);
3184         ckp = xfs_btree_key_addr(cur, 1, cblock);
3185         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3186
3187         pp = xfs_btree_ptr_addr(cur, 1, block);
3188         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3189 #ifdef DEBUG
3190         for (i = 0; i < numrecs; i++) {
3191                 int             error;
3192
3193                 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3194                 if (error) {
3195                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3196                         return error;
3197                 }
3198         }
3199 #endif
3200         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3201
3202         cur->bc_ops->free_block(cur, cbp);
3203         XFS_BTREE_STATS_INC(cur, free);
3204
3205         cur->bc_bufs[level - 1] = NULL;
3206         be16_add_cpu(&block->bb_level, -1);
3207         xfs_trans_log_inode(cur->bc_tp, ip,
3208                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3209         cur->bc_nlevels--;
3210 out0:
3211         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3212         return 0;
3213 }
3214
3215 /*
3216  * Kill the current root node, and replace it with it's only child node.
3217  */
3218 STATIC int
3219 xfs_btree_kill_root(
3220         struct xfs_btree_cur    *cur,
3221         struct xfs_buf          *bp,
3222         int                     level,
3223         union xfs_btree_ptr     *newroot)
3224 {
3225         int                     error;
3226
3227         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3228         XFS_BTREE_STATS_INC(cur, killroot);
3229
3230         /*
3231          * Update the root pointer, decreasing the level by 1 and then
3232          * free the old root.
3233          */
3234         cur->bc_ops->set_root(cur, newroot, -1);
3235
3236         error = cur->bc_ops->free_block(cur, bp);
3237         if (error) {
3238                 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3239                 return error;
3240         }
3241
3242         XFS_BTREE_STATS_INC(cur, free);
3243
3244         cur->bc_bufs[level] = NULL;
3245         cur->bc_ra[level] = 0;
3246         cur->bc_nlevels--;
3247
3248         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3249         return 0;
3250 }
3251
3252 STATIC int
3253 xfs_btree_dec_cursor(
3254         struct xfs_btree_cur    *cur,
3255         int                     level,
3256         int                     *stat)
3257 {
3258         int                     error;
3259         int                     i;
3260
3261         if (level > 0) {
3262                 error = xfs_btree_decrement(cur, level, &i);
3263                 if (error)
3264                         return error;
3265         }
3266
3267         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3268         *stat = 1;
3269         return 0;
3270 }
3271
3272 /*
3273  * Single level of the btree record deletion routine.
3274  * Delete record pointed to by cur/level.
3275  * Remove the record from its block then rebalance the tree.
3276  * Return 0 for error, 1 for done, 2 to go on to the next level.
3277  */
3278 STATIC int                                      /* error */
3279 xfs_btree_delrec(
3280         struct xfs_btree_cur    *cur,           /* btree cursor */
3281         int                     level,          /* level removing record from */
3282         int                     *stat)          /* fail/done/go-on */
3283 {
3284         struct xfs_btree_block  *block;         /* btree block */
3285         union xfs_btree_ptr     cptr;           /* current block ptr */
3286         struct xfs_buf          *bp;            /* buffer for block */
3287         int                     error;          /* error return value */
3288         int                     i;              /* loop counter */
3289         union xfs_btree_key     key;            /* storage for keyp */
3290         union xfs_btree_key     *keyp = &key;   /* passed to the next level */
3291         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3292         struct xfs_buf          *lbp;           /* left buffer pointer */
3293         struct xfs_btree_block  *left;          /* left btree block */
3294         int                     lrecs = 0;      /* left record count */
3295         int                     ptr;            /* key/record index */
3296         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3297         struct xfs_buf          *rbp;           /* right buffer pointer */
3298         struct xfs_btree_block  *right;         /* right btree block */
3299         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3300         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3301         int                     rrecs = 0;      /* right record count */
3302         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3303         int                     numrecs;        /* temporary numrec count */
3304
3305         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3306         XFS_BTREE_TRACE_ARGI(cur, level);
3307
3308         tcur = NULL;
3309
3310         /* Get the index of the entry being deleted, check for nothing there. */
3311         ptr = cur->bc_ptrs[level];
3312         if (ptr == 0) {
3313                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3314                 *stat = 0;
3315                 return 0;
3316         }
3317
3318         /* Get the buffer & block containing the record or key/ptr. */
3319         block = xfs_btree_get_block(cur, level, &bp);
3320         numrecs = xfs_btree_get_numrecs(block);
3321
3322 #ifdef DEBUG
3323         error = xfs_btree_check_block(cur, block, level, bp);
3324         if (error)
3325                 goto error0;
3326 #endif
3327
3328         /* Fail if we're off the end of the block. */
3329         if (ptr > numrecs) {
3330                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3331                 *stat = 0;
3332                 return 0;
3333         }
3334
3335         XFS_BTREE_STATS_INC(cur, delrec);
3336         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3337
3338         /* Excise the entries being deleted. */
3339         if (level > 0) {
3340                 /* It's a nonleaf. operate on keys and ptrs */
3341                 union xfs_btree_key     *lkp;
3342                 union xfs_btree_ptr     *lpp;
3343
3344                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3345                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3346
3347 #ifdef DEBUG
3348                 for (i = 0; i < numrecs - ptr; i++) {
3349                         error = xfs_btree_check_ptr(cur, lpp, i, level);
3350                         if (error)
3351                                 goto error0;
3352                 }
3353 #endif
3354
3355                 if (ptr < numrecs) {
3356                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3357                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3358                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3359                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3360                 }
3361
3362                 /*
3363                  * If it's the first record in the block, we'll need to pass a
3364                  * key up to the next level (updkey).
3365                  */
3366                 if (ptr == 1)
3367                         keyp = xfs_btree_key_addr(cur, 1, block);
3368         } else {
3369                 /* It's a leaf. operate on records */
3370                 if (ptr < numrecs) {
3371                         xfs_btree_shift_recs(cur,
3372                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3373                                 -1, numrecs - ptr);
3374                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3375                 }
3376
3377                 /*
3378                  * If it's the first record in the block, we'll need a key
3379                  * structure to pass up to the next level (updkey).
3380                  */
3381                 if (ptr == 1) {
3382                         cur->bc_ops->init_key_from_rec(&key,
3383                                         xfs_btree_rec_addr(cur, 1, block));
3384                         keyp = &key;
3385                 }
3386         }
3387
3388         /*
3389          * Decrement and log the number of entries in the block.
3390          */
3391         xfs_btree_set_numrecs(block, --numrecs);
3392         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3393
3394         /*
3395          * If we are tracking the last record in the tree and
3396          * we are at the far right edge of the tree, update it.
3397          */
3398         if (xfs_btree_is_lastrec(cur, block, level)) {
3399                 cur->bc_ops->update_lastrec(cur, block, NULL,
3400                                             ptr, LASTREC_DELREC);
3401         }
3402
3403         /*
3404          * We're at the root level.  First, shrink the root block in-memory.
3405          * Try to get rid of the next level down.  If we can't then there's
3406          * nothing left to do.
3407          */
3408         if (level == cur->bc_nlevels - 1) {
3409                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3410                         xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3411                                           cur->bc_private.b.whichfork);
3412
3413                         error = xfs_btree_kill_iroot(cur);
3414                         if (error)
3415                                 goto error0;
3416
3417                         error = xfs_btree_dec_cursor(cur, level, stat);
3418                         if (error)
3419                                 goto error0;
3420                         *stat = 1;
3421                         return 0;
3422                 }
3423
3424                 /*
3425                  * If this is the root level, and there's only one entry left,
3426                  * and it's NOT the leaf level, then we can get rid of this
3427                  * level.
3428                  */
3429                 if (numrecs == 1 && level > 0) {
3430                         union xfs_btree_ptr     *pp;
3431                         /*
3432                          * pp is still set to the first pointer in the block.
3433                          * Make it the new root of the btree.
3434                          */
3435                         pp = xfs_btree_ptr_addr(cur, 1, block);
3436                         error = xfs_btree_kill_root(cur, bp, level, pp);
3437                         if (error)
3438                                 goto error0;
3439                 } else if (level > 0) {
3440                         error = xfs_btree_dec_cursor(cur, level, stat);
3441                         if (error)
3442                                 goto error0;
3443                 }
3444                 *stat = 1;
3445                 return 0;
3446         }
3447
3448         /*
3449          * If we deleted the leftmost entry in the block, update the
3450          * key values above us in the tree.
3451          */
3452         if (ptr == 1) {
3453                 error = xfs_btree_updkey(cur, keyp, level + 1);
3454                 if (error)
3455                         goto error0;
3456         }
3457
3458         /*
3459          * If the number of records remaining in the block is at least
3460          * the minimum, we're done.
3461          */
3462         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3463                 error = xfs_btree_dec_cursor(cur, level, stat);
3464                 if (error)
3465                         goto error0;
3466                 return 0;
3467         }
3468
3469         /*
3470          * Otherwise, we have to move some records around to keep the
3471          * tree balanced.  Look at the left and right sibling blocks to
3472          * see if we can re-balance by moving only one record.
3473          */
3474         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3475         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3476
3477         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3478                 /*
3479                  * One child of root, need to get a chance to copy its contents
3480                  * into the root and delete it. Can't go up to next level,
3481                  * there's nothing to delete there.
3482                  */
3483                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3484                     xfs_btree_ptr_is_null(cur, &lptr) &&
3485                     level == cur->bc_nlevels - 2) {
3486                         error = xfs_btree_kill_iroot(cur);
3487                         if (!error)
3488                                 error = xfs_btree_dec_cursor(cur, level, stat);
3489                         if (error)
3490                                 goto error0;
3491                         return 0;
3492                 }
3493         }
3494
3495         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3496                !xfs_btree_ptr_is_null(cur, &lptr));
3497
3498         /*
3499          * Duplicate the cursor so our btree manipulations here won't
3500          * disrupt the next level up.
3501          */
3502         error = xfs_btree_dup_cursor(cur, &tcur);
3503         if (error)
3504                 goto error0;
3505
3506         /*
3507          * If there's a right sibling, see if it's ok to shift an entry
3508          * out of it.
3509          */
3510         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3511                 /*
3512                  * Move the temp cursor to the last entry in the next block.
3513                  * Actually any entry but the first would suffice.
3514                  */
3515                 i = xfs_btree_lastrec(tcur, level);
3516                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3517
3518                 error = xfs_btree_increment(tcur, level, &i);
3519                 if (error)
3520                         goto error0;
3521                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3522
3523                 i = xfs_btree_lastrec(tcur, level);
3524                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3525
3526                 /* Grab a pointer to the block. */
3527                 right = xfs_btree_get_block(tcur, level, &rbp);
3528 #ifdef DEBUG
3529                 error = xfs_btree_check_block(tcur, right, level, rbp);
3530                 if (error)
3531                         goto error0;
3532 #endif
3533                 /* Grab the current block number, for future use. */
3534                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3535
3536                 /*
3537                  * If right block is full enough so that removing one entry
3538                  * won't make it too empty, and left-shifting an entry out
3539                  * of right to us works, we're done.
3540                  */
3541                 if (xfs_btree_get_numrecs(right) - 1 >=
3542                     cur->bc_ops->get_minrecs(tcur, level)) {
3543                         error = xfs_btree_lshift(tcur, level, &i);
3544                         if (error)
3545                                 goto error0;
3546                         if (i) {
3547                                 ASSERT(xfs_btree_get_numrecs(block) >=
3548                                        cur->bc_ops->get_minrecs(tcur, level));
3549
3550                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3551                                 tcur = NULL;
3552
3553                                 error = xfs_btree_dec_cursor(cur, level, stat);
3554                                 if (error)
3555                                         goto error0;
3556                                 return 0;
3557                         }
3558                 }
3559
3560                 /*
3561                  * Otherwise, grab the number of records in right for
3562                  * future reference, and fix up the temp cursor to point
3563                  * to our block again (last record).
3564                  */
3565                 rrecs = xfs_btree_get_numrecs(right);
3566                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3567                         i = xfs_btree_firstrec(tcur, level);
3568                         XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3569
3570                         error = xfs_btree_decrement(tcur, level, &i);
3571                         if (error)
3572                                 goto error0;
3573                         XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3574                 }
3575         }
3576
3577         /*
3578          * If there's a left sibling, see if it's ok to shift an entry
3579          * out of it.
3580          */
3581         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3582                 /*
3583                  * Move the temp cursor to the first entry in the
3584                  * previous block.
3585                  */
3586                 i = xfs_btree_firstrec(tcur, level);
3587                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3588
3589                 error = xfs_btree_decrement(tcur, level, &i);
3590                 if (error)
3591                         goto error0;
3592                 i = xfs_btree_firstrec(tcur, level);
3593                 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3594
3595                 /* Grab a pointer to the block. */
3596                 left = xfs_btree_get_block(tcur, level, &lbp);
3597 #ifdef DEBUG
3598                 error = xfs_btree_check_block(cur, left, level, lbp);
3599                 if (error)
3600                         goto error0;
3601 #endif
3602                 /* Grab the current block number, for future use. */
3603                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3604
3605                 /*
3606                  * If left block is full enough so that removing one entry
3607                  * won't make it too empty, and right-shifting an entry out
3608                  * of left to us works, we're done.
3609                  */
3610                 if (xfs_btree_get_numrecs(left) - 1 >=
3611                     cur->bc_ops->get_minrecs(tcur, level)) {
3612                         error = xfs_btree_rshift(tcur, level, &i);
3613                         if (error)
3614                                 goto error0;
3615                         if (i) {
3616                                 ASSERT(xfs_btree_get_numrecs(block) >=
3617                                        cur->bc_ops->get_minrecs(tcur, level));
3618                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3619                                 tcur = NULL;
3620                                 if (level == 0)
3621                                         cur->bc_ptrs[0]++;
3622                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3623                                 *stat = 1;
3624                                 return 0;
3625                         }
3626                 }
3627
3628                 /*
3629                  * Otherwise, grab the number of records in right for
3630                  * future reference.
3631                  */
3632                 lrecs = xfs_btree_get_numrecs(left);
3633         }
3634
3635         /* Delete the temp cursor, we're done with it. */
3636         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3637         tcur = NULL;
3638
3639         /* If here, we need to do a join to keep the tree balanced. */
3640         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3641
3642         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3643             lrecs + xfs_btree_get_numrecs(block) <=
3644                         cur->bc_ops->get_maxrecs(cur, level)) {
3645                 /*
3646                  * Set "right" to be the starting block,
3647                  * "left" to be the left neighbor.
3648                  */
3649                 rptr = cptr;
3650                 right = block;
3651                 rbp = bp;
3652                 error = xfs_btree_read_buf_block(cur, &lptr, level,
3653                                                         0, &left, &lbp);
3654                 if (error)
3655                         goto error0;
3656
3657         /*
3658          * If that won't work, see if we can join with the right neighbor block.
3659          */
3660         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3661                    rrecs + xfs_btree_get_numrecs(block) <=
3662                         cur->bc_ops->get_maxrecs(cur, level)) {
3663                 /*
3664                  * Set "left" to be the starting block,
3665                  * "right" to be the right neighbor.
3666                  */
3667                 lptr = cptr;
3668                 left = block;
3669                 lbp = bp;
3670                 error = xfs_btree_read_buf_block(cur, &rptr, level,
3671                                                         0, &right, &rbp);
3672                 if (error)
3673                         goto error0;
3674
3675         /*
3676          * Otherwise, we can't fix the imbalance.
3677          * Just return.  This is probably a logic error, but it's not fatal.
3678          */
3679         } else {
3680                 error = xfs_btree_dec_cursor(cur, level, stat);
3681                 if (error)
3682                         goto error0;
3683                 return 0;
3684         }
3685
3686         rrecs = xfs_btree_get_numrecs(right);
3687         lrecs = xfs_btree_get_numrecs(left);
3688
3689         /*
3690          * We're now going to join "left" and "right" by moving all the stuff
3691          * in "right" to "left" and deleting "right".
3692          */
3693         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3694         if (level > 0) {
3695                 /* It's a non-leaf.  Move keys and pointers. */
3696                 union xfs_btree_key     *lkp;   /* left btree key */
3697                 union xfs_btree_ptr     *lpp;   /* left address pointer */
3698                 union xfs_btree_key     *rkp;   /* right btree key */
3699                 union xfs_btree_ptr     *rpp;   /* right address pointer */
3700
3701                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3702                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3703                 rkp = xfs_btree_key_addr(cur, 1, right);
3704                 rpp = xfs_btree_ptr_addr(cur, 1, right);
3705 #ifdef DEBUG
3706                 for (i = 1; i < rrecs; i++) {
3707                         error = xfs_btree_check_ptr(cur, rpp, i, level);
3708                         if (error)
3709                                 goto error0;
3710                 }
3711 #endif
3712                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3713                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3714
3715                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3716                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3717         } else {
3718                 /* It's a leaf.  Move records.  */
3719                 union xfs_btree_rec     *lrp;   /* left record pointer */
3720                 union xfs_btree_rec     *rrp;   /* right record pointer */
3721
3722                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3723                 rrp = xfs_btree_rec_addr(cur, 1, right);
3724
3725                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3726                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3727         }
3728
3729         XFS_BTREE_STATS_INC(cur, join);
3730
3731         /*
3732          * Fix up the number of records and right block pointer in the
3733          * surviving block, and log it.
3734          */
3735         xfs_btree_set_numrecs(left, lrecs + rrecs);
3736         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3737         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3738         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3739
3740         /* If there is a right sibling, point it to the remaining block. */
3741         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3742         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3743                 error = xfs_btree_read_buf_block(cur, &cptr, level,
3744                                                         0, &rrblock, &rrbp);
3745                 if (error)
3746                         goto error0;
3747                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3748                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3749         }
3750
3751         /* Free the deleted block. */
3752         error = cur->bc_ops->free_block(cur, rbp);
3753         if (error)
3754                 goto error0;
3755         XFS_BTREE_STATS_INC(cur, free);
3756
3757         /*
3758          * If we joined with the left neighbor, set the buffer in the
3759          * cursor to the left block, and fix up the index.
3760          */
3761         if (bp != lbp) {
3762                 cur->bc_bufs[level] = lbp;
3763                 cur->bc_ptrs[level] += lrecs;
3764                 cur->bc_ra[level] = 0;
3765         }
3766         /*
3767          * If we joined with the right neighbor and there's a level above
3768          * us, increment the cursor at that level.
3769          */
3770         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3771                    (level + 1 < cur->bc_nlevels)) {
3772                 error = xfs_btree_increment(cur, level + 1, &i);
3773                 if (error)
3774                         goto error0;
3775         }
3776
3777         /*
3778          * Readjust the ptr at this level if it's not a leaf, since it's
3779          * still pointing at the deletion point, which makes the cursor
3780          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
3781          * We can't use decrement because it would change the next level up.
3782          */
3783         if (level > 0)
3784                 cur->bc_ptrs[level]--;
3785
3786         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3787         /* Return value means the next level up has something to do. */
3788         *stat = 2;
3789         return 0;
3790
3791 error0:
3792         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3793         if (tcur)
3794                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3795         return error;
3796 }
3797
3798 /*
3799  * Delete the record pointed to by cur.
3800  * The cursor refers to the place where the record was (could be inserted)
3801  * when the operation returns.
3802  */
3803 int                                     /* error */
3804 xfs_btree_delete(
3805         struct xfs_btree_cur    *cur,
3806         int                     *stat)  /* success/failure */
3807 {
3808         int                     error;  /* error return value */
3809         int                     level;
3810         int                     i;
3811
3812         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3813
3814         /*
3815          * Go up the tree, starting at leaf level.
3816          *
3817          * If 2 is returned then a join was done; go to the next level.
3818          * Otherwise we are done.
3819          */
3820         for (level = 0, i = 2; i == 2; level++) {
3821                 error = xfs_btree_delrec(cur, level, &i);
3822                 if (error)
3823                         goto error0;
3824         }
3825
3826         if (i == 0) {
3827                 for (level = 1; level < cur->bc_nlevels; level++) {
3828                         if (cur->bc_ptrs[level] == 0) {
3829                                 error = xfs_btree_decrement(cur, level, &i);
3830                                 if (error)
3831                                         goto error0;
3832                                 break;
3833                         }
3834                 }
3835         }
3836
3837         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3838         *stat = i;
3839         return 0;
3840 error0:
3841         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3842         return error;
3843 }
3844
3845 /*
3846  * Get the data from the pointed-to record.
3847  */
3848 int                                     /* error */
3849 xfs_btree_get_rec(
3850         struct xfs_btree_cur    *cur,   /* btree cursor */
3851         union xfs_btree_rec     **recp, /* output: btree record */
3852         int                     *stat)  /* output: success/failure */
3853 {
3854         struct xfs_btree_block  *block; /* btree block */
3855         struct xfs_buf          *bp;    /* buffer pointer */
3856         int                     ptr;    /* record number */
3857 #ifdef DEBUG
3858         int                     error;  /* error return value */
3859 #endif
3860
3861         ptr = cur->bc_ptrs[0];
3862         block = xfs_btree_get_block(cur, 0, &bp);
3863
3864 #ifdef DEBUG
3865         error = xfs_btree_check_block(cur, block, 0, bp);
3866         if (error)
3867                 return error;
3868 #endif
3869
3870         /*
3871          * Off the right end or left end, return failure.
3872          */
3873         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3874                 *stat = 0;
3875                 return 0;
3876         }
3877
3878         /*
3879          * Point to the record and extract its data.
3880          */
3881         *recp = xfs_btree_rec_addr(cur, ptr, block);
3882         *stat = 1;
3883         return 0;
3884 }
3885
3886 /*
3887  * Change the owner of a btree.
3888  *
3889  * The mechanism we use here is ordered buffer logging. Because we don't know
3890  * how many buffers were are going to need to modify, we don't really want to
3891  * have to make transaction reservations for the worst case of every buffer in a
3892  * full size btree as that may be more space that we can fit in the log....
3893  *
3894  * We do the btree walk in the most optimal manner possible - we have sibling
3895  * pointers so we can just walk all the blocks on each level from left to right
3896  * in a single pass, and then move to the next level and do the same. We can
3897  * also do readahead on the sibling pointers to get IO moving more quickly,
3898  * though for slow disks this is unlikely to make much difference to performance
3899  * as the amount of CPU work we have to do before moving to the next block is
3900  * relatively small.
3901  *
3902  * For each btree block that we load, modify the owner appropriately, set the
3903  * buffer as an ordered buffer and log it appropriately. We need to ensure that
3904  * we mark the region we change dirty so that if the buffer is relogged in
3905  * a subsequent transaction the changes we make here as an ordered buffer are
3906  * correctly relogged in that transaction.  If we are in recovery context, then
3907  * just queue the modified buffer as delayed write buffer so the transaction
3908  * recovery completion writes the changes to disk.
3909  */
3910 static int
3911 xfs_btree_block_change_owner(
3912         struct xfs_btree_cur    *cur,
3913         int                     level,
3914         __uint64_t              new_owner,
3915         struct list_head        *buffer_list)
3916 {
3917         struct xfs_btree_block  *block;
3918         struct xfs_buf          *bp;
3919         union xfs_btree_ptr     rptr;
3920
3921         /* do right sibling readahead */
3922         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3923
3924         /* modify the owner */
3925         block = xfs_btree_get_block(cur, level, &bp);
3926         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3927                 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3928         else
3929                 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3930
3931         /*
3932          * If the block is a root block hosted in an inode, we might not have a
3933          * buffer pointer here and we shouldn't attempt to log the change as the
3934          * information is already held in the inode and discarded when the root
3935          * block is formatted into the on-disk inode fork. We still change it,
3936          * though, so everything is consistent in memory.
3937          */
3938         if (bp) {
3939                 if (cur->bc_tp) {
3940                         xfs_trans_ordered_buf(cur->bc_tp, bp);
3941                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
3942                 } else {
3943                         xfs_buf_delwri_queue(bp, buffer_list);
3944                 }
3945         } else {
3946                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3947                 ASSERT(level == cur->bc_nlevels - 1);
3948         }
3949
3950         /* now read rh sibling block for next iteration */
3951         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3952         if (xfs_btree_ptr_is_null(cur, &rptr))
3953                 return ENOENT;
3954
3955         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
3956 }
3957
3958 int
3959 xfs_btree_change_owner(
3960         struct xfs_btree_cur    *cur,
3961         __uint64_t              new_owner,
3962         struct list_head        *buffer_list)
3963 {
3964         union xfs_btree_ptr     lptr;
3965         int                     level;
3966         struct xfs_btree_block  *block = NULL;
3967         int                     error = 0;
3968
3969         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
3970
3971         /* for each level */
3972         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
3973                 /* grab the left hand block */
3974                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
3975                 if (error)
3976                         return error;
3977
3978                 /* readahead the left most block for the next level down */
3979                 if (level > 0) {
3980                         union xfs_btree_ptr     *ptr;
3981
3982                         ptr = xfs_btree_ptr_addr(cur, 1, block);
3983                         xfs_btree_readahead_ptr(cur, ptr, 1);
3984
3985                         /* save for the next iteration of the loop */
3986                         lptr = *ptr;
3987                 }
3988
3989                 /* for each buffer in the level */
3990                 do {
3991                         error = xfs_btree_block_change_owner(cur, level,
3992                                                              new_owner,
3993                                                              buffer_list);
3994                 } while (!error);
3995
3996                 if (error != ENOENT)
3997                         return error;
3998         }
3999
4000         return 0;
4001 }