1 /*
2  *  linux/fs/ext4/balloc.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10  *  Big-endian to little-endian byte-swapping/bitmaps by
11  *        David S. Miller (davem@caip.rutgers.edu), 1995
12  */
13
14 #include <linux/time.h>
15 #include <linux/capability.h>
16 #include <linux/fs.h>
17 #include <linux/jbd2.h>
18 #include <linux/quotaops.h>
19 #include <linux/buffer_head.h>
20 #include "ext4.h"
21 #include "ext4_jbd2.h"
22 #include "group.h"
23 #include "mballoc.h"
24
25 /*
26  * balloc.c contains the block allocation and deallocation routines
27  */
28
29 /*
30  * Calculate the block group number and offset, given a block number
31  */
32 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
33                 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
34 {
35         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
36         ext4_grpblk_t offset;
37
38         blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
39         offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
40         if (offsetp)
41                 *offsetp = offset;
42         if (blockgrpp)
43                 *blockgrpp = blocknr;
44
45 }
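/*
 * Worked example (values assumed for illustration): with
 * s_first_data_block = 0 and EXT4_BLOCKS_PER_GROUP(sb) = 32768,
 * blocknr = 100000 gives do_div(100000, 32768) -> quotient 3,
 * remainder 1696, so *blockgrpp = 3 and *offsetp = 1696.
 */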
46
47 static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
48                         ext4_group_t block_group)
49 {
50         ext4_group_t actual_group;
51         ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
52         if (actual_group == block_group)
53                 return 1;
54         return 0;
55 }
56
57 static int ext4_group_used_meta_blocks(struct super_block *sb,
58                                 ext4_group_t block_group)
59 {
60         ext4_fsblk_t tmp;
61         struct ext4_sb_info *sbi = EXT4_SB(sb);
62         /* block bitmap, inode bitmap, and inode table blocks */
63         int used_blocks = sbi->s_itb_per_group + 2;
64
65         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
66                 struct ext4_group_desc *gdp;
67                 struct buffer_head *bh;
68
69                 gdp = ext4_get_group_desc(sb, block_group, &bh);
70                 if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
71                                         block_group))
72                         used_blocks--;
73
74                 if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
75                                         block_group))
76                         used_blocks--;
77
78                 tmp = ext4_inode_table(sb, gdp);
79                 for (; tmp < ext4_inode_table(sb, gdp) +
80                                 sbi->s_itb_per_group; tmp++) {
81                         if (!ext4_block_in_group(sb, tmp, block_group))
82                                 used_blocks -= 1;
83                 }
84         }
85         return used_blocks;
86 }
87 /* Initializes an uninitialized block bitmap (if bh is non-NULL) and returns
88  * the number of free blocks in the group. */
89 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
90                  ext4_group_t block_group, struct ext4_group_desc *gdp)
91 {
92         int bit, bit_max;
93         unsigned free_blocks, group_blocks;
94         struct ext4_sb_info *sbi = EXT4_SB(sb);
95
96         if (bh) {
97                 J_ASSERT_BH(bh, buffer_locked(bh));
98
99                 /* If the checksum is bad, mark all blocks used to prevent allocation,
100                  * essentially implementing a per-group read-only flag. */
101                 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
102                         ext4_error(sb, __func__,
103                                   "Checksum bad for group %lu\n", block_group);
104                         gdp->bg_free_blocks_count = 0;
105                         gdp->bg_free_inodes_count = 0;
106                         gdp->bg_itable_unused = 0;
107                         memset(bh->b_data, 0xff, sb->s_blocksize);
108                         return 0;
109                 }
110                 memset(bh->b_data, 0, sb->s_blocksize);
111         }
112
113         /* Check for superblock and gdt backups in this group */
114         bit_max = ext4_bg_has_super(sb, block_group);
115
116         if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
117             block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
118                           sbi->s_desc_per_block) {
119                 if (bit_max) {
120                         bit_max += ext4_bg_num_gdb(sb, block_group);
121                         bit_max +=
122                                 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
123                 }
124         } else { /* For META_BG_BLOCK_GROUPS */
125                 bit_max += ext4_bg_num_gdb(sb, block_group);
126         }
127
128         if (block_group == sbi->s_groups_count - 1) {
129                 /*
130                  * Even though mke2fs always initializes the first and last group,
131                  * if some other tool enabled EXT4_BG_BLOCK_UNINIT we need
132                  * to make sure we calculate the right number of free blocks
133                  */
134                 group_blocks = ext4_blocks_count(sbi->s_es) -
135                         le32_to_cpu(sbi->s_es->s_first_data_block) -
136                         (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
137         } else {
138                 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
139         }
140
141         free_blocks = group_blocks - bit_max;
142
143         if (bh) {
144                 ext4_fsblk_t start, tmp;
145                 int flex_bg = 0;
146
147                 for (bit = 0; bit < bit_max; bit++)
148                         ext4_set_bit(bit, bh->b_data);
149
150                 start = ext4_group_first_block_no(sb, block_group);
151
152                 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
153                                               EXT4_FEATURE_INCOMPAT_FLEX_BG))
154                         flex_bg = 1;
155
156                 /* Set bits for block and inode bitmaps, and inode table */
157                 tmp = ext4_block_bitmap(sb, gdp);
158                 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
159                         ext4_set_bit(tmp - start, bh->b_data);
160
161                 tmp = ext4_inode_bitmap(sb, gdp);
162                 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
163                         ext4_set_bit(tmp - start, bh->b_data);
164
165                 tmp = ext4_inode_table(sb, gdp);
166                 for (; tmp < ext4_inode_table(sb, gdp) +
167                                 sbi->s_itb_per_group; tmp++) {
168                         if (!flex_bg ||
169                                 ext4_block_in_group(sb, tmp, block_group))
170                                 ext4_set_bit(tmp - start, bh->b_data);
171                 }
172                 /*
173                  * Also, if the number of blocks within the group is
174                  * less than blocksize * 8 (which is the size of the
175                  * bitmap), set the rest of the block bitmap to 1
176                  */
177                 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
178         }
179         return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
180 }
181
182
183 /*
184  * The free blocks are managed by bitmaps.  A file system contains several
185  * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
186  * block for inodes, N blocks for the inode table, and data blocks.
187  *
188  * The file system contains group descriptors which are located after the
189  * super block.  Each descriptor contains the number of the bitmap block and
190  * the free blocks count in the group.  The descriptors are loaded in memory
191  * when a file system is mounted (see ext4_fill_super).
192  */
193
194
195 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
196
197 /**
198  * ext4_get_group_desc() -- load group descriptor from disk
199  * @sb:                 super block
200  * @block_group:        given block group
201  * @bh:                 pointer to the buffer head to store the block
202  *                      group descriptor
203  */
204 struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
205                                              ext4_group_t block_group,
206                                              struct buffer_head ** bh)
207 {
208         unsigned long group_desc;
209         unsigned long offset;
210         struct ext4_group_desc * desc;
211         struct ext4_sb_info *sbi = EXT4_SB(sb);
212
213         if (block_group >= sbi->s_groups_count) {
214                 ext4_error (sb, "ext4_get_group_desc",
215                             "block_group >= groups_count - "
216                             "block_group = %lu, groups_count = %lu",
217                             block_group, sbi->s_groups_count);
218
219                 return NULL;
220         }
221         smp_rmb();
222
223         group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
224         offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
225         if (!sbi->s_group_desc[group_desc]) {
226                 ext4_error (sb, "ext4_get_group_desc",
227                             "Group descriptor not loaded - "
228                             "block_group = %lu, group_desc = %lu, desc = %lu",
229                              block_group, group_desc, offset);
230                 return NULL;
231         }
232
233         desc = (struct ext4_group_desc *)(
234                 (__u8 *)sbi->s_group_desc[group_desc]->b_data +
235                 offset * EXT4_DESC_SIZE(sb));
236         if (bh)
237                 *bh = sbi->s_group_desc[group_desc];
238         return desc;
239 }
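/*
 * Worked example for ext4_get_group_desc() (assuming a 4KiB block size
 * and 32-byte descriptors, i.e. EXT4_DESC_PER_BLOCK(sb) = 128 and
 * EXT4_DESC_PER_BLOCK_BITS(sb) = 7): block_group 300 lives in
 * descriptor block 300 >> 7 = 2, at slot 300 & 127 = 44, i.e. at byte
 * offset 44 * EXT4_DESC_SIZE(sb) = 1408 within that buffer's b_data.
 */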
240
241 static int ext4_valid_block_bitmap(struct super_block *sb,
242                                         struct ext4_group_desc *desc,
243                                         unsigned int block_group,
244                                         struct buffer_head *bh)
245 {
246         ext4_grpblk_t offset;
247         ext4_grpblk_t next_zero_bit;
248         ext4_fsblk_t bitmap_blk;
249         ext4_fsblk_t group_first_block;
250
251         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
252                 /* With FLEX_BG, the inode/block bitmaps and itable
253                  * blocks may not be in the group at all, so bitmap
254                  * validation is skipped for those groups; otherwise we
255                  * would also have to read the block group where the
256                  * bitmaps are located to verify they are set.
257                  */
258                 return 1;
259         }
260         group_first_block = ext4_group_first_block_no(sb, block_group);
261
262         /* check whether block bitmap block number is set */
263         bitmap_blk = ext4_block_bitmap(sb, desc);
264         offset = bitmap_blk - group_first_block;
265         if (!ext4_test_bit(offset, bh->b_data))
266                 /* bad block bitmap */
267                 goto err_out;
268
269         /* check whether the inode bitmap block number is set */
270         bitmap_blk = ext4_inode_bitmap(sb, desc);
271         offset = bitmap_blk - group_first_block;
272         if (!ext4_test_bit(offset, bh->b_data))
273                 /* bad block bitmap */
274                 goto err_out;
275
276         /* check whether the inode table block number is set */
277         bitmap_blk = ext4_inode_table(sb, desc);
278         offset = bitmap_blk - group_first_block;
279         next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
280                                 offset + EXT4_SB(sb)->s_itb_per_group,
281                                 offset);
282         if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
283                 /* good bitmap for inode tables */
284                 return 1;
285
286 err_out:
287         ext4_error(sb, __func__,
288                         "Invalid block bitmap - "
289                         "block_group = %d, block = %llu",
290                         block_group, bitmap_blk);
291         return 0;
292 }
293 /**
294  * ext4_read_block_bitmap()
295  * @sb:                 super block
296  * @block_group:        given block group
297  *
298  * Read the bitmap for a given block_group, and validate that the
299  * bits for the block/inode bitmaps and inode table are set in the bitmap.
300  *
301  * Return buffer_head on success or NULL in case of failure.
302  */
303 struct buffer_head *
304 ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
305 {
306         struct ext4_group_desc * desc;
307         struct buffer_head * bh = NULL;
308         ext4_fsblk_t bitmap_blk;
309
310         desc = ext4_get_group_desc(sb, block_group, NULL);
311         if (!desc)
312                 return NULL;
313         bitmap_blk = ext4_block_bitmap(sb, desc);
314         bh = sb_getblk(sb, bitmap_blk);
315         if (unlikely(!bh)) {
316                 ext4_error(sb, __func__,
317                             "Cannot read block bitmap - "
318                             "block_group = %lu, block_bitmap = %llu",
319                             block_group, bitmap_blk);
320                 return NULL;
321         }
322
323         if (bitmap_uptodate(bh))
324                 return bh;
325
326         lock_buffer(bh);
327         if (bitmap_uptodate(bh)) {
328                 unlock_buffer(bh);
329                 return bh;
330         }
331         spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
332         if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
333                 ext4_init_block_bitmap(sb, bh, block_group, desc);
334                 set_bitmap_uptodate(bh);
335                 set_buffer_uptodate(bh);
336                 unlock_buffer(bh);
337                 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
338                 return bh;
339         }
340         spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
341         if (buffer_uptodate(bh)) {
342                 /*
343                  * if the group is not uninit and bh is uptodate,
344                  * the bitmap is also uptodate
345                  */
346                 set_bitmap_uptodate(bh);
347                 unlock_buffer(bh);
348                 return bh;
349         }
350         /*
351          * submit the buffer_head for read. We can
352          * safely mark the bitmap as uptodate now.
353          * We do it here so the bitmap uptodate bit
354          * gets set with the buffer lock held.
355          */
356         set_bitmap_uptodate(bh);
357         if (bh_submit_read(bh) < 0) {
358                 put_bh(bh);
359                 ext4_error(sb, __func__,
360                             "Cannot read block bitmap - "
361                             "block_group = %lu, block_bitmap = %llu",
362                             block_group, bitmap_blk);
363                 return NULL;
364         }
365         ext4_valid_block_bitmap(sb, desc, block_group, bh);
366         /*
367          * the file system was mounted not to panic on error,
368          * so continue with a corrupt bitmap
369          */
370         return bh;
371 }
372 /*
373  * The reservation window structure operations
374  * --------------------------------------------
375  * Operations include:
376  * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
377  *
378  * We use a red-black tree to represent per-filesystem reservation
379  * windows.
380  *
381  */
382
383 /**
384  * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
385  * @rb_root:            root of per-filesystem reservation rb tree
386  * @verbose:            verbose mode
387  * @fn:                 function which wishes to dump the reservation map
388  *
389  * If verbose is turned on, it will print the whole block reservation
390  * windows (start, end). Otherwise, it will only print out the "bad" windows,
391  * those windows that overlap with their immediate neighbors.
392  */
393 #if 1
394 static void __rsv_window_dump(struct rb_root *root, int verbose,
395                               const char *fn)
396 {
397         struct rb_node *n;
398         struct ext4_reserve_window_node *rsv, *prev;
399         int bad;
400
401 restart:
402         n = rb_first(root);
403         bad = 0;
404         prev = NULL;
405
406         printk("Block Allocation Reservation Windows Map (%s):\n", fn);
407         while (n) {
408                 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
409                 if (verbose)
410                         printk("reservation window 0x%p "
411                                "start:  %llu, end:  %llu\n",
412                                rsv, rsv->rsv_start, rsv->rsv_end);
413                 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
414                         printk("Bad reservation %p (start >= end)\n",
415                                rsv);
416                         bad = 1;
417                 }
418                 if (prev && prev->rsv_end >= rsv->rsv_start) {
419                         printk("Bad reservation %p (prev->end >= start)\n",
420                                rsv);
421                         bad = 1;
422                 }
423                 if (bad) {
424                         if (!verbose) {
425                                 printk("Restarting reservation walk in verbose mode\n");
426                                 verbose = 1;
427                                 goto restart;
428                         }
429                 }
430                 n = rb_next(n);
431                 prev = rsv;
432         }
433         printk("Window map complete.\n");
434         BUG_ON(bad);
435 }
436 #define rsv_window_dump(root, verbose) \
437         __rsv_window_dump((root), (verbose), __func__)
438 #else
439 #define rsv_window_dump(root, verbose) do {} while (0)
440 #endif
441
442 /**
443  * goal_in_my_reservation()
444  * @rsv:                inode's reservation window
445  * @grp_goal:           given goal block relative to the allocation block group
446  * @group:              the current allocation block group
447  * @sb:                 filesystem super block
448  *
449  * Test if the given goal block (group relative) is within the file's
450  * own block reservation window range.
451  *
452  * If the reservation window is outside the goal allocation group, return 0;
453  * grp_goal (given goal block) could be -1, which means no specific
454  * goal block. In this case, always return 1.
455  * If the goal block is within the reservation window, return 1;
456  * otherwise, return 0;
457  */
458 static int
459 goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
460                         ext4_group_t group, struct super_block *sb)
461 {
462         ext4_fsblk_t group_first_block, group_last_block;
463
464         group_first_block = ext4_group_first_block_no(sb, group);
465         group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
466
467         if ((rsv->_rsv_start > group_last_block) ||
468             (rsv->_rsv_end < group_first_block))
469                 return 0;
470         if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
471                 || (grp_goal + group_first_block > rsv->_rsv_end)))
472                 return 0;
473         return 1;
474 }
475
476 /**
477  * search_reserve_window()
478  * @rb_root:            root of reservation tree
479  * @goal:               target allocation block
480  *
481  * Find the reserved window which includes the goal, or the previous one
482  * if the goal is not in any window.
483  * Returns NULL if there are no windows or if all windows start after the goal.
484  */
485 static struct ext4_reserve_window_node *
486 search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
487 {
488         struct rb_node *n = root->rb_node;
489         struct ext4_reserve_window_node *rsv;
490
491         if (!n)
492                 return NULL;
493
494         do {
495                 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
496
497                 if (goal < rsv->rsv_start)
498                         n = n->rb_left;
499                 else if (goal > rsv->rsv_end)
500                         n = n->rb_right;
501                 else
502                         return rsv;
503         } while (n);
504         /*
505          * We've fallen off the end of the tree: the goal wasn't inside
506          * any particular node.  OK, the previous node must be to one
507          * side of the interval containing the goal.  If it's the RHS,
508          * we need to back up one.
509          */
510         if (rsv->rsv_start > goal) {
511                 n = rb_prev(&rsv->rsv_node);
512                 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
513         }
514         return rsv;
515 }
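/*
 * Example for search_reserve_window() (window ranges assumed for
 * illustration): with windows covering blocks [100, 119] and
 * [200, 239], a goal of 210 returns the [200, 239] node; a goal of
 * 150 falls between the windows and the preceding [100, 119] node is
 * returned; a goal of 50 precedes every window, so NULL is returned
 * as described above.
 */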
516
517 /**
518  * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
519  * @sb:                 super block
520  * @rsv:                reservation window to add
521  *
522  * Must be called with rsv_lock held.
523  */
524 void ext4_rsv_window_add(struct super_block *sb,
525                     struct ext4_reserve_window_node *rsv)
526 {
527         struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
528         struct rb_node *node = &rsv->rsv_node;
529         ext4_fsblk_t start = rsv->rsv_start;
530
531         struct rb_node ** p = &root->rb_node;
532         struct rb_node * parent = NULL;
533         struct ext4_reserve_window_node *this;
534
535         while (*p)
536         {
537                 parent = *p;
538                 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
539
540                 if (start < this->rsv_start)
541                         p = &(*p)->rb_left;
542                 else if (start > this->rsv_end)
543                         p = &(*p)->rb_right;
544                 else {
545                         rsv_window_dump(root, 1);
546                         BUG();
547                 }
548         }
549
550         rb_link_node(node, parent, p);
551         rb_insert_color(node, root);
552 }
553
554 /**
555  * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
556  * @sb:                 super block
557  * @rsv:                reservation window to remove
558  *
559  * Mark the block reservation window as not allocated, and unlink it
560  * from the filesystem reservation window rb tree. Must be called with
561  * rsv_lock held.
562  */
563 static void rsv_window_remove(struct super_block *sb,
564                               struct ext4_reserve_window_node *rsv)
565 {
566         rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
567         rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
568         rsv->rsv_alloc_hit = 0;
569         rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
570 }
571
572 /*
573  * rsv_is_empty() -- Check whether the reservation window is unallocated.
574  * @rsv:                given reservation window to check
575  *
576  * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
577  */
578 static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
579 {
580         /* a valid reservation end block could not be 0 */
581         return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
582 }
583
584 /**
585  * ext4_init_block_alloc_info()
586  * @inode:              file inode structure
587  *
588  * Allocate and initialize the reservation window structure, and
589  * then link the window to the ext4 inode structure.
590  *
591  * The reservation window structure is only dynamically allocated
592  * and linked to the ext4 inode the first time an open file
593  * needs a new block. So, before every ext4_new_block(s) call, for
594  * regular files, we should check whether the reservation window
595  * structure exists; if it does not, this function is called.
596  * Failure to do so will result in block reservation being turned off for
597  * that open file.
598  *
599  * This function is called from ext4_get_blocks_handle(); it is also called
600  * when setting the reservation window size through ioctl, before the file
601  * is opened for write (which needs block allocation).
602  *
603  * Needs down_write(i_data_sem) protection prior to calling this function.
604  */
605 void ext4_init_block_alloc_info(struct inode *inode)
606 {
607         struct ext4_inode_info *ei = EXT4_I(inode);
608         struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
609         struct super_block *sb = inode->i_sb;
610
611         block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
612         if (block_i) {
613                 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
614
615                 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
616                 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
617
618                 /*
619                  * if filesystem is mounted with NORESERVATION, the goal
620                  * reservation window size is set to zero to indicate
621                  * block reservation is off
622                  */
623                 if (!test_opt(sb, RESERVATION))
624                         rsv->rsv_goal_size = 0;
625                 else
626                         rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
627                 rsv->rsv_alloc_hit = 0;
628                 block_i->last_alloc_logical_block = 0;
629                 block_i->last_alloc_physical_block = 0;
630         }
631         ei->i_block_alloc_info = block_i;
632 }
633
634 /**
635  * ext4_discard_reservation()
636  * @inode:              inode
637  *
638  * Discard (free) the block reservation window on last file close,
639  * on truncate, or at last iput().
640  *
641  * It is called in three cases:
642  *      ext4_release_file(): the last writer closes the file
643  *      ext4_clear_inode(): last iput(), when nobody links to this file.
644  *      ext4_truncate(): when the block indirect map is about to change.
645  *
646  */
647 void ext4_discard_reservation(struct inode *inode)
648 {
649         struct ext4_inode_info *ei = EXT4_I(inode);
650         struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
651         struct ext4_reserve_window_node *rsv;
652         spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
653
654         ext4_mb_discard_inode_preallocations(inode);
655
656         if (!block_i)
657                 return;
658
659         rsv = &block_i->rsv_window_node;
660         if (!rsv_is_empty(&rsv->rsv_window)) {
661                 spin_lock(rsv_lock);
662                 if (!rsv_is_empty(&rsv->rsv_window))
663                         rsv_window_remove(inode->i_sb, rsv);
664                 spin_unlock(rsv_lock);
665         }
666 }
667
668 /**
669  * ext4_free_blocks_sb() -- Free given blocks and update quota
670  * @handle:                     handle to this transaction
671  * @sb:                         super block
672  * @block:                      start physical block to free
673  * @count:                      number of blocks to free
674  * @pdquot_freed_blocks:        pointer to quota
675  */
676 void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
677                          ext4_fsblk_t block, unsigned long count,
678                          unsigned long *pdquot_freed_blocks)
679 {
680         struct buffer_head *bitmap_bh = NULL;
681         struct buffer_head *gd_bh;
682         ext4_group_t block_group;
683         ext4_grpblk_t bit;
684         unsigned long i;
685         unsigned long overflow;
686         struct ext4_group_desc * desc;
687         struct ext4_super_block * es;
688         struct ext4_sb_info *sbi;
689         int err = 0, ret;
690         ext4_grpblk_t group_freed;
691
692         *pdquot_freed_blocks = 0;
693         sbi = EXT4_SB(sb);
694         es = sbi->s_es;
695         if (block < le32_to_cpu(es->s_first_data_block) ||
696             block + count < block ||
697             block + count > ext4_blocks_count(es)) {
698                 ext4_error (sb, "ext4_free_blocks",
699                             "Freeing blocks not in datazone - "
700                             "block = %llu, count = %lu", block, count);
701                 goto error_return;
702         }
703
704         ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
705
706 do_more:
707         overflow = 0;
708         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
709         /*
710          * Check to see if we are freeing blocks across a group
711          * boundary.
712          */
713         if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
714                 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
715                 count -= overflow;
716         }
717         brelse(bitmap_bh);
718         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
719         if (!bitmap_bh)
720                 goto error_return;
721         desc = ext4_get_group_desc (sb, block_group, &gd_bh);
722         if (!desc)
723                 goto error_return;
724
725         if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
726             in_range(ext4_inode_bitmap(sb, desc), block, count) ||
727             in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
728             in_range(block + count - 1, ext4_inode_table(sb, desc),
729                      sbi->s_itb_per_group)) {
730                 ext4_error (sb, "ext4_free_blocks",
731                             "Freeing blocks in system zones - "
732                             "Block = %llu, count = %lu",
733                             block, count);
734                 goto error_return;
735         }
736
737         /*
738          * We are about to start releasing blocks in the bitmap,
739          * so we need undo access.
740          */
741         /* @@@ check errors */
742         BUFFER_TRACE(bitmap_bh, "getting undo access");
743         err = ext4_journal_get_undo_access(handle, bitmap_bh);
744         if (err)
745                 goto error_return;
746
747         /*
748          * We are about to modify some metadata.  Call the journal APIs
749          * to unshare ->b_data if a currently-committing transaction is
750          * using it
751          */
752         BUFFER_TRACE(gd_bh, "get_write_access");
753         err = ext4_journal_get_write_access(handle, gd_bh);
754         if (err)
755                 goto error_return;
756
757         jbd_lock_bh_state(bitmap_bh);
758
759         for (i = 0, group_freed = 0; i < count; i++) {
760                 /*
761                  * An HJ special.  This is expensive...
762                  */
763 #ifdef CONFIG_JBD2_DEBUG
764                 jbd_unlock_bh_state(bitmap_bh);
765                 {
766                         struct buffer_head *debug_bh;
767                         debug_bh = sb_find_get_block(sb, block + i);
768                         if (debug_bh) {
769                                 BUFFER_TRACE(debug_bh, "Deleted!");
770                                 if (!bh2jh(bitmap_bh)->b_committed_data)
771                                         BUFFER_TRACE(debug_bh,
772                                                 "No committed data in bitmap");
773                                 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
774                                 __brelse(debug_bh);
775                         }
776                 }
777                 jbd_lock_bh_state(bitmap_bh);
778 #endif
779                 if (need_resched()) {
780                         jbd_unlock_bh_state(bitmap_bh);
781                         cond_resched();
782                         jbd_lock_bh_state(bitmap_bh);
783                 }
784                 /* @@@ This prevents newly-allocated data from being
785                  * freed and then reallocated within the same
786                  * transaction.
787                  *
788                  * Ideally we would want to allow that to happen, but to
789                  * do so requires making jbd2_journal_forget() capable of
790                  * revoking the queued write of a data block, which
791                  * implies blocking on the journal lock.  *forget()
792                  * cannot block due to truncate races.
793                  *
794                  * Eventually we can fix this by making jbd2_journal_forget()
795                  * return a status indicating whether or not it was able
796                  * to revoke the buffer.  On successful revoke, it is
797                  * safe not to set the allocation bit in the committed
798                  * bitmap, because we know that there is no outstanding
799                  * activity on the buffer any more and so it is safe to
800                  * reallocate it.
801                  */
802                 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
803                 J_ASSERT_BH(bitmap_bh,
804                                 bh2jh(bitmap_bh)->b_committed_data != NULL);
805                 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
806                                 bh2jh(bitmap_bh)->b_committed_data);
807
808                 /*
809                  * We clear the bit in the bitmap after setting the committed
810                  * data bit, because this is the reverse order to that which
811                  * the allocator uses.
812                  */
813                 BUFFER_TRACE(bitmap_bh, "clear bit");
814                 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
815                                                 bit + i, bitmap_bh->b_data)) {
816                         jbd_unlock_bh_state(bitmap_bh);
817                         ext4_error(sb, __func__,
818                                    "bit already cleared for block %llu",
819                                    (ext4_fsblk_t)(block + i));
820                         jbd_lock_bh_state(bitmap_bh);
821                         BUFFER_TRACE(bitmap_bh, "bit already cleared");
822                 } else {
823                         group_freed++;
824                 }
825         }
826         jbd_unlock_bh_state(bitmap_bh);
827
828         spin_lock(sb_bgl_lock(sbi, block_group));
829         le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
830         desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
831         spin_unlock(sb_bgl_lock(sbi, block_group));
832         percpu_counter_add(&sbi->s_freeblocks_counter, count);
833
834         if (sbi->s_log_groups_per_flex) {
835                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
836                 spin_lock(sb_bgl_lock(sbi, flex_group));
837                 sbi->s_flex_groups[flex_group].free_blocks += count;
838                 spin_unlock(sb_bgl_lock(sbi, flex_group));
839         }
840
841         /* We dirtied the bitmap block */
842         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
843         err = ext4_journal_dirty_metadata(handle, bitmap_bh);
844
845         /* And the group descriptor block */
846         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
847         ret = ext4_journal_dirty_metadata(handle, gd_bh);
848         if (!err) err = ret;
849         *pdquot_freed_blocks += group_freed;
850
851         if (overflow && !err) {
852                 block += count;
853                 count = overflow;
854                 goto do_more;
855         }
856         sb->s_dirt = 1;
857 error_return:
858         brelse(bitmap_bh);
859         ext4_std_error(sb, err);
860         return;
861 }
862
863 /**
864  * ext4_add_groupblocks() -- Add given blocks to an existing group
865  * @handle:                     handle to this transaction
866  * @sb:                         super block
867  * @block:                      start physical block to add to the block group
868  * @count:                      number of blocks to add
869  *
870  * This marks the blocks as free in the bitmap. We ask
871  * mballoc to reload the buddy after this by setting the group's
872  * EXT4_GROUP_INFO_NEED_INIT_BIT flag
873  */
874 void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
875                          ext4_fsblk_t block, unsigned long count)
876 {
877         struct buffer_head *bitmap_bh = NULL;
878         struct buffer_head *gd_bh;
879         ext4_group_t block_group;
880         ext4_grpblk_t bit;
881         unsigned long i;
882         struct ext4_group_desc *desc;
883         struct ext4_super_block *es;
884         struct ext4_sb_info *sbi;
885         int err = 0, ret;
886         ext4_grpblk_t blocks_freed;
887         struct ext4_group_info *grp;
888
889         sbi = EXT4_SB(sb);
890         es = sbi->s_es;
891         ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
892
893         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
894         grp = ext4_get_group_info(sb, block_group);
895         /*
896          * Check to see if we are adding blocks across a group
897          * boundary.
898          */
899         if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
900                 goto error_return;
901
902         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
903         if (!bitmap_bh)
904                 goto error_return;
905         desc = ext4_get_group_desc(sb, block_group, &gd_bh);
906         if (!desc)
907                 goto error_return;
908
909         if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
910             in_range(ext4_inode_bitmap(sb, desc), block, count) ||
911             in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
912             in_range(block + count - 1, ext4_inode_table(sb, desc),
913                      sbi->s_itb_per_group)) {
914                 ext4_error(sb, __func__,
915                            "Adding blocks in system zones - "
916                             "Block = %llu, count = %lu",
917                             block, count);
918                 goto error_return;
919         }
920
921         /*
922          * We are about to add blocks to the bitmap,
923          * so we need undo access.
924          */
925         BUFFER_TRACE(bitmap_bh, "getting undo access");
926         err = ext4_journal_get_undo_access(handle, bitmap_bh);
927         if (err)
928                 goto error_return;
929
930         /*
931          * We are about to modify some metadata.  Call the journal APIs
932          * to unshare ->b_data if a currently-committing transaction is
933          * using it
934          */
935         BUFFER_TRACE(gd_bh, "get_write_access");
936         err = ext4_journal_get_write_access(handle, gd_bh);
937         if (err)
938                 goto error_return;
939         /*
940          * make sure we don't allow a parallel init on other groups in the
941          * same buddy cache
942          */
943         down_write(&grp->alloc_sem);
944         for (i = 0, blocks_freed = 0; i < count; i++) {
945                 BUFFER_TRACE(bitmap_bh, "clear bit");
946                 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
947                                                 bit + i, bitmap_bh->b_data)) {
948                         ext4_error(sb, __func__,
949                                    "bit already cleared for block %llu",
950                                    (ext4_fsblk_t)(block + i));
951                         BUFFER_TRACE(bitmap_bh, "bit already cleared");
952                 } else {
953                         blocks_freed++;
954                 }
955         }
956         spin_lock(sb_bgl_lock(sbi, block_group));
957         le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
958         desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
959         spin_unlock(sb_bgl_lock(sbi, block_group));
960         percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
961
962         if (sbi->s_log_groups_per_flex) {
963                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
964                 spin_lock(sb_bgl_lock(sbi, flex_group));
965                 sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
966                 spin_unlock(sb_bgl_lock(sbi, flex_group));
967         }
968         /*
969          * request to reload the buddy with the
970          * new bitmap information
971          */
972         set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
973         ext4_mb_update_group_info(grp, blocks_freed);
974         up_write(&grp->alloc_sem);
975
976         /* We dirtied the bitmap block */
977         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
978         err = ext4_journal_dirty_metadata(handle, bitmap_bh);
979
980         /* And the group descriptor block */
981         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
982         ret = ext4_journal_dirty_metadata(handle, gd_bh);
983         if (!err)
984                 err = ret;
985         sb->s_dirt = 1;
986
987 error_return:
988         brelse(bitmap_bh);
989         ext4_std_error(sb, err);
990         return;
991 }
992
993 /**
994  * ext4_free_blocks() -- Free given blocks and update quota
995  * @handle:             handle for this transaction
996  * @inode:              inode
997  * @block:              start physical block to free
998  * @count:              number of blocks to free
999  * @metadata:           Are these metadata blocks
1000  */
1001 void ext4_free_blocks(handle_t *handle, struct inode *inode,
1002                         ext4_fsblk_t block, unsigned long count,
1003                         int metadata)
1004 {
1005         struct super_block * sb;
1006         unsigned long dquot_freed_blocks;
1007
1008         /* this isn't the right place to decide whether the block is metadata;
1009          * inode.c/extents.c knows better, but for safety ... */
1010         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
1011                         ext4_should_journal_data(inode))
1012                 metadata = 1;
1013
1014         sb = inode->i_sb;
1015
1016         if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
1017                 ext4_free_blocks_sb(handle, sb, block, count,
1018                                                 &dquot_freed_blocks);
1019         else
1020                 ext4_mb_free_blocks(handle, inode, block, count,
1021                                                 metadata, &dquot_freed_blocks);
1022         if (dquot_freed_blocks)
1023                 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
1024         return;
1025 }
1026
1027 /**
1028  * ext4_test_allocatable()
1029  * @nr:                 block number (group relative) to test
1030  * @bh:                 bufferhead containing the bitmap of the given block group
1031  *
1032  * For ext4 allocations, we must not reuse any blocks which are
1033  * allocated in the bitmap buffer's "last committed data" copy.  This
1034  * prevents deletes from freeing up the page for reuse until we have
1035  * committed the delete transaction.
1036  *
1037  * If we didn't do this, then deleting something and reallocating it as
1038  * data would allow the old block to be overwritten before the
1039  * transaction committed (because we force data to disk before commit).
1040  * This would lead to corruption if we crashed between overwriting the
1041  * data and committing the delete.
1042  *
1043  * @@@ We may want to make this allocation behaviour conditional on
1044  * data-writes at some point, and disable it for metadata allocations or
1045  * sync-data inodes.
1046  */
1047 static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
1048 {
1049         int ret;
1050         struct journal_head *jh = bh2jh(bh);
1051
1052         if (ext4_test_bit(nr, bh->b_data))
1053                 return 0;
1054
1055         jbd_lock_bh_state(bh);
1056         if (!jh->b_committed_data)
1057                 ret = 1;
1058         else
1059                 ret = !ext4_test_bit(nr, jh->b_committed_data);
1060         jbd_unlock_bh_state(bh);
1061         return ret;
1062 }
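/*
 * In other words, when a committed-data copy exists a bit is
 * allocatable only if it is clear in both copies:
 *
 *   bit in b_data   bit in b_committed_data   allocatable
 *        1                  any                   no
 *        0                   1                    no
 *        0                   0                    yes
 *
 * Without b_committed_data, only the on-disk bitmap is consulted.
 */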
1063
1064 /**
1065  * bitmap_search_next_usable_block()
1066  * @start:              the starting block (group relative) of the search
1067  * @bh:                 bufferhead containing the block group bitmap
1068  * @maxblocks:          the ending block (group relative) of the reservation
1069  *
1070  * The bitmap search --- search forward alternately through the actual
1071  * bitmap on disk and the last-committed copy in journal, until we find a
1072  * bit free in both bitmaps.
1073  */
1074 static ext4_grpblk_t
1075 bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
1076                                         ext4_grpblk_t maxblocks)
1077 {
1078         ext4_grpblk_t next;
1079         struct journal_head *jh = bh2jh(bh);
1080
1081         while (start < maxblocks) {
1082                 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
1083                 if (next >= maxblocks)
1084                         return -1;
1085                 if (ext4_test_allocatable(next, bh))
1086                         return next;
1087                 jbd_lock_bh_state(bh);
1088                 if (jh->b_committed_data)
1089                         start = ext4_find_next_zero_bit(jh->b_committed_data,
1090                                                         maxblocks, next);
1091                 jbd_unlock_bh_state(bh);
1092         }
1093         return -1;
1094 }
1095
1096 /**
1097  * find_next_usable_block()
1098  * @start:              the starting block (group relative) to find next
1099  *                      allocatable block in bitmap.
1100  * @bh:                 bufferhead containing the block group bitmap
1101  * @maxblocks:          the ending block (group relative) for the search
1102  *
1103  * Find an allocatable block in a bitmap.  We honor both the bitmap and
1104  * its last-committed copy (if that exists), and perform the "most
1105  * appropriate allocation" algorithm of looking for a free block near
1106  * the initial goal; then for a free byte somewhere in the bitmap; then
1107  * for any free bit in the bitmap.
1108  */
1109 static ext4_grpblk_t
1110 find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
1111                         ext4_grpblk_t maxblocks)
1112 {
1113         ext4_grpblk_t here, next;
1114         char *p, *r;
1115
1116         if (start > 0) {
1117                 /*
1118                  * The goal was occupied; search forward for a free
1119                  * block within the next XX blocks.
1120                  *
1121                  * end_goal is more or less random, but it has to be
1122                  * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
1123                  * next 64-bit boundary is simple.
1124                  */
1125                 ext4_grpblk_t end_goal = (start + 63) & ~63;
1126                 if (end_goal > maxblocks)
1127                         end_goal = maxblocks;
1128                 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
1129                 if (here < end_goal && ext4_test_allocatable(here, bh))
1130                         return here;
1131                 ext4_debug("Bit not found near goal\n");
1132         }
1133
1134         here = start;
1135         if (here < 0)
1136                 here = 0;
1137
1138         p = ((char *)bh->b_data) + (here >> 3);
1139         r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
1140         next = (r - ((char *)bh->b_data)) << 3;
1141
1142         if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
1143                 return next;
1144
1145         /*
1146          * The bitmap search --- search forward alternately through the actual
1147          * bitmap and the last-committed copy until we find a bit free in
1148          * both
1149          */
1150         here = bitmap_search_next_usable_block(here, bh, maxblocks);
1151         return here;
1152 }
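/*
 * Example walk of find_next_usable_block() (start value assumed for
 * illustration): with start = 70, the first phase scans bits 70..127
 * (up to the next 64-bit boundary, capped at maxblocks) for an
 * allocatable bit; failing that, memscan() looks for the first
 * all-zero byte beginning with the byte that contains bit 70; failing
 * that, bitmap_search_next_usable_block() walks forward bit by bit,
 * consulting both the on-disk bitmap and the committed copy, up to
 * maxblocks.
 */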
1153
1154 /**
1155  * claim_block()
1156  * @block:              the free block (group relative) to allocate
1157  * @bh:                 the bufferhead containing the block group bitmap
1158  *
1159  * We think we can allocate this block in this bitmap.  Try to set the bit.
1160  * If that succeeds then check that nobody has allocated and then freed the
1161  * block since we saw that it was not marked in b_committed_data.  If it _was_
1162  * allocated and freed then clear the bit in the bitmap again and return
1163  * zero (failure).
1164  */
1165 static inline int
1166 claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
1167 {
1168         struct journal_head *jh = bh2jh(bh);
1169         int ret;
1170
1171         if (ext4_set_bit_atomic(lock, block, bh->b_data))
1172                 return 0;
1173         jbd_lock_bh_state(bh);
1174         if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
1175                 ext4_clear_bit_atomic(lock, block, bh->b_data);
1176                 ret = 0;
1177         } else {
1178                 ret = 1;
1179         }
1180         jbd_unlock_bh_state(bh);
1181         return ret;
1182 }
1183
1184 /**
1185  * ext4_try_to_allocate()
1186  * @sb:                 superblock
1187  * @handle:             handle to this transaction
1188  * @group:              given allocation block group
1189  * @bitmap_bh:          bufferhead holds the block bitmap
1190  * @grp_goal:           given target block within the group
1191  * @count:              target number of blocks to allocate
1192  * @my_rsv:             reservation window
1193  *
1194  * Attempt to allocate blocks within a given range. Set the range of allocation
1195  * first, then find the first free bit(s) from the bitmap (within the range),
1196  * and finally allocate the blocks by claiming the found free bit(s) as allocated.
1197  *
1198  * To set the range of this allocation:
1199  *      if there is a reservation window, only try to allocate block(s) from the
1200  *      file's own reservation window;
1201  *      Otherwise, the allocation range starts from the given goal block and ends
1202  *      at the block group's last block.
1203  *
1204  * If we failed to allocate the desired block then we may end up crossing to a
1205  * new bitmap.  In that case we must release write access to the old one via
1206  * ext4_journal_release_buffer(), else we'll run out of credits.
1207  */
1208 static ext4_grpblk_t
1209 ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
1210                         ext4_group_t group, struct buffer_head *bitmap_bh,
1211                         ext4_grpblk_t grp_goal, unsigned long *count,
1212                         struct ext4_reserve_window *my_rsv)
1213 {
1214         ext4_fsblk_t group_first_block;
1215         ext4_grpblk_t start, end;
1216         unsigned long num = 0;
1217
1218         /* we do allocation within the reservation window if we have a window */
1219         if (my_rsv) {
1220                 group_first_block = ext4_group_first_block_no(sb, group);
1221                 if (my_rsv->_rsv_start >= group_first_block)
1222                         start = my_rsv->_rsv_start - group_first_block;
1223                 else
1224                         /* reservation window crosses group boundary */
1225                         start = 0;
1226                 end = my_rsv->_rsv_end - group_first_block + 1;
1227                 if (end > EXT4_BLOCKS_PER_GROUP(sb))
1228                         /* reservation window crosses group boundary */
1229                         end = EXT4_BLOCKS_PER_GROUP(sb);
1230                 if ((start <= grp_goal) && (grp_goal < end))
1231                         start = grp_goal;
1232                 else
1233                         grp_goal = -1;
1234         } else {
1235                 if (grp_goal > 0)
1236                         start = grp_goal;
1237                 else
1238                         start = 0;
1239                 end = EXT4_BLOCKS_PER_GROUP(sb);
1240         }
1241
1242         BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
1243
1244 repeat:
1245         if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
1246                 grp_goal = find_next_usable_block(start, bitmap_bh, end);
1247                 if (grp_goal < 0)
1248                         goto fail_access;
1249                 if (!my_rsv) {
1250                         int i;
1251
1252                         for (i = 0; i < 7 && grp_goal > start &&
1253                                         ext4_test_allocatable(grp_goal - 1,
1254                                                                 bitmap_bh);
1255                                         i++, grp_goal--)
1256                                 ;
1257                 }
1258         }
1259         start = grp_goal;
1260
1261         if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1262                 grp_goal, bitmap_bh)) {
1263                 /*
1264                  * The block was allocated by another thread, or it was
1265                  * allocated and then freed by another thread
1266                  */
1267                 start++;
1268                 grp_goal++;
1269                 if (start >= end)
1270                         goto fail_access;
1271                 goto repeat;
1272         }
1273         num++;
1274         grp_goal++;
1275         while (num < *count && grp_goal < end
1276                 && ext4_test_allocatable(grp_goal, bitmap_bh)
1277                 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1278                                 grp_goal, bitmap_bh)) {
1279                 num++;
1280                 grp_goal++;
1281         }
1282         *count = num;
1283         return grp_goal - num;
1284 fail_access:
1285         *count = num;
1286         return -1;
1287 }
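/*
 * Example of the range setup in ext4_try_to_allocate() (block numbers
 * assumed for illustration): if the group's first block is 98304 and
 * my_rsv covers [98400, 98431], then start = 98400 - 98304 = 96 and
 * end = 98431 - 98304 + 1 = 128 (group relative); grp_goal is kept
 * only if it falls inside [start, end), otherwise it is reset to -1.
 */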
1288
1289 /**
1290  *      find_next_reservable_window():
1291  *              find a reservable space within the given range.
1292  *              It does not allocate the reservation window for now:
1293  *              alloc_new_reservation() will do the work later.
1294  *
1295  *      @search_head: the head of the searching list;
1296  *              This is not necessarily the list head of the whole filesystem
1297  *
1298  *              We have both head and start_block to assist the search
1299  *              for the reservable space. The list starts from head,
1300  *              but we will shift to the place where start_block is,
1301  *              then start from there, when looking for a reservable space.
1302  *
1303  *      @size: the target new reservation window size
1304  *
1305  *      @group_first_block: the first block we consider to start
1306  *                      the real search from
1307  *
1308  *      @last_block:
1309  *              the maximum block number that our goal reservable space
1310  *              could start from. This is normally the last block in this
1311  *              group. The search will end when we found the start of next
1312  *              group. The search ends when the start of the next possible
1313  *              reservable space is found to be beyond this boundary.
1314  *              This handles the cross-boundary reservation window
1315  *
1316  *      Basically, we search the given range (start_block, last_block),
1317  *      rather than the whole reservation double linked list,
1318  *      to find a free region that is of the requested size and has not
1319  *      been reserved.
1320  *
1321  */
1322 static int find_next_reservable_window(
1323                                 struct ext4_reserve_window_node *search_head,
1324                                 struct ext4_reserve_window_node *my_rsv,
1325                                 struct super_block * sb,
1326                                 ext4_fsblk_t start_block,
1327                                 ext4_fsblk_t last_block)
1328 {
1329         struct rb_node *next;
1330         struct ext4_reserve_window_node *rsv, *prev;
1331         ext4_fsblk_t cur;
1332         int size = my_rsv->rsv_goal_size;
1333
1334         /* TODO: make the start of the reservation window byte-aligned */
1335         /* cur = *start_block & ~7;*/
1336         cur = start_block;
1337         rsv = search_head;
1338         if (!rsv)
1339                 return -1;
1340
1341         while (1) {
1342                 if (cur <= rsv->rsv_end)
1343                         cur = rsv->rsv_end + 1;
1344
1345                 /* TODO?
1346                  * If we cannot find a reservable space of the expected
1347                  * size, we could, during the re-search, remember the
1348                  * largest reservable space we have seen and return
1349                  * that one.
1350                  *
1351                  * For now we simply fail if we cannot find a reservable
1352                  * space of the expected size (or more)...
1353                  */
1354                 if (cur > last_block)
1355                         return -1;              /* fail */
1356
1357                 prev = rsv;
1358                 next = rb_next(&rsv->rsv_node);
1359                 rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
1360
1361                 /*
1362                  * Reached the last reservation, we can just append to the
1363                  * previous one.
1364                  */
1365                 if (!next)
1366                         break;
1367
1368                 if (cur + size <= rsv->rsv_start) {
1369                         /*
1370                          * Found a reservable space big enough.  We could
1371                          * have a reservation across the group boundary here
1372                          */
1373                         break;
1374                 }
1375         }
1376         /*
1377          * We get here either:
1378          * when we reach the end of the whole list and there is empty
1379          * reservable space after the last entry in the list, in which
1380          * case we append the new window to the end of the list;
1381          *
1382          * or when we find a reservable space in the middle of the list,
1383          * in which case we return the reservation window that we can
1384          * append to.  Either way we succeed.
1385          */
1386
1387         if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
1388                 rsv_window_remove(sb, my_rsv);
1389
1390         /*
1391          * Let's book the whole available window for now.  We will check the
1392          * disk bitmap later and then, if there are free blocks, we adjust
1393          * the window size if it's larger than requested.
1394          * Otherwise, we will remove this node from the tree the next time
1395          * we call find_next_reservable_window().
1396          */
1397         my_rsv->rsv_start = cur;
1398         my_rsv->rsv_end = cur + size - 1;
1399         my_rsv->rsv_alloc_hit = 0;
1400
1401         if (prev != my_rsv)
1402                 ext4_rsv_window_add(sb, my_rsv);
1403
1404         return 0;
1405 }
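/*
 * Editorial sketch (not part of balloc.c): the same first-fit gap search as
 * find_next_reservable_window(), but over a sorted array of reserved
 * intervals instead of the kernel red-black tree.  Given windows sorted by
 * start block, find the first gap of at least "size" blocks whose start lies
 * in [start, last].  Every "demo_" name is invented for the example.
 */
#include <stddef.h>

struct demo_window { unsigned long start, end; };       /* inclusive range */

/* Returns the start of a suitable gap, or -1 if none starts by "last". */
static long demo_find_gap(const struct demo_window *w, size_t n,
                          unsigned long start, unsigned long last,
                          unsigned long size)
{
        unsigned long cur = start;
        size_t i;

        for (i = 0; i < n; i++) {
                /* step past window i if it overlaps the candidate start */
                if (cur <= w[i].end)
                        cur = w[i].end + 1;
                if (cur > last)
                        return -1;              /* ran out of room */
                /* append after the last window, or fit before the next one */
                if (i + 1 == n || cur + size <= w[i + 1].start)
                        return (long)cur;
        }
        /* no reserved windows at all: the whole range is a gap */
        return (cur <= last) ? (long)cur : -1;
}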
1406
1407 /**
1408  *      alloc_new_reservation()--allocate a new reservation window
1409  *
1410  *              To make a new reservation, we search part of the filesystem
1411  *              reservation list (the part inside the group). We try to
1412  *              allocate a new reservation window near the allocation goal,
1413  *              or at the beginning of the group if there is no goal.
1414  *
1415  *              We first find a reservable space after the goal, then from
1416  *              there we check the bitmap for the first free block after
1417  *              it. If there is no free block before the end of the group,
1418  *              the whole group is full and we fail. Otherwise, we check
1419  *              whether the free block is inside the expected reservable
1420  *              space; if so, we succeed.
1421  *              If the first free block is outside the reservable space, we
1422  *              start from that free block, search for the next reservable
1423  *              space, and go on.
1424  *
1425  *      On success, a new reservation window is found and inserted into the
1426  *      list. It contains at least one free block and does not overlap with
1427  *      other reservation windows.
1428  *
1429  *      On failure, no reservation window could be found in this group.
1430  *
1431  *      @rsv: the reservation
1432  *
1433  *      @grp_goal: The goal (group-relative).  It is where the search for a
1434  *              free reservable space should start from.
1435  *              If we have a grp_goal (grp_goal >= 0), then start from there;
1436  *              with no grp_goal (grp_goal = -1), we start from the first block
1437  *              of the group.
1438  *
1439  *      @sb: the super block
1440  *      @group: the group we are trying to allocate in
1441  *      @bitmap_bh: the block group block bitmap
1442  *
1443  */
1444 static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1445                 ext4_grpblk_t grp_goal, struct super_block *sb,
1446                 ext4_group_t group, struct buffer_head *bitmap_bh)
1447 {
1448         struct ext4_reserve_window_node *search_head;
1449         ext4_fsblk_t group_first_block, group_end_block, start_block;
1450         ext4_grpblk_t first_free_block;
1451         struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1452         unsigned long size;
1453         int ret;
1454         spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1455
1456         group_first_block = ext4_group_first_block_no(sb, group);
1457         group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1458
1459         if (grp_goal < 0)
1460                 start_block = group_first_block;
1461         else
1462                 start_block = grp_goal + group_first_block;
1463
1464         size = my_rsv->rsv_goal_size;
1465
1466         if (!rsv_is_empty(&my_rsv->rsv_window)) {
1467                 /*
1468                  * If the old reservation crosses the group boundary
1469                  * and the goal is inside the old reservation window,
1470                  * we will come here when we have just failed to allocate
1471                  * from the first part of the window. We still have another
1472                  * part that belongs to the next group. In this case, there
1473                  * is no point in discarding our window and trying to
1474                  * allocate a new one in this group (which would fail). We
1475                  * should keep the reservation window and simply move on.
1476                  *
1477                  * Maybe we could shift the start block of the reservation
1478                  * window to the first block of the next group.
1479                  */
1480
1481                 if ((my_rsv->rsv_start <= group_end_block) &&
1482                                 (my_rsv->rsv_end > group_end_block) &&
1483                                 (start_block >= my_rsv->rsv_start))
1484                         return -1;
1485
1486                 if ((my_rsv->rsv_alloc_hit >
1487                      (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1488                         /*
1489                          * if the previous allocation hit ratio is
1490                          * greater than 1/2, then we double the size of
1491                          * the reservation window next time;
1492                          * otherwise we keep the window the same size
1493                          */
1494                         size = size * 2;
1495                         if (size > EXT4_MAX_RESERVE_BLOCKS)
1496                                 size = EXT4_MAX_RESERVE_BLOCKS;
1497                         my_rsv->rsv_goal_size = size;
1498                 }
1499         }
1500
1501         spin_lock(rsv_lock);
1502         /*
1503          * shift the search start to the window near the goal block
1504          */
1505         search_head = search_reserve_window(fs_rsv_root, start_block);
1506
1507         /*
1508          * find_next_reservable_window() simply finds a reservable window
1509          * inside the given range (start_block, group_end_block).
1510          *
1511          * To make sure the reservation window has a free bit inside it, we
1512          * need to check the bitmap after we find a reservable window.
1513          */
1514 retry:
1515         ret = find_next_reservable_window(search_head, my_rsv, sb,
1516                                                 start_block, group_end_block);
1517
1518         if (ret == -1) {
1519                 if (!rsv_is_empty(&my_rsv->rsv_window))
1520                         rsv_window_remove(sb, my_rsv);
1521                 spin_unlock(rsv_lock);
1522                 return -1;
1523         }
1524
1525         /*
1526          * On success, find_next_reservable_window() returns the
1527          * reservation window with a reservable space after it.
1528          * Before we reserve this reservable space, we need
1529          * to make sure there is at least one free block inside this region.
1530          *
1531          * We search the block bitmap and the copy of the last committed
1532          * bitmap alternately for the first free bit, until we find an
1533          * allocatable block. The search starts from the start block of the
1534          * reservable space we just found.
1535          */
1536         spin_unlock(rsv_lock);
1537         first_free_block = bitmap_search_next_usable_block(
1538                         my_rsv->rsv_start - group_first_block,
1539                         bitmap_bh, group_end_block - group_first_block + 1);
1540
1541         if (first_free_block < 0) {
1542                 /*
1543                  * no free blocks left in the bitmap, no point
1544                  * in reserving the space; return failure.
1545                  */
1546                 spin_lock(rsv_lock);
1547                 if (!rsv_is_empty(&my_rsv->rsv_window))
1548                         rsv_window_remove(sb, my_rsv);
1549                 spin_unlock(rsv_lock);
1550                 return -1;              /* failed */
1551         }
1552
1553         start_block = first_free_block + group_first_block;
1554         /*
1555          * check if the first free block is within the
1556          * free space we just reserved
1557          */
1558         if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
1559                 return 0;               /* success */
1560         /*
1561          * if the first free bit we found is outside the reservable space,
1562          * continue the search for the next reservable space,
1563          * starting from where the free block is;
1564          * we also shift the list head to where we stopped last time
1565          */
1566         search_head = my_rsv;
1567         spin_lock(rsv_lock);
1568         goto retry;
1569 }
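/*
 * Editorial sketch (not part of balloc.c): the adaptive sizing rule that
 * alloc_new_reservation() applies above.  If more than half of the previous
 * window was actually consumed, the goal size is doubled for the next
 * window, up to a cap (EXT4_MAX_RESERVE_BLOCKS in the real code;
 * DEMO_MAX_RESERVE below is just a placeholder value for the example).
 */
#define DEMO_MAX_RESERVE 1024UL

static unsigned long demo_next_window_size(unsigned long goal_size,
                                           unsigned long window_blocks,
                                           unsigned long alloc_hits)
{
        if (alloc_hits > window_blocks / 2) {   /* hit ratio above 1/2 */
                goal_size *= 2;
                if (goal_size > DEMO_MAX_RESERVE)
                        goal_size = DEMO_MAX_RESERVE;
        }
        return goal_size;
}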
1570
1571 /**
1572  * try_to_extend_reservation()
1573  * @my_rsv:             given reservation window
1574  * @sb:                 super block
1575  * @size:               the delta to extend
1576  *
1577  * Attempt to expand the reservation window so that it is large enough
1578  * to hold the required number of free blocks.
1579  *
1580  * Since ext4_try_to_allocate() will always allocate blocks within
1581  * the reservation window range, if the window size is too small,
1582  * a multiple-block allocation has to stop at the end of the reservation
1583  * window. To make this more efficient, given the total number of
1584  * blocks needed and the current size of the window, we try to
1585  * expand the reservation window size if necessary on a best-effort
1586  * basis before ext4_new_blocks() tries to allocate blocks.
1587  */
1588 static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1589                         struct super_block *sb, int size)
1590 {
1591         struct ext4_reserve_window_node *next_rsv;
1592         struct rb_node *next;
1593         spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1594
1595         if (!spin_trylock(rsv_lock))
1596                 return;
1597
1598         next = rb_next(&my_rsv->rsv_node);
1599
1600         if (!next)
1601                 my_rsv->rsv_end += size;
1602         else {
1603                 next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
1604
1605                 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1606                         my_rsv->rsv_end += size;
1607                 else
1608                         my_rsv->rsv_end = next_rsv->rsv_start - 1;
1609         }
1610         spin_unlock(rsv_lock);
1611 }
1612
1613 /**
1614  * ext4_try_to_allocate_with_rsv()
1615  * @sb:                 superblock
1616  * @handle:             handle to this transaction
1617  * @group:              given allocation block group
1618  * @bitmap_bh:          bufferhead holds the block bitmap
1619  * @grp_goal:           given target block within the group
1620  * @count:              target number of blocks to allocate
1621  * @my_rsv:             reservation window
1622  * @errp:               pointer to store the error code
1623  *
1624  * This is the main function used to allocate a new block and its reservation
1625  * window.
1626  *
1627  * Each time a new block allocation is needed, we first try to allocate from
1628  * the inode's own reservation.  If the inode does not have a reservation
1629  * window, then, instead of first looking for a free bit in the bitmap and
1630  * then checking the reservation list to see whether that bit lies inside
1631  * somebody else's reservation window, we try to allocate a reservation
1632  * window for the inode starting from the goal, and then do the block
1633  * allocation within that reservation window.
1634  *
1635  * This avoids repeatedly searching the reservation list when somebody is
1636  * looking for a free block (without a reservation) and there are lots of
1637  * free blocks, but they are all being reserved.
1638  *
1639  * We use a red-black tree for the per-filesystem reservation list.
1640  *
1641  */
1642 static ext4_grpblk_t
1643 ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1644                         ext4_group_t group, struct buffer_head *bitmap_bh,
1645                         ext4_grpblk_t grp_goal,
1646                         struct ext4_reserve_window_node * my_rsv,
1647                         unsigned long *count, int *errp)
1648 {
1649         ext4_fsblk_t group_first_block, group_last_block;
1650         ext4_grpblk_t ret = 0;
1651         int fatal;
1652         unsigned long num = *count;
1653
1654         *errp = 0;
1655
1656         /*
1657          * Make sure we use undo access for the bitmap, because it is critical
1658          * that we do the frozen_data COW on bitmap buffers in all cases even
1659          * if the buffer is in BJ_Forget state in the committing transaction.
1660          */
1661         BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1662         fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1663         if (fatal) {
1664                 *errp = fatal;
1665                 return -1;
1666         }
1667
1668         /*
1669          * we don't deal with reservations when
1670          * the filesystem is mounted without reservations,
1671          * or the file is not a regular file,
1672          * or the last attempt to allocate a block with reservations turned on failed
1673          */
1674         if (my_rsv == NULL) {
1675                 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1676                                                 grp_goal, count, NULL);
1677                 goto out;
1678         }
1679         /*
1680          * grp_goal is a group-relative block number (if there is a goal):
1681          * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb).
1682          * group_first_block is a filesystem-wide block number; it is the
1683          * block number of the first block in this group.
1684          */
1685         group_first_block = ext4_group_first_block_no(sb, group);
1686         group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1687
1688         /*
1689          * Basically we will allocate a new block from the inode's reservation
1690          * window.
1691          *
1692          * We need to allocate a new reservation window if:
1693          * a) the inode does not have a reservation window; or
1694          * b) the last attempt to allocate a block from the existing reservation
1695          *    failed; or
1696          * c) we come here with a goal that is outside the existing reservation
1697          *    window.
1698          * We do not need to allocate a new reservation window if we come here
1699          * at the beginning with a goal and the goal is inside the window, or
1700          * we don't have a goal but already have a reservation window;
1701          * then we can allocate from the reservation window directly.
1702          */
1703         while (1) {
1704                 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1705                         !goal_in_my_reservation(&my_rsv->rsv_window,
1706                                                 grp_goal, group, sb)) {
1707                         if (my_rsv->rsv_goal_size < *count)
1708                                 my_rsv->rsv_goal_size = *count;
1709                         ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1710                                                         group, bitmap_bh);
1711                         if (ret < 0)
1712                                 break;                  /* failed */
1713
1714                         if (!goal_in_my_reservation(&my_rsv->rsv_window,
1715                                                         grp_goal, group, sb))
1716                                 grp_goal = -1;
1717                 } else if (grp_goal >= 0) {
1718                         int curr = my_rsv->rsv_end -
1719                                         (grp_goal + group_first_block) + 1;
1720
1721                         if (curr < *count)
1722                                 try_to_extend_reservation(my_rsv, sb,
1723                                                         *count - curr);
1724                 }
1725
1726                 if ((my_rsv->rsv_start > group_last_block) ||
1727                                 (my_rsv->rsv_end < group_first_block)) {
1728                         rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1729                         BUG();
1730                 }
1731                 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1732                                            grp_goal, &num, &my_rsv->rsv_window);
1733                 if (ret >= 0) {
1734                         my_rsv->rsv_alloc_hit += num;
1735                         *count = num;
1736                         break;                          /* succeed */
1737                 }
1738                 num = *count;
1739         }
1740 out:
1741         if (ret >= 0) {
1742                 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1743                                         "bitmap block");
1744                 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1745                 if (fatal) {
1746                         *errp = fatal;
1747                         return -1;
1748                 }
1749                 return ret;
1750         }
1751
1752         BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1753         ext4_journal_release_buffer(handle, bitmap_bh);
1754         return ret;
1755 }
1756
1757 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
1758                                                 ext4_fsblk_t nblocks)
1759 {
1760         s64 free_blocks;
1761         ext4_fsblk_t root_blocks = 0;
1762         struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
1763
1764         free_blocks = percpu_counter_read(fbc);
1765
1766         if (!capable(CAP_SYS_RESOURCE) &&
1767                 sbi->s_resuid != current->fsuid &&
1768                 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1769                 root_blocks = ext4_r_blocks_count(sbi->s_es);
1770
1771         if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
1772                 free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
1773
1774         if (free_blocks < (root_blocks + nblocks))
1775                 /* we don't have free space */
1776                 return -ENOSPC;
1777
1778         /* reduce fs free blocks counter */
1779         percpu_counter_sub(fbc, nblocks);
1780         return 0;
1781 }
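/*
 * Editorial sketch (not part of balloc.c): the approximate-then-exact
 * counter pattern behind ext4_claim_free_blocks().  A per-CPU style counter
 * is cheap to read but may be off by a bounded amount; only when the cheap
 * read comes within that error margin (the watermark) of the request do we
 * pay for an exact sum before deciding.  Every "demo_" name is invented.
 */
#include <stddef.h>

#define DEMO_NCPUS 4

struct demo_counter {
        long long base;                 /* global part, cheap to read */
        long long delta[DEMO_NCPUS];    /* per-CPU parts, costly to sum */
};

static long long demo_read_fast(const struct demo_counter *c)
{
        return c->base;                 /* ignores the deltas: approximate */
}

static long long demo_sum_exact(const struct demo_counter *c)
{
        long long sum = c->base;
        int i;

        for (i = 0; i < DEMO_NCPUS; i++)
                sum += c->delta[i];
        return sum;
}

/* Claim nblocks while keeping "reserved" blocks back (e.g. for root). */
static int demo_claim(struct demo_counter *c, long long nblocks,
                      long long reserved, long long watermark)
{
        long long free = demo_read_fast(c);

        if (free - (nblocks + reserved) < watermark)
                free = demo_sum_exact(c);       /* slow but precise */

        if (free < nblocks + reserved)
                return -1;                      /* would overcommit */

        c->base -= nblocks;                     /* claim the blocks */
        return 0;
}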
1782
1783 /**
1784  * ext4_has_free_blocks()
1785  * @sbi:        in-core super block structure.
1786  * @nblocks:    number of needed blocks
1787  *
1788  * Check if the filesystem has free blocks available for allocation.
1789  * Return the number of blocks available for allocation for this request.
1790  * On success, return nblocks.
1791  */
1792 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1793                                                 ext4_fsblk_t nblocks)
1794 {
1795         ext4_fsblk_t free_blocks;
1796         ext4_fsblk_t root_blocks = 0;
1797
1798         free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1799
1800         if (!capable(CAP_SYS_RESOURCE) &&
1801                 sbi->s_resuid != current->fsuid &&
1802                 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1803                 root_blocks = ext4_r_blocks_count(sbi->s_es);
1804
1805         if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
1806                 free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
1807
1808         if (free_blocks <= root_blocks)
1809                 /* we don't have free space */
1810                 return 0;
1811         if (free_blocks - root_blocks < nblocks)
1812                 return free_blocks - root_blocks;
1813         return nblocks;
1814 }
1815
1816
1817 /**
1818  * ext4_should_retry_alloc()
1819  * @sb:                 super block
1820  * @retries:            number of attempts that have been made
1821  *
1822  * ext4_should_retry_alloc() is called when ENOSPC is returned; if
1823  * it is profitable to retry the operation, this function will wait
1824  * for the current or committing transaction to complete, and then
1825  * return TRUE.
1826  *
1827  * If the total number of retries exceeds three, return FALSE.
1828  */
1829 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1830 {
1831         if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
1832                 return 0;
1833
1834         jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1835
1836         return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1837 }
1838
1839 /**
1840  * ext4_old_new_blocks() -- core block bitmap based block allocation function
1841  *
1842  * @handle:             handle to this transaction
1843  * @inode:              file inode
1844  * @goal:               given target block(filesystem wide)
1845  * @count:              target number of blocks to allocate
1846  * @errp:               error code
1847  *
1848  * ext4_old_new_blocks uses a goal block to assist allocation and looks up
1849  * the block bitmap directly to do block allocation.  It first tries to
1850  * allocate block(s) from the block group that contains the goal block. If
1851  * that fails, it will try to allocate block(s) from other block groups
1852  * without any specific goal block.
1853  *
1854  * This function is called when the -o nomballoc mount option is enabled.
1855  *
1856  */
1857 ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1858                         ext4_fsblk_t goal, unsigned long *count, int *errp)
1859 {
1860         struct buffer_head *bitmap_bh = NULL;
1861         struct buffer_head *gdp_bh;
1862         ext4_group_t group_no;
1863         ext4_group_t goal_group;
1864         ext4_grpblk_t grp_target_blk;   /* blockgroup relative goal block */
1865         ext4_grpblk_t grp_alloc_blk;    /* blockgroup-relative allocated block*/
1866         ext4_fsblk_t ret_block;         /* filesystem-wide allocated block */
1867         ext4_group_t bgi;                       /* blockgroup iteration index */
1868         int fatal = 0, err;
1869         int performed_allocation = 0;
1870         ext4_grpblk_t free_blocks;      /* number of free blocks in a group */
1871         struct super_block *sb;
1872         struct ext4_group_desc *gdp;
1873         struct ext4_super_block *es;
1874         struct ext4_sb_info *sbi;
1875         struct ext4_reserve_window_node *my_rsv = NULL;
1876         struct ext4_block_alloc_info *block_i;
1877         unsigned short windowsz = 0;
1878         ext4_group_t ngroups;
1879         unsigned long num = *count;
1880
1881         sb = inode->i_sb;
1882         if (!sb) {
1883                 *errp = -ENODEV;
1884                 printk(KERN_ERR "ext4_new_block: nonexistent device\n");
1885                 return 0;
1886         }
1887
1888         sbi = EXT4_SB(sb);
1889         if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1890                 /*
1891                  * With delalloc we already reserved the blocks
1892                  */
1893                 if (ext4_claim_free_blocks(sbi, *count)) {
1894                         *errp = -ENOSPC;
1895                         return 0;       /*return with ENOSPC error */
1896                 }
1897         }
1898         /*
1899          * Check quota for allocation of this block.
1900          */
1901         if (DQUOT_ALLOC_BLOCK(inode, num)) {
1902                 *errp = -EDQUOT;
1903                 return 0;
1904         }
1905
1906         sbi = EXT4_SB(sb);
1907         es = EXT4_SB(sb)->s_es;
1908         ext4_debug("goal=%llu.\n", goal);
1909         /*
1910          * Allocate a block from reservation only when
1911          * filesystem is mounted with reservation(default,-o reservation), and
1912          * it's a regular file, and
1913          * the desired window size is greater than 0 (One could use ioctl
1914          * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1915          * reservation on that particular file)
1916          */
1917         block_i = EXT4_I(inode)->i_block_alloc_info;
1918         if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1919                 my_rsv = &block_i->rsv_window_node;
1920
1921         /*
1922          * First, test whether the goal block is free.
1923          */
1924         if (goal < le32_to_cpu(es->s_first_data_block) ||
1925             goal >= ext4_blocks_count(es))
1926                 goal = le32_to_cpu(es->s_first_data_block);
1927         ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1928         goal_group = group_no;
1929 retry_alloc:
1930         gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1931         if (!gdp)
1932                 goto io_error;
1933
1934         free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1935         /*
1936          * if there are not enough free blocks to make a new reservation,
1937          * turn off reservations for this allocation
1938          */
1939         if (my_rsv && (free_blocks < windowsz)
1940                 && (rsv_is_empty(&my_rsv->rsv_window)))
1941                 my_rsv = NULL;
1942
1943         if (free_blocks > 0) {
1944                 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1945                 if (!bitmap_bh)
1946                         goto io_error;
1947                 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1948                                         group_no, bitmap_bh, grp_target_blk,
1949                                         my_rsv, &num, &fatal);
1950                 if (fatal)
1951                         goto out;
1952                 if (grp_alloc_blk >= 0)
1953                         goto allocated;
1954         }
1955
1956         ngroups = EXT4_SB(sb)->s_groups_count;
1957         smp_rmb();
1958
1959         /*
1960          * Now search the rest of the groups.  We assume that
1961          * group_no and gdp correctly point to the last group visited.
1962          */
1963         for (bgi = 0; bgi < ngroups; bgi++) {
1964                 group_no++;
1965                 if (group_no >= ngroups)
1966                         group_no = 0;
1967                 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1968                 if (!gdp)
1969                         goto io_error;
1970                 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1971                 /*
1972                  * skip this group if the number of
1973                  * free blocks is not more than half of the reservation
1974                  * window size.
1975                  */
1976                 if (free_blocks <= (windowsz/2))
1977                         continue;
1978
1979                 brelse(bitmap_bh);
1980                 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1981                 if (!bitmap_bh)
1982                         goto io_error;
1983                 /*
1984                  * try to allocate block(s) from this group, without a goal (-1).
1985                  */
1986                 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1987                                         group_no, bitmap_bh, -1, my_rsv,
1988                                         &num, &fatal);
1989                 if (fatal)
1990                         goto out;
1991                 if (grp_alloc_blk >= 0)
1992                         goto allocated;
1993         }
1994         /*
1995          * We may end up with a bogus early ENOSPC error because the
1996          * filesystem is "full" of reservations, while there may
1997          * indeed be free blocks available on disk.
1998          * In this case, we just forget about the reservations and
1999          * do the block allocation as if there were no reservations.
2000          */
2001         if (my_rsv) {
2002                 my_rsv = NULL;
2003                 windowsz = 0;
2004                 group_no = goal_group;
2005                 goto retry_alloc;
2006         }
2007         /* No space left on the device */
2008         *errp = -ENOSPC;
2009         goto out;
2010
2011 allocated:
2012
2013         ext4_debug("using block group %lu(%d)\n",
2014                         group_no, gdp->bg_free_blocks_count);
2015
2016         BUFFER_TRACE(gdp_bh, "get_write_access");
2017         fatal = ext4_journal_get_write_access(handle, gdp_bh);
2018         if (fatal)
2019                 goto out;
2020
2021         ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
2022
2023         if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
2024             in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
2025             in_range(ret_block, ext4_inode_table(sb, gdp),
2026                      EXT4_SB(sb)->s_itb_per_group) ||
2027             in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
2028                      EXT4_SB(sb)->s_itb_per_group)) {
2029                 ext4_error(sb, "ext4_new_block",
2030                             "Allocating block in system zone - "
2031                             "blocks from %llu, length %lu",
2032                              ret_block, num);
2033                 /*
2034                  * claim_block marked the blocks we allocated
2035                  * as in use. So we may want to selectively
2036                  * mark some of the blocks as free
2037                  */
2038                 goto retry_alloc;
2039         }
2040
2041         performed_allocation = 1;
2042
2043 #ifdef CONFIG_JBD2_DEBUG
2044         {
2045                 struct buffer_head *debug_bh;
2046
2047                 /* Record bitmap buffer state in the newly allocated block */
2048                 debug_bh = sb_find_get_block(sb, ret_block);
2049                 if (debug_bh) {
2050                         BUFFER_TRACE(debug_bh, "state when allocated");
2051                         BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
2052                         brelse(debug_bh);
2053                 }
2054         }
2055         jbd_lock_bh_state(bitmap_bh);
2056         spin_lock(sb_bgl_lock(sbi, group_no));
2057         if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
2058                 int i;
2059
2060                 for (i = 0; i < num; i++) {
2061                         if (ext4_test_bit(grp_alloc_blk+i,
2062                                         bh2jh(bitmap_bh)->b_committed_data)) {
2063                                 printk("%s: block was unexpectedly set in "
2064                                         "b_committed_data\n", __func__);
2065                         }
2066                 }
2067         }
2068         ext4_debug("found bit %d\n", grp_alloc_blk);
2069         spin_unlock(sb_bgl_lock(sbi, group_no));
2070         jbd_unlock_bh_state(bitmap_bh);
2071 #endif
2072
2073         if (ret_block + num - 1 >= ext4_blocks_count(es)) {
2074                 ext4_error(sb, "ext4_new_block",
2075                             "block(%llu) >= blocks count(%llu) - "
2076                             "block_group = %lu, es == %p ", ret_block,
2077                         ext4_blocks_count(es), group_no, es);
2078                 goto out;
2079         }
2080
2081         /*
2082          * It is up to the caller to add the new buffer to a journal
2083          * list of some description.  We don't know in advance whether
2084          * the caller wants to use it as metadata or data.
2085          */
2086         spin_lock(sb_bgl_lock(sbi, group_no));
2087         if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
2088                 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2089         le16_add_cpu(&gdp->bg_free_blocks_count, -num);
2090         gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
2091         spin_unlock(sb_bgl_lock(sbi, group_no));
2092         if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
2093                 /*
2094                  * we allocated fewer blocks than we
2095                  * claimed. Add the difference back.
2096                  */
2097                 percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
2098         }
2099         if (sbi->s_log_groups_per_flex) {
2100                 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
2101                 spin_lock(sb_bgl_lock(sbi, flex_group));
2102                 sbi->s_flex_groups[flex_group].free_blocks -= num;
2103                 spin_unlock(sb_bgl_lock(sbi, flex_group));
2104         }
2105
2106         BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
2107         err = ext4_journal_dirty_metadata(handle, gdp_bh);
2108         if (!fatal)
2109                 fatal = err;
2110
2111         sb->s_dirt = 1;
2112         if (fatal)
2113                 goto out;
2114
2115         *errp = 0;
2116         brelse(bitmap_bh);
2117         DQUOT_FREE_BLOCK(inode, *count-num);
2118         *count = num;
2119         return ret_block;
2120
2121 io_error:
2122         *errp = -EIO;
2123 out:
2124         if (fatal) {
2125                 *errp = fatal;
2126                 ext4_std_error(sb, fatal);
2127         }
2128         /*
2129          * Undo the block allocation
2130          */
2131         if (!performed_allocation)
2132                 DQUOT_FREE_BLOCK(inode, *count);
2133         brelse(bitmap_bh);
2134         return 0;
2135 }
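/*
 * Editorial sketch (not part of balloc.c): the group scan order used by
 * ext4_old_new_blocks() above.  The goal group is tried first; on failure
 * every other group is visited once, wrapping around at the end of the
 * filesystem.  try_group() stands in for the per-group allocation attempt;
 * every "demo_" name is invented for the example.
 */
static long demo_scan_groups(unsigned long goal_group, unsigned long ngroups,
                             long (*try_group)(unsigned long group, void *arg),
                             void *arg)
{
        unsigned long group = goal_group;
        unsigned long bgi;
        long ret;

        ret = try_group(group, arg);            /* goal group first */
        if (ret >= 0)
                return ret;

        for (bgi = 0; bgi < ngroups; bgi++) {   /* then the rest, wrapping */
                if (++group >= ngroups)
                        group = 0;
                ret = try_group(group, arg);
                if (ret >= 0)
                        return ret;
        }
        return -1;                              /* every group is full */
}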
2136
2137 #define EXT4_META_BLOCK 0x1
2138
2139 static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
2140                                 ext4_lblk_t iblock, ext4_fsblk_t goal,
2141                                 unsigned long *count, int *errp, int flags)
2142 {
2143         struct ext4_allocation_request ar;
2144         ext4_fsblk_t ret;
2145
2146         if (!test_opt(inode->i_sb, MBALLOC)) {
2147                 return ext4_old_new_blocks(handle, inode, goal, count, errp);
2148         }
2149
2150         memset(&ar, 0, sizeof(ar));
2151         /* Fill with neighbour allocated blocks */
2152
2153         ar.inode = inode;
2154         ar.goal = goal;
2155         ar.len = *count;
2156         ar.logical = iblock;
2157
2158         if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
2159                 /* enable in-core preallocation for data block allocation */
2160                 ar.flags = EXT4_MB_HINT_DATA;
2161         else
2162                 /* disable in-core preallocation for non-regular files */
2163                 ar.flags = 0;
2164
2165         ret = ext4_mb_new_blocks(handle, &ar, errp);
2166         *count = ar.len;
2167         return ret;
2168 }
2169
2170 /*
2171  * ext4_new_meta_blocks() -- allocate blocks for metadata (indexing) blocks
2172  *
2173  * @handle:             handle to this transaction
2174  * @inode:              file inode
2175  * @goal:               given target block (filesystem wide)
2176  * @count:              total number of blocks needed
2177  * @errp:               error code
2178  *
2179  * Return the first allocated block number on success; *count stores the
2180  * total number of blocks allocated, and the error code is stored in errp.
2181  */
2182 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
2183                 ext4_fsblk_t goal, unsigned long *count, int *errp)
2184 {
2185         ext4_fsblk_t ret;
2186         ret = do_blk_alloc(handle, inode, 0, goal,
2187                                 count, errp, EXT4_META_BLOCK);
2188         /*
2189          * Account for the allocated meta blocks
2190          */
2191         if (!(*errp)) {
2192                 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2193                 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2194                 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2195         }
2196         return ret;
2197 }
2198
2199 /*
2200  * ext4_new_meta_block() -- allocate a single metadata (indexing) block
2201  *
2202  * @handle:             handle to this transaction
2203  * @inode:              file inode
2204  * @goal:               given target block (filesystem wide)
2205  * @errp:               error code
2206  *
2207  * Return the allocated block number on success
2208  */
2209 ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
2210                 ext4_fsblk_t goal, int *errp)
2211 {
2212         unsigned long count = 1;
2213         return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
2214 }
2215
2216 /*
2217  * ext4_new_blocks() -- allocate data blocks
2218  *
2219  * @handle:             handle to this transaction
2220  * @inode:              file inode
2221  * @goal:               given target block (filesystem wide)
2222  * @count:              total number of blocks needed
2223  * @errp:               error code
2224  *
2225  * Return the first allocated block number on success; *count stores the
2226  * total number of blocks allocated, and the error code is stored in errp.
2227  */
2228
2229 ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
2230                                 ext4_lblk_t iblock, ext4_fsblk_t goal,
2231                                 unsigned long *count, int *errp)
2232 {
2233         return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
2234 }
2235
2236 /**
2237  * ext4_count_free_blocks() -- count filesystem free blocks
2238  * @sb:         superblock
2239  *
2240  * Adds up the number of free blocks from each block group.
2241  */
2242 ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
2243 {
2244         ext4_fsblk_t desc_count;
2245         struct ext4_group_desc *gdp;
2246         ext4_group_t i;
2247         ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2248 #ifdef EXT4FS_DEBUG
2249         struct ext4_super_block *es;
2250         ext4_fsblk_t bitmap_count;
2251         unsigned long x;
2252         struct buffer_head *bitmap_bh = NULL;
2253
2254         es = EXT4_SB(sb)->s_es;
2255         desc_count = 0;
2256         bitmap_count = 0;
2257         gdp = NULL;
2258
2259         smp_rmb();
2260         for (i = 0; i < ngroups; i++) {
2261                 gdp = ext4_get_group_desc(sb, i, NULL);
2262                 if (!gdp)
2263                         continue;
2264                 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
2265                 brelse(bitmap_bh);
2266                 bitmap_bh = ext4_read_block_bitmap(sb, i);
2267                 if (bitmap_bh == NULL)
2268                         continue;
2269
2270                 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
2271                 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
2272                         i, le16_to_cpu(gdp->bg_free_blocks_count), x);
2273                 bitmap_count += x;
2274         }
2275         brelse(bitmap_bh);
2276         printk("ext4_count_free_blocks: stored = %llu"
2277                 ", computed = %llu, %llu\n",
2278                 ext4_free_blocks_count(es),
2279                 desc_count, bitmap_count);
2280         return bitmap_count;
2281 #else
2282         desc_count = 0;
2283         smp_rmb();
2284         for (i = 0; i < ngroups; i++) {
2285                 gdp = ext4_get_group_desc(sb, i, NULL);
2286                 if (!gdp)
2287                         continue;
2288                 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
2289         }
2290
2291         return desc_count;
2292 #endif
2293 }
2294
2295 static inline int test_root(ext4_group_t a, int b)
2296 {
2297         int num = b;
2298
2299         while (a > num)
2300                 num *= b;
2301         return num == a;
2302 }
2303
2304 static int ext4_group_sparse(ext4_group_t group)
2305 {
2306         if (group <= 1)
2307                 return 1;
2308         if (!(group & 1))
2309                 return 0;
2310         return (test_root(group, 7) || test_root(group, 5) ||
2311                 test_root(group, 3));
2312 }
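/*
 * Editorial sketch (not part of balloc.c): with the sparse_super feature,
 * ext4_group_sparse() above keeps superblock backups only in groups 0 and 1
 * and in groups that are powers of 3, 5 or 7.  This standalone program
 * prints the first such groups: 0 1 3 5 7 9 25 27 49 81.  The "demo_"
 * helpers simply mirror test_root() and ext4_group_sparse().
 */
#include <stdio.h>

static int demo_test_root(unsigned long a, unsigned long b)
{
        unsigned long num = b;

        while (a > num)
                num *= b;
        return num == a;
}

static int demo_group_sparse(unsigned long group)
{
        if (group <= 1)
                return 1;
        if (!(group & 1))
                return 0;
        return demo_test_root(group, 7) || demo_test_root(group, 5) ||
               demo_test_root(group, 3);
}

int main(void)
{
        unsigned long group;

        for (group = 0; group < 100; group++)
                if (demo_group_sparse(group))
                        printf("%lu ", group);
        printf("\n");
        return 0;
}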
2313
2314 /**
2315  *      ext4_bg_has_super - number of blocks used by the superblock in group
2316  *      @sb: superblock for filesystem
2317  *      @group: group number to check
2318  *
2319  *      Return the number of blocks used by the superblock (primary or backup)
2320  *      in this group.  Currently this will be only 0 or 1.
2321  */
2322 int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
2323 {
2324         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
2325                                 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
2326                         !ext4_group_sparse(group))
2327                 return 0;
2328         return 1;
2329 }
2330
2331 static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
2332                                         ext4_group_t group)
2333 {
2334         unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
2335         ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
2336         ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
2337
2338         if (group == first || group == first + 1 || group == last)
2339                 return 1;
2340         return 0;
2341 }
2342
2343 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
2344                                         ext4_group_t group)
2345 {
2346         return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
2347 }
2348
2349 /**
2350  *      ext4_bg_num_gdb - number of blocks used by the group table in group
2351  *      @sb: superblock for filesystem
2352  *      @group: group number to check
2353  *
2354  *      Return the number of blocks used by the group descriptor table
2355  *      (primary or backup) in this group.  In the future there may be a
2356  *      different number of descriptor blocks in each group.
2357  */
2358 unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
2359 {
2360         unsigned long first_meta_bg =
2361                         le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
2362         unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
2363
2364         if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
2365                         metagroup < first_meta_bg)
2366                 return ext4_bg_num_gdb_nometa(sb, group);
2367 
2368         return ext4_bg_num_gdb_meta(sb, group);
2369
2370 }
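/*
 * Editorial sketch (not part of balloc.c): with META_BG, each metablock
 * group of "descs_per_block" block groups keeps its single descriptor block
 * only in the first, second and last group of that metagroup, mirroring
 * ext4_bg_num_gdb_meta() above.  descs_per_block stands in for
 * EXT4_DESC_PER_BLOCK(sb); the "demo_" name is invented for the example.
 */
static unsigned long demo_num_gdb_meta(unsigned long group,
                                       unsigned long descs_per_block)
{
        unsigned long first = (group / descs_per_block) * descs_per_block;
        unsigned long last = first + descs_per_block - 1;

        return (group == first || group == first + 1 || group == last) ? 1 : 0;
}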