1 /*
2  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3  * Written by Alex Tomas <alex@clusterfs.com>
4  *
5  * Architecture independence:
6  *   Copyright (c) 2005, Bull S.A.
7  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21  */
22
23 /*
24  * Extents support for EXT4
25  *
26  * TODO:
27  *   - ext4*_error() should be used in some situations
28  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29  *   - smart tree reduction
30  */
31
32 #include <linux/fs.h>
33 #include <linux/time.h>
34 #include <linux/jbd2.h>
35 #include <linux/highuid.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/string.h>
39 #include <linux/slab.h>
40 #include <asm/uaccess.h>
41 #include <linux/fiemap.h>
42 #include <linux/backing-dev.h>
43 #include "ext4_jbd2.h"
44 #include "ext4_extents.h"
45 #include "xattr.h"
46
47 #include <trace/events/ext4.h>
48
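/*
 * On-disk layout of an extent tree node, roughly: every node (the root
 * kept in the inode's i_data, or a separate block) begins with a 12-byte
 * struct ext4_extent_header, followed by an array of 12-byte entries --
 * struct ext4_extent at depth 0 (leaves) or struct ext4_extent_idx at
 * greater depths (indexes).  On non-root nodes a 4-byte
 * struct ext4_extent_tail holding the block checksum sits just past the
 * eh_max entries when metadata checksums are enabled; the root is
 * covered by the inode checksum instead.
 */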
49 /*
50  * used by extent splitting.
51  */
52 #define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
53                                         due to ENOSPC */
54 #define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
55 #define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
56
57 #define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
58 #define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
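/*
 * For example, ext4_force_split_extent_at() below splits an unwritten
 * extent with EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2 so that both
 * halves stay unwritten, and callers that can safely zero the range also
 * pass EXT4_EXT_MAY_ZEROOUT so a split that fails with ENOSPC can fall
 * back to zeroing out the extent instead.
 */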
59
60 static __le32 ext4_extent_block_csum(struct inode *inode,
61                                      struct ext4_extent_header *eh)
62 {
63         struct ext4_inode_info *ei = EXT4_I(inode);
64         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
65         __u32 csum;
66
67         csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
68                            EXT4_EXTENT_TAIL_OFFSET(eh));
69         return cpu_to_le32(csum);
70 }
71
72 static int ext4_extent_block_csum_verify(struct inode *inode,
73                                          struct ext4_extent_header *eh)
74 {
75         struct ext4_extent_tail *et;
76
77         if (!ext4_has_metadata_csum(inode->i_sb))
78                 return 1;
79
80         et = find_ext4_extent_tail(eh);
81         if (et->et_checksum != ext4_extent_block_csum(inode, eh))
82                 return 0;
83         return 1;
84 }
85
86 static void ext4_extent_block_csum_set(struct inode *inode,
87                                        struct ext4_extent_header *eh)
88 {
89         struct ext4_extent_tail *et;
90
91         if (!ext4_has_metadata_csum(inode->i_sb))
92                 return;
93
94         et = find_ext4_extent_tail(eh);
95         et->et_checksum = ext4_extent_block_csum(inode, eh);
96 }
97
98 static int ext4_split_extent(handle_t *handle,
99                                 struct inode *inode,
100                                 struct ext4_ext_path **ppath,
101                                 struct ext4_map_blocks *map,
102                                 int split_flag,
103                                 int flags);
104
105 static int ext4_split_extent_at(handle_t *handle,
106                              struct inode *inode,
107                              struct ext4_ext_path **ppath,
108                              ext4_lblk_t split,
109                              int split_flag,
110                              int flags);
111
112 static int ext4_find_delayed_extent(struct inode *inode,
113                                     struct extent_status *newes);
114
115 static int ext4_ext_truncate_extend_restart(handle_t *handle,
116                                             struct inode *inode,
117                                             int needed)
118 {
119         int err;
120
121         if (!ext4_handle_valid(handle))
122                 return 0;
123         if (handle->h_buffer_credits > needed)
124                 return 0;
125         err = ext4_journal_extend(handle, needed);
126         if (err <= 0)
127                 return err;
128         err = ext4_truncate_restart_trans(handle, inode, needed);
129         if (err == 0)
130                 err = -EAGAIN;
131
132         return err;
133 }
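/*
 * Return convention for the helper above: 0 means the handle now has
 * enough credits (it already had them, or the journal extend succeeded),
 * a negative value is an error, and -EAGAIN means the transaction was
 * restarted, so callers restart the operation since their extent path
 * may be stale.
 */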
134
135 /*
136  * could return:
137  *  - EROFS
138  *  - ENOMEM
139  */
140 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
141                                 struct ext4_ext_path *path)
142 {
143         if (path->p_bh) {
144                 /* path points to block */
145                 BUFFER_TRACE(path->p_bh, "get_write_access");
146                 return ext4_journal_get_write_access(handle, path->p_bh);
147         }
148         /* path points to leaf/index in inode body */
149         /* we use in-core data, no need to protect them */
150         return 0;
151 }
152
153 /*
154  * could return:
155  *  - EROFS
156  *  - ENOMEM
157  *  - EIO
158  */
159 int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
160                      struct inode *inode, struct ext4_ext_path *path)
161 {
162         int err;
163
164         WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
165         if (path->p_bh) {
166                 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
167                 /* path points to block */
168                 err = __ext4_handle_dirty_metadata(where, line, handle,
169                                                    inode, path->p_bh);
170         } else {
171                 /* path points to leaf/index in inode body */
172                 err = ext4_mark_inode_dirty(handle, inode);
173         }
174         return err;
175 }
176
177 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
178                               struct ext4_ext_path *path,
179                               ext4_lblk_t block)
180 {
181         if (path) {
182                 int depth = path->p_depth;
183                 struct ext4_extent *ex;
184
185                 /*
186                  * Try to predict block placement assuming that we are
187                  * filling in a file which will eventually be
188                  * non-sparse --- i.e., in the case of libbfd writing
189                  * an ELF object's sections out-of-order but in a way
190                  * that eventually results in a contiguous object or
191                  * executable file, or some database extending a table
192                  * space file.  However, this is actually somewhat
193                  * non-ideal if we are writing a sparse file such as
194                  * qemu or KVM writing a raw image file that is going
195                  * to stay fairly sparse, since it will end up
196                  * fragmenting the file system's free space.  Maybe we
197                  * should have some heuristics or some way to allow
198                  * userspace to pass a hint to the file system,
199                  * especially if the latter case turns out to be
200                  * common.
201                  */
202                 ex = path[depth].p_ext;
203                 if (ex) {
204                         ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
205                         ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
206
207                         if (block > ext_block)
208                                 return ext_pblk + (block - ext_block);
209                         else
210                                 return ext_pblk - (ext_block - block);
211                 }
212
213                 /* it looks like the index is empty;
214                  * try to find the starting block from the index itself */
215                 if (path[depth].p_bh)
216                         return path[depth].p_bh->b_blocknr;
217         }
218
219         /* OK. use inode's group */
220         return ext4_inode_to_goal_block(inode);
221 }
222
223 /*
224  * Allocation for a metadata block
225  */
226 static ext4_fsblk_t
227 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
228                         struct ext4_ext_path *path,
229                         struct ext4_extent *ex, int *err, unsigned int flags)
230 {
231         ext4_fsblk_t goal, newblock;
232
233         goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
234         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
235                                         NULL, err);
236         return newblock;
237 }
238
239 static inline int ext4_ext_space_block(struct inode *inode, int check)
240 {
241         int size;
242
243         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
244                         / sizeof(struct ext4_extent);
245 #ifdef AGGRESSIVE_TEST
246         if (!check && size > 6)
247                 size = 6;
248 #endif
249         return size;
250 }
251
252 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
253 {
254         int size;
255
256         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
257                         / sizeof(struct ext4_extent_idx);
258 #ifdef AGGRESSIVE_TEST
259         if (!check && size > 5)
260                 size = 5;
261 #endif
262         return size;
263 }
264
265 static inline int ext4_ext_space_root(struct inode *inode, int check)
266 {
267         int size;
268
269         size = sizeof(EXT4_I(inode)->i_data);
270         size -= sizeof(struct ext4_extent_header);
271         size /= sizeof(struct ext4_extent);
272 #ifdef AGGRESSIVE_TEST
273         if (!check && size > 3)
274                 size = 3;
275 #endif
276         return size;
277 }
278
279 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
280 {
281         int size;
282
283         size = sizeof(EXT4_I(inode)->i_data);
284         size -= sizeof(struct ext4_extent_header);
285         size /= sizeof(struct ext4_extent_idx);
286 #ifdef AGGRESSIVE_TEST
287         if (!check && size > 4)
288                 size = 4;
289 #endif
290         return size;
291 }
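/*
 * For illustration: the root node lives in the 60-byte i_data area, so
 * ext4_ext_space_root() and ext4_ext_space_root_idx() both work out to
 * (60 - 12) / 12 = 4 entries.  With the common 4 KiB block size a
 * non-root node holds (4096 - 12) / 12 = 340 entries, and the 4 bytes
 * left over hold the checksum tail.
 */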
292
293 static inline int
294 ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
295                            struct ext4_ext_path **ppath, ext4_lblk_t lblk,
296                            int nofail)
297 {
298         struct ext4_ext_path *path = *ppath;
299         int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
300
301         return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
302                         EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
303                         EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
304                         (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
305 }
306
307 /*
308  * Calculate the number of metadata blocks needed
309  * to allocate @blocks
310  * Worst case is one block per extent
311  */
312 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
313 {
314         struct ext4_inode_info *ei = EXT4_I(inode);
315         int idxs;
316
317         idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
318                 / sizeof(struct ext4_extent_idx));
319
320         /*
321          * If the new delayed allocation block is contiguous with the
322          * previous da block, it can share index blocks with the
323          * previous block, so we only need to allocate a new index
324          * block every idxs leaf blocks.  At idxs**2 blocks, we need
325          * an additional index block, and at idxs**3 blocks, yet
326          * another index block.
327          */
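        /*
         * E.g. with 4 KiB blocks idxs is 340, so extending a contiguous
         * run of delayed blocks usually reserves no extra metadata;
         * roughly every 340th block adds one extent tree block, every
         * 340^2-th a second (level-2 index) block, and so on.
         */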
328         if (ei->i_da_metadata_calc_len &&
329             ei->i_da_metadata_calc_last_lblock+1 == lblock) {
330                 int num = 0;
331
332                 if ((ei->i_da_metadata_calc_len % idxs) == 0)
333                         num++;
334                 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
335                         num++;
336                 if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
337                         num++;
338                         ei->i_da_metadata_calc_len = 0;
339                 } else
340                         ei->i_da_metadata_calc_len++;
341                 ei->i_da_metadata_calc_last_lblock++;
342                 return num;
343         }
344
345         /*
346          * In the worst case we need a new set of index blocks at
347          * every level of the inode's extent tree.
348          */
349         ei->i_da_metadata_calc_len = 1;
350         ei->i_da_metadata_calc_last_lblock = lblock;
351         return ext_depth(inode) + 1;
352 }
353
354 static int
355 ext4_ext_max_entries(struct inode *inode, int depth)
356 {
357         int max;
358
359         if (depth == ext_depth(inode)) {
360                 if (depth == 0)
361                         max = ext4_ext_space_root(inode, 1);
362                 else
363                         max = ext4_ext_space_root_idx(inode, 1);
364         } else {
365                 if (depth == 0)
366                         max = ext4_ext_space_block(inode, 1);
367                 else
368                         max = ext4_ext_space_block_idx(inode, 1);
369         }
370
371         return max;
372 }
373
374 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
375 {
376         ext4_fsblk_t block = ext4_ext_pblock(ext);
377         int len = ext4_ext_get_actual_len(ext);
378         ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
379         ext4_lblk_t last = lblock + len - 1;
380
381         if (len == 0 || lblock > last)
382                 return 0;
383         return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
384 }
385
386 static int ext4_valid_extent_idx(struct inode *inode,
387                                 struct ext4_extent_idx *ext_idx)
388 {
389         ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
390
391         return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
392 }
393
394 static int ext4_valid_extent_entries(struct inode *inode,
395                                 struct ext4_extent_header *eh,
396                                 int depth)
397 {
398         unsigned short entries;
399         if (eh->eh_entries == 0)
400                 return 1;
401
402         entries = le16_to_cpu(eh->eh_entries);
403
404         if (depth == 0) {
405                 /* leaf entries */
406                 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
407                 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
408                 ext4_fsblk_t pblock = 0;
409                 ext4_lblk_t lblock = 0;
410                 ext4_lblk_t prev = 0;
411                 int len = 0;
412                 while (entries) {
413                         if (!ext4_valid_extent(inode, ext))
414                                 return 0;
415
416                         /* Check for overlapping extents */
417                         lblock = le32_to_cpu(ext->ee_block);
418                         len = ext4_ext_get_actual_len(ext);
419                         if ((lblock <= prev) && prev) {
420                                 pblock = ext4_ext_pblock(ext);
421                                 es->s_last_error_block = cpu_to_le64(pblock);
422                                 return 0;
423                         }
424                         ext++;
425                         entries--;
426                         prev = lblock + len - 1;
427                 }
428         } else {
429                 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
430                 while (entries) {
431                         if (!ext4_valid_extent_idx(inode, ext_idx))
432                                 return 0;
433                         ext_idx++;
434                         entries--;
435                 }
436         }
437         return 1;
438 }
439
440 static int __ext4_ext_check(const char *function, unsigned int line,
441                             struct inode *inode, struct ext4_extent_header *eh,
442                             int depth, ext4_fsblk_t pblk)
443 {
444         const char *error_msg;
445         int max = 0, err = -EFSCORRUPTED;
446
447         if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
448                 error_msg = "invalid magic";
449                 goto corrupted;
450         }
451         if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
452                 error_msg = "unexpected eh_depth";
453                 goto corrupted;
454         }
455         if (unlikely(eh->eh_max == 0)) {
456                 error_msg = "invalid eh_max";
457                 goto corrupted;
458         }
459         max = ext4_ext_max_entries(inode, depth);
460         if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
461                 error_msg = "too large eh_max";
462                 goto corrupted;
463         }
464         if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
465                 error_msg = "invalid eh_entries";
466                 goto corrupted;
467         }
468         if (!ext4_valid_extent_entries(inode, eh, depth)) {
469                 error_msg = "invalid extent entries";
470                 goto corrupted;
471         }
472         /* Verify checksum on non-root extent tree nodes */
473         if (ext_depth(inode) != depth &&
474             !ext4_extent_block_csum_verify(inode, eh)) {
475                 error_msg = "extent tree corrupted";
476                 err = -EFSBADCRC;
477                 goto corrupted;
478         }
479         return 0;
480
481 corrupted:
482         ext4_error_inode(inode, function, line, 0,
483                          "pblk %llu bad header/extent: %s - magic %x, "
484                          "entries %u, max %u(%u), depth %u(%u)",
485                          (unsigned long long) pblk, error_msg,
486                          le16_to_cpu(eh->eh_magic),
487                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
488                          max, le16_to_cpu(eh->eh_depth), depth);
489         return err;
490 }
491
492 #define ext4_ext_check(inode, eh, depth, pblk)                  \
493         __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
494
495 int ext4_ext_check_inode(struct inode *inode)
496 {
497         return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
498 }
499
500 static struct buffer_head *
501 __read_extent_tree_block(const char *function, unsigned int line,
502                          struct inode *inode, ext4_fsblk_t pblk, int depth,
503                          int flags)
504 {
505         struct buffer_head              *bh;
506         int                             err;
507
508         bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
509         if (unlikely(!bh))
510                 return ERR_PTR(-ENOMEM);
511
512         if (!bh_uptodate_or_lock(bh)) {
513                 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
514                 err = bh_submit_read(bh);
515                 if (err < 0)
516                         goto errout;
517         }
518         if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
519                 return bh;
520         err = __ext4_ext_check(function, line, inode,
521                                ext_block_hdr(bh), depth, pblk);
522         if (err)
523                 goto errout;
524         set_buffer_verified(bh);
525         /*
526          * If this is a leaf block, cache all of its entries
527          */
528         if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
529                 struct ext4_extent_header *eh = ext_block_hdr(bh);
530                 struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
531                 ext4_lblk_t prev = 0;
532                 int i;
533
534                 for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
535                         unsigned int status = EXTENT_STATUS_WRITTEN;
536                         ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
537                         int len = ext4_ext_get_actual_len(ex);
538
539                         if (prev && (prev != lblk))
540                                 ext4_es_cache_extent(inode, prev,
541                                                      lblk - prev, ~0,
542                                                      EXTENT_STATUS_HOLE);
543
544                         if (ext4_ext_is_unwritten(ex))
545                                 status = EXTENT_STATUS_UNWRITTEN;
546                         ext4_es_cache_extent(inode, lblk, len,
547                                              ext4_ext_pblock(ex), status);
548                         prev = lblk + len;
549                 }
550         }
551         return bh;
552 errout:
553         put_bh(bh);
554         return ERR_PTR(err);
555
556 }
557
558 #define read_extent_tree_block(inode, pblk, depth, flags)               \
559         __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
560                                  (depth), (flags))
561
562 /*
563  * This function is called to cache a file's extent information in the
564  * extent status tree
565  */
566 int ext4_ext_precache(struct inode *inode)
567 {
568         struct ext4_inode_info *ei = EXT4_I(inode);
569         struct ext4_ext_path *path = NULL;
570         struct buffer_head *bh;
571         int i = 0, depth, ret = 0;
572
573         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
574                 return 0;       /* not an extent-mapped inode */
575
576         down_read(&ei->i_data_sem);
577         depth = ext_depth(inode);
578
579         path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
580                        GFP_NOFS);
581         if (path == NULL) {
582                 up_read(&ei->i_data_sem);
583                 return -ENOMEM;
584         }
585
586         /* Don't cache anything if there are no external extent blocks */
587         if (depth == 0)
588                 goto out;
589         path[0].p_hdr = ext_inode_hdr(inode);
590         ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
591         if (ret)
592                 goto out;
593         path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
594         while (i >= 0) {
595                 /*
596                  * If this is a leaf block or we've reached the end of
597                  * the index block, go up
598                  */
599                 if ((i == depth) ||
600                     path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
601                         brelse(path[i].p_bh);
602                         path[i].p_bh = NULL;
603                         i--;
604                         continue;
605                 }
606                 bh = read_extent_tree_block(inode,
607                                             ext4_idx_pblock(path[i].p_idx++),
608                                             depth - i - 1,
609                                             EXT4_EX_FORCE_CACHE);
610                 if (IS_ERR(bh)) {
611                         ret = PTR_ERR(bh);
612                         break;
613                 }
614                 i++;
615                 path[i].p_bh = bh;
616                 path[i].p_hdr = ext_block_hdr(bh);
617                 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
618         }
619         ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
620 out:
621         up_read(&ei->i_data_sem);
622         ext4_ext_drop_refs(path);
623         kfree(path);
624         return ret;
625 }
626
627 #ifdef EXT_DEBUG
628 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
629 {
630         int k, l = path->p_depth;
631
632         ext_debug("path:");
633         for (k = 0; k <= l; k++, path++) {
634                 if (path->p_idx) {
635                   ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
636                             ext4_idx_pblock(path->p_idx));
637                 } else if (path->p_ext) {
638                         ext_debug("  %d:[%d]%d:%llu ",
639                                   le32_to_cpu(path->p_ext->ee_block),
640                                   ext4_ext_is_unwritten(path->p_ext),
641                                   ext4_ext_get_actual_len(path->p_ext),
642                                   ext4_ext_pblock(path->p_ext));
643                 } else
644                         ext_debug("  []");
645         }
646         ext_debug("\n");
647 }
648
649 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
650 {
651         int depth = ext_depth(inode);
652         struct ext4_extent_header *eh;
653         struct ext4_extent *ex;
654         int i;
655
656         if (!path)
657                 return;
658
659         eh = path[depth].p_hdr;
660         ex = EXT_FIRST_EXTENT(eh);
661
662         ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
663
664         for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
665                 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
666                           ext4_ext_is_unwritten(ex),
667                           ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
668         }
669         ext_debug("\n");
670 }
671
672 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
673                         ext4_fsblk_t newblock, int level)
674 {
675         int depth = ext_depth(inode);
676         struct ext4_extent *ex;
677
678         if (depth != level) {
679                 struct ext4_extent_idx *idx;
680                 idx = path[level].p_idx;
681                 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
682                         ext_debug("%d: move %d:%llu in new index %llu\n", level,
683                                         le32_to_cpu(idx->ei_block),
684                                         ext4_idx_pblock(idx),
685                                         newblock);
686                         idx++;
687                 }
688
689                 return;
690         }
691
692         ex = path[depth].p_ext;
693         while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
694                 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
695                                 le32_to_cpu(ex->ee_block),
696                                 ext4_ext_pblock(ex),
697                                 ext4_ext_is_unwritten(ex),
698                                 ext4_ext_get_actual_len(ex),
699                                 newblock);
700                 ex++;
701         }
702 }
703
704 #else
705 #define ext4_ext_show_path(inode, path)
706 #define ext4_ext_show_leaf(inode, path)
707 #define ext4_ext_show_move(inode, path, newblock, level)
708 #endif
709
710 void ext4_ext_drop_refs(struct ext4_ext_path *path)
711 {
712         int depth, i;
713
714         if (!path)
715                 return;
716         depth = path->p_depth;
717         for (i = 0; i <= depth; i++, path++)
718                 if (path->p_bh) {
719                         brelse(path->p_bh);
720                         path->p_bh = NULL;
721                 }
722 }
723
724 /*
725  * ext4_ext_binsearch_idx:
726  * binary search for the closest index of the given block
727  * the header must be checked before calling this
728  */
729 static void
730 ext4_ext_binsearch_idx(struct inode *inode,
731                         struct ext4_ext_path *path, ext4_lblk_t block)
732 {
733         struct ext4_extent_header *eh = path->p_hdr;
734         struct ext4_extent_idx *r, *l, *m;
735
736
737         ext_debug("binsearch for %u(idx):  ", block);
738
739         l = EXT_FIRST_INDEX(eh) + 1;
740         r = EXT_LAST_INDEX(eh);
741         while (l <= r) {
742                 m = l + (r - l) / 2;
743                 if (block < le32_to_cpu(m->ei_block))
744                         r = m - 1;
745                 else
746                         l = m + 1;
747                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
748                                 m, le32_to_cpu(m->ei_block),
749                                 r, le32_to_cpu(r->ei_block));
750         }
751
752         path->p_idx = l - 1;
753         ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
754                   ext4_idx_pblock(path->p_idx));
755
756 #ifdef CHECK_BINSEARCH
757         {
758                 struct ext4_extent_idx *chix, *ix;
759                 int k;
760
761                 chix = ix = EXT_FIRST_INDEX(eh);
762                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
763                   if (k != 0 &&
764                       le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
765                                 printk(KERN_DEBUG "k=%d, ix=0x%p, "
766                                        "first=0x%p\n", k,
767                                        ix, EXT_FIRST_INDEX(eh));
768                                 printk(KERN_DEBUG "%u <= %u\n",
769                                        le32_to_cpu(ix->ei_block),
770                                        le32_to_cpu(ix[-1].ei_block));
771                         }
772                         BUG_ON(k && le32_to_cpu(ix->ei_block)
773                                            <= le32_to_cpu(ix[-1].ei_block));
774                         if (block < le32_to_cpu(ix->ei_block))
775                                 break;
776                         chix = ix;
777                 }
778                 BUG_ON(chix != path->p_idx);
779         }
780 #endif
781
782 }
783
784 /*
785  * ext4_ext_binsearch:
786  * binary search for closest extent of the given block
787  * the header must be checked before calling this
788  */
789 static void
790 ext4_ext_binsearch(struct inode *inode,
791                 struct ext4_ext_path *path, ext4_lblk_t block)
792 {
793         struct ext4_extent_header *eh = path->p_hdr;
794         struct ext4_extent *r, *l, *m;
795
796         if (eh->eh_entries == 0) {
797                 /*
798                  * this leaf is empty:
799                  * we get such a leaf in split/add case
800                  */
801                 return;
802         }
803
804         ext_debug("binsearch for %u:  ", block);
805
806         l = EXT_FIRST_EXTENT(eh) + 1;
807         r = EXT_LAST_EXTENT(eh);
808
809         while (l <= r) {
810                 m = l + (r - l) / 2;
811                 if (block < le32_to_cpu(m->ee_block))
812                         r = m - 1;
813                 else
814                         l = m + 1;
815                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
816                                 m, le32_to_cpu(m->ee_block),
817                                 r, le32_to_cpu(r->ee_block));
818         }
819
820         path->p_ext = l - 1;
821         ext_debug("  -> %d:%llu:[%d]%d ",
822                         le32_to_cpu(path->p_ext->ee_block),
823                         ext4_ext_pblock(path->p_ext),
824                         ext4_ext_is_unwritten(path->p_ext),
825                         ext4_ext_get_actual_len(path->p_ext));
826
827 #ifdef CHECK_BINSEARCH
828         {
829                 struct ext4_extent *chex, *ex;
830                 int k;
831
832                 chex = ex = EXT_FIRST_EXTENT(eh);
833                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
834                         BUG_ON(k && le32_to_cpu(ex->ee_block)
835                                           <= le32_to_cpu(ex[-1].ee_block));
836                         if (block < le32_to_cpu(ex->ee_block))
837                                 break;
838                         chex = ex;
839                 }
840                 BUG_ON(chex != path->p_ext);
841         }
842 #endif
843
844 }
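/*
 * The result convention for both searches above: p_ext (or p_idx) is
 * left pointing at the last entry whose starting block is <= @block, or
 * at the first entry when @block precedes everything in the node.  E.g.
 * in a leaf with extents starting at logical blocks 0, 100 and 300, a
 * search for block 150 leaves p_ext at the extent starting at 100.
 */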
845
846 int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
847 {
848         struct ext4_extent_header *eh;
849
850         eh = ext_inode_hdr(inode);
851         eh->eh_depth = 0;
852         eh->eh_entries = 0;
853         eh->eh_magic = EXT4_EXT_MAGIC;
854         eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
855         ext4_mark_inode_dirty(handle, inode);
856         return 0;
857 }
858
859 struct ext4_ext_path *
860 ext4_find_extent(struct inode *inode, ext4_lblk_t block,
861                  struct ext4_ext_path **orig_path, int flags)
862 {
863         struct ext4_extent_header *eh;
864         struct buffer_head *bh;
865         struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
866         short int depth, i, ppos = 0;
867         int ret;
868
869         eh = ext_inode_hdr(inode);
870         depth = ext_depth(inode);
871
872         if (path) {
873                 ext4_ext_drop_refs(path);
874                 if (depth > path[0].p_maxdepth) {
875                         kfree(path);
876                         *orig_path = path = NULL;
877                 }
878         }
879         if (!path) {
880                 /* account possible depth increase */
881                 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
882                                 GFP_NOFS);
883                 if (unlikely(!path))
884                         return ERR_PTR(-ENOMEM);
885                 path[0].p_maxdepth = depth + 1;
886         }
887         path[0].p_hdr = eh;
888         path[0].p_bh = NULL;
889
890         i = depth;
891         /* walk through the tree */
892         while (i) {
893                 ext_debug("depth %d: num %d, max %d\n",
894                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
895
896                 ext4_ext_binsearch_idx(inode, path + ppos, block);
897                 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
898                 path[ppos].p_depth = i;
899                 path[ppos].p_ext = NULL;
900
901                 bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
902                                             flags);
903                 if (IS_ERR(bh)) {
904                         ret = PTR_ERR(bh);
905                         goto err;
906                 }
907
908                 eh = ext_block_hdr(bh);
909                 ppos++;
910                 if (unlikely(ppos > depth)) {
911                         put_bh(bh);
912                         EXT4_ERROR_INODE(inode,
913                                          "ppos %d > depth %d", ppos, depth);
914                         ret = -EFSCORRUPTED;
915                         goto err;
916                 }
917                 path[ppos].p_bh = bh;
918                 path[ppos].p_hdr = eh;
919         }
920
921         path[ppos].p_depth = i;
922         path[ppos].p_ext = NULL;
923         path[ppos].p_idx = NULL;
924
925         /* find extent */
926         ext4_ext_binsearch(inode, path + ppos, block);
927         /* if not an empty leaf */
928         if (path[ppos].p_ext)
929                 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
930
931         ext4_ext_show_path(inode, path);
932
933         return path;
934
935 err:
936         ext4_ext_drop_refs(path);
937         kfree(path);
938         if (orig_path)
939                 *orig_path = NULL;
940         return ERR_PTR(ret);
941 }
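/*
 * A sketch of typical use of the path returned above: path[0] describes
 * the root (in the inode body), path[k].p_idx is the index chosen at
 * level k, and path[depth].p_ext is the found extent (NULL for an empty
 * leaf).  Callers own the result:
 *
 *      path = ext4_find_extent(inode, lblk, NULL, 0);
 *      if (IS_ERR(path))
 *              return PTR_ERR(path);
 *      ...
 *      ext4_ext_drop_refs(path);
 *      kfree(path);
 *
 * or they pass &path as @orig_path so the array can be reused; on error
 * the path is freed and *orig_path is cleared.
 */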
942
943 /*
944  * ext4_ext_insert_index:
945  * insert new index [@logical;@ptr] into the block at @curp;
946  * check where to insert: before @curp or after @curp
947  */
948 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
949                                  struct ext4_ext_path *curp,
950                                  int logical, ext4_fsblk_t ptr)
951 {
952         struct ext4_extent_idx *ix;
953         int len, err;
954
955         err = ext4_ext_get_access(handle, inode, curp);
956         if (err)
957                 return err;
958
959         if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
960                 EXT4_ERROR_INODE(inode,
961                                  "logical %d == ei_block %d!",
962                                  logical, le32_to_cpu(curp->p_idx->ei_block));
963                 return -EFSCORRUPTED;
964         }
965
966         if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
967                              >= le16_to_cpu(curp->p_hdr->eh_max))) {
968                 EXT4_ERROR_INODE(inode,
969                                  "eh_entries %d >= eh_max %d!",
970                                  le16_to_cpu(curp->p_hdr->eh_entries),
971                                  le16_to_cpu(curp->p_hdr->eh_max));
972                 return -EFSCORRUPTED;
973         }
974
975         if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
976                 /* insert after */
977                 ext_debug("insert new index %d after: %llu\n", logical, ptr);
978                 ix = curp->p_idx + 1;
979         } else {
980                 /* insert before */
981                 ext_debug("insert new index %d before: %llu\n", logical, ptr);
982                 ix = curp->p_idx;
983         }
984
985         len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
986         BUG_ON(len < 0);
987         if (len > 0) {
988                 ext_debug("insert new index %d: "
989                                 "move %d indices from 0x%p to 0x%p\n",
990                                 logical, len, ix, ix + 1);
991                 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
992         }
993
994         if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
995                 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
996                 return -EFSCORRUPTED;
997         }
998
999         ix->ei_block = cpu_to_le32(logical);
1000         ext4_idx_store_pblock(ix, ptr);
1001         le16_add_cpu(&curp->p_hdr->eh_entries, 1);
1002
1003         if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
1004                 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
1005                 return -EFSCORRUPTED;
1006         }
1007
1008         err = ext4_ext_dirty(handle, inode, curp);
1009         ext4_std_error(inode->i_sb, err);
1010
1011         return err;
1012 }
1013
1014 /*
1015  * ext4_ext_split:
1016  * inserts new subtree into the path, using free index entry
1017  * at depth @at:
1018  * - allocates all needed blocks (new leaf and all intermediate index blocks)
1019  * - makes decision where to split
1020  * - moves remaining extents and index entries (right to the split point)
1021  *   into the newly allocated blocks
1022  * - initializes subtree
1023  */
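/*
 * A rough example: for a depth-1 tree with a full leaf and @at == 0 (the
 * root still has a free index slot), one new leaf block is allocated,
 * the extents after the current one are moved into it, and
 * ext4_ext_insert_index() adds an entry for @border (the first moved
 * logical block) into the root.  For deeper trees the same move happens
 * at every level between @at and the leaf, with all new blocks tracked
 * in ablocks[] so they can be freed if an error occurs.
 */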
1024 static int ext4_ext_split(handle_t *handle, struct inode *inode,
1025                           unsigned int flags,
1026                           struct ext4_ext_path *path,
1027                           struct ext4_extent *newext, int at)
1028 {
1029         struct buffer_head *bh = NULL;
1030         int depth = ext_depth(inode);
1031         struct ext4_extent_header *neh;
1032         struct ext4_extent_idx *fidx;
1033         int i = at, k, m, a;
1034         ext4_fsblk_t newblock, oldblock;
1035         __le32 border;
1036         ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1037         int err = 0;
1038
1039         /* make decision: where to split? */
1040         /* FIXME: now decision is simplest: at current extent */
1041
1042         /* if current leaf will be split, then we should use
1043          * border from split point */
1044         if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1045                 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1046                 return -EFSCORRUPTED;
1047         }
1048         if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1049                 border = path[depth].p_ext[1].ee_block;
1050                 ext_debug("leaf will be split."
1051                                 " next leaf starts at %d\n",
1052                                   le32_to_cpu(border));
1053         } else {
1054                 border = newext->ee_block;
1055                 ext_debug("leaf will be added."
1056                                 " next leaf starts at %d\n",
1057                                 le32_to_cpu(border));
1058         }
1059
1060         /*
1061          * If an error occurs, we stop processing and mark the
1062          * filesystem read-only.  The index won't be inserted and
1063          * the tree will remain in a consistent state.  The next
1064          * mount will repair the buffers too.
1065          */
1066
1067         /*
1068          * Get array to track all allocated blocks.
1069          * We need this so that the blocks already allocated
1070          * can be freed if an error occurs.
1071          */
1072         ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
1073         if (!ablocks)
1074                 return -ENOMEM;
1075
1076         /* allocate all needed blocks */
1077         ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1078         for (a = 0; a < depth - at; a++) {
1079                 newblock = ext4_ext_new_meta_block(handle, inode, path,
1080                                                    newext, &err, flags);
1081                 if (newblock == 0)
1082                         goto cleanup;
1083                 ablocks[a] = newblock;
1084         }
1085
1086         /* initialize new leaf */
1087         newblock = ablocks[--a];
1088         if (unlikely(newblock == 0)) {
1089                 EXT4_ERROR_INODE(inode, "newblock == 0!");
1090                 err = -EFSCORRUPTED;
1091                 goto cleanup;
1092         }
1093         bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1094         if (unlikely(!bh)) {
1095                 err = -ENOMEM;
1096                 goto cleanup;
1097         }
1098         lock_buffer(bh);
1099
1100         err = ext4_journal_get_create_access(handle, bh);
1101         if (err)
1102                 goto cleanup;
1103
1104         neh = ext_block_hdr(bh);
1105         neh->eh_entries = 0;
1106         neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1107         neh->eh_magic = EXT4_EXT_MAGIC;
1108         neh->eh_depth = 0;
1109
1110         /* move remainder of path[depth] to the new leaf */
1111         if (unlikely(path[depth].p_hdr->eh_entries !=
1112                      path[depth].p_hdr->eh_max)) {
1113                 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1114                                  path[depth].p_hdr->eh_entries,
1115                                  path[depth].p_hdr->eh_max);
1116                 err = -EFSCORRUPTED;
1117                 goto cleanup;
1118         }
1119         /* start copy from next extent */
1120         m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1121         ext4_ext_show_move(inode, path, newblock, depth);
1122         if (m) {
1123                 struct ext4_extent *ex;
1124                 ex = EXT_FIRST_EXTENT(neh);
1125                 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1126                 le16_add_cpu(&neh->eh_entries, m);
1127         }
1128
1129         ext4_extent_block_csum_set(inode, neh);
1130         set_buffer_uptodate(bh);
1131         unlock_buffer(bh);
1132
1133         err = ext4_handle_dirty_metadata(handle, inode, bh);
1134         if (err)
1135                 goto cleanup;
1136         brelse(bh);
1137         bh = NULL;
1138
1139         /* correct old leaf */
1140         if (m) {
1141                 err = ext4_ext_get_access(handle, inode, path + depth);
1142                 if (err)
1143                         goto cleanup;
1144                 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1145                 err = ext4_ext_dirty(handle, inode, path + depth);
1146                 if (err)
1147                         goto cleanup;
1148
1149         }
1150
1151         /* create intermediate indexes */
1152         k = depth - at - 1;
1153         if (unlikely(k < 0)) {
1154                 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1155                 err = -EFSCORRUPTED;
1156                 goto cleanup;
1157         }
1158         if (k)
1159                 ext_debug("create %d intermediate indices\n", k);
1160         /* insert new index into current index block */
1161         /* current depth stored in i var */
1162         i = depth - 1;
1163         while (k--) {
1164                 oldblock = newblock;
1165                 newblock = ablocks[--a];
1166                 bh = sb_getblk(inode->i_sb, newblock);
1167                 if (unlikely(!bh)) {
1168                         err = -ENOMEM;
1169                         goto cleanup;
1170                 }
1171                 lock_buffer(bh);
1172
1173                 err = ext4_journal_get_create_access(handle, bh);
1174                 if (err)
1175                         goto cleanup;
1176
1177                 neh = ext_block_hdr(bh);
1178                 neh->eh_entries = cpu_to_le16(1);
1179                 neh->eh_magic = EXT4_EXT_MAGIC;
1180                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1181                 neh->eh_depth = cpu_to_le16(depth - i);
1182                 fidx = EXT_FIRST_INDEX(neh);
1183                 fidx->ei_block = border;
1184                 ext4_idx_store_pblock(fidx, oldblock);
1185
1186                 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1187                                 i, newblock, le32_to_cpu(border), oldblock);
1188
1189                 /* move remainder of path[i] to the new index block */
1190                 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1191                                         EXT_LAST_INDEX(path[i].p_hdr))) {
1192                         EXT4_ERROR_INODE(inode,
1193                                          "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1194                                          le32_to_cpu(path[i].p_ext->ee_block));
1195                         err = -EFSCORRUPTED;
1196                         goto cleanup;
1197                 }
1198                 /* start copy indexes */
1199                 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1200                 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1201                                 EXT_MAX_INDEX(path[i].p_hdr));
1202                 ext4_ext_show_move(inode, path, newblock, i);
1203                 if (m) {
1204                         memmove(++fidx, path[i].p_idx,
1205                                 sizeof(struct ext4_extent_idx) * m);
1206                         le16_add_cpu(&neh->eh_entries, m);
1207                 }
1208                 ext4_extent_block_csum_set(inode, neh);
1209                 set_buffer_uptodate(bh);
1210                 unlock_buffer(bh);
1211
1212                 err = ext4_handle_dirty_metadata(handle, inode, bh);
1213                 if (err)
1214                         goto cleanup;
1215                 brelse(bh);
1216                 bh = NULL;
1217
1218                 /* correct old index */
1219                 if (m) {
1220                         err = ext4_ext_get_access(handle, inode, path + i);
1221                         if (err)
1222                                 goto cleanup;
1223                         le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1224                         err = ext4_ext_dirty(handle, inode, path + i);
1225                         if (err)
1226                                 goto cleanup;
1227                 }
1228
1229                 i--;
1230         }
1231
1232         /* insert new index */
1233         err = ext4_ext_insert_index(handle, inode, path + at,
1234                                     le32_to_cpu(border), newblock);
1235
1236 cleanup:
1237         if (bh) {
1238                 if (buffer_locked(bh))
1239                         unlock_buffer(bh);
1240                 brelse(bh);
1241         }
1242
1243         if (err) {
1244                 /* free all allocated blocks in error case */
1245                 for (i = 0; i < depth; i++) {
1246                         if (!ablocks[i])
1247                                 continue;
1248                         ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1249                                          EXT4_FREE_BLOCKS_METADATA);
1250                 }
1251         }
1252         kfree(ablocks);
1253
1254         return err;
1255 }
1256
1257 /*
1258  * ext4_ext_grow_indepth:
1259  * implements tree growing procedure:
1260  * - allocates new block
1261  * - moves top-level data (index block or leaf) into the new block
1262  * - initializes new top-level, creating index that points to the
1263  *   just created block
1264  */
1265 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1266                                  unsigned int flags)
1267 {
1268         struct ext4_extent_header *neh;
1269         struct buffer_head *bh;
1270         ext4_fsblk_t newblock, goal = 0;
1271         struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1272         int err = 0;
1273
1274         /* Try to prepend new index to old one */
1275         if (ext_depth(inode))
1276                 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1277         if (goal > le32_to_cpu(es->s_first_data_block)) {
1278                 flags |= EXT4_MB_HINT_TRY_GOAL;
1279                 goal--;
1280         } else
1281                 goal = ext4_inode_to_goal_block(inode);
1282         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1283                                         NULL, &err);
1284         if (newblock == 0)
1285                 return err;
1286
1287         bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1288         if (unlikely(!bh))
1289                 return -ENOMEM;
1290         lock_buffer(bh);
1291
1292         err = ext4_journal_get_create_access(handle, bh);
1293         if (err) {
1294                 unlock_buffer(bh);
1295                 goto out;
1296         }
1297
1298         /* move top-level index/leaf into new block */
1299         memmove(bh->b_data, EXT4_I(inode)->i_data,
1300                 sizeof(EXT4_I(inode)->i_data));
1301
1302         /* set size of new block */
1303         neh = ext_block_hdr(bh);
1304         /* old root could have indexes or leaves
1305          * so calculate eh_max the right way */
1306         if (ext_depth(inode))
1307                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1308         else
1309                 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1310         neh->eh_magic = EXT4_EXT_MAGIC;
1311         ext4_extent_block_csum_set(inode, neh);
1312         set_buffer_uptodate(bh);
1313         unlock_buffer(bh);
1314
1315         err = ext4_handle_dirty_metadata(handle, inode, bh);
1316         if (err)
1317                 goto out;
1318
1319         /* Update top-level index: num,max,pointer */
1320         neh = ext_inode_hdr(inode);
1321         neh->eh_entries = cpu_to_le16(1);
1322         ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1323         if (neh->eh_depth == 0) {
1324                 /* Root extent block becomes index block */
1325                 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1326                 EXT_FIRST_INDEX(neh)->ei_block =
1327                         EXT_FIRST_EXTENT(neh)->ee_block;
1328         }
1329         ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1330                   le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1331                   le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1332                   ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1333
1334         le16_add_cpu(&neh->eh_depth, 1);
1335         ext4_mark_inode_dirty(handle, inode);
1336 out:
1337         brelse(bh);
1338
1339         return err;
1340 }
1341
1342 /*
1343  * ext4_ext_create_new_leaf:
1344  * finds empty index and adds new leaf.
1345  * if no free index is found, then the tree is grown in depth.
1346  */
1347 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1348                                     unsigned int mb_flags,
1349                                     unsigned int gb_flags,
1350                                     struct ext4_ext_path **ppath,
1351                                     struct ext4_extent *newext)
1352 {
1353         struct ext4_ext_path *path = *ppath;
1354         struct ext4_ext_path *curp;
1355         int depth, i, err = 0;
1356
1357 repeat:
1358         i = depth = ext_depth(inode);
1359
1360         /* walk up the tree and look for a free index entry */
1361         curp = path + depth;
1362         while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1363                 i--;
1364                 curp--;
1365         }
1366
1367         /* we use an already allocated block for the index block,
1368          * so subsequent data blocks should be contiguous */
1369         if (EXT_HAS_FREE_INDEX(curp)) {
1370                 /* if we found index with free entry, then use that
1371                  * entry: create all needed subtree and add new leaf */
1372                 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1373                 if (err)
1374                         goto out;
1375
1376                 /* refill path */
1377                 path = ext4_find_extent(inode,
1378                                     (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1379                                     ppath, gb_flags);
1380                 if (IS_ERR(path))
1381                         err = PTR_ERR(path);
1382         } else {
1383                 /* tree is full, time to grow in depth */
1384                 err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1385                 if (err)
1386                         goto out;
1387
1388                 /* refill path */
1389                 path = ext4_find_extent(inode,
1390                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1391                                     ppath, gb_flags);
1392                 if (IS_ERR(path)) {
1393                         err = PTR_ERR(path);
1394                         goto out;
1395                 }
1396
1397                 /*
1398                  * only first (depth 0 -> 1) produces free space;
1399                  * in all other cases we have to split the grown tree
1400                  */
1401                 depth = ext_depth(inode);
1402                 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1403                         /* now we need to split */
1404                         goto repeat;
1405                 }
1406         }
1407
1408 out:
1409         return err;
1410 }
1411
1412 /*
1413  * search the closest allocated block to the left for *logical
1414  * and returns it at @logical + its physical address at @phys
1415  * if *logical is the smallest allocated block, the function
1416  * returns 0 at @phys
1417  * return value contains 0 (success) or error code
1418  */
1419 static int ext4_ext_search_left(struct inode *inode,
1420                                 struct ext4_ext_path *path,
1421                                 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1422 {
1423         struct ext4_extent_idx *ix;
1424         struct ext4_extent *ex;
1425         int depth, ee_len;
1426
1427         if (unlikely(path == NULL)) {
1428                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1429                 return -EFSCORRUPTED;
1430         }
1431         depth = path->p_depth;
1432         *phys = 0;
1433
1434         if (depth == 0 && path->p_ext == NULL)
1435                 return 0;
1436
1437         /* usually the extent in the path covers blocks smaller
1438          * than *logical, but it can be that the extent is the
1439          * first one in the file */
1440
1441         ex = path[depth].p_ext;
1442         ee_len = ext4_ext_get_actual_len(ex);
1443         if (*logical < le32_to_cpu(ex->ee_block)) {
1444                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1445                         EXT4_ERROR_INODE(inode,
1446                                          "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1447                                          *logical, le32_to_cpu(ex->ee_block));
1448                         return -EFSCORRUPTED;
1449                 }
1450                 while (--depth >= 0) {
1451                         ix = path[depth].p_idx;
1452                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1453                                 EXT4_ERROR_INODE(inode,
1454                                   "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1455                                   ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1456                                   EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1457                 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1458                                   depth);
1459                                 return -EFSCORRUPTED;
1460                         }
1461                 }
1462                 return 0;
1463         }
1464
1465         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1466                 EXT4_ERROR_INODE(inode,
1467                                  "logical %d < ee_block %d + ee_len %d!",
1468                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1469                 return -EFSCORRUPTED;
1470         }
1471
1472         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1473         *phys = ext4_ext_pblock(ex) + ee_len - 1;
1474         return 0;
1475 }
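/*
 * Illustrative example (editor's note; the numbers below are hypothetical
 * and not part of the original source): if the leaf reached by the path
 * holds a single extent mapping logical blocks 100..109, a call with
 * *logical == 150 returns 0, setting *logical = 109 and *phys to the
 * physical block backing logical block 109; a call with *logical == 50
 * (to the left of the first extent) returns 0 and leaves *phys == 0.
 */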
1476
1477 /*
1478  * search for the closest allocated block to the right of *logical
1479  * and return it at @logical together with its physical address at @phys.
1480  * if *logical is the largest allocated block, the function
1481  * returns 0 at @phys.
1482  * return value is 0 (success) or an error code.
1483  */
1484 static int ext4_ext_search_right(struct inode *inode,
1485                                  struct ext4_ext_path *path,
1486                                  ext4_lblk_t *logical, ext4_fsblk_t *phys,
1487                                  struct ext4_extent **ret_ex)
1488 {
1489         struct buffer_head *bh = NULL;
1490         struct ext4_extent_header *eh;
1491         struct ext4_extent_idx *ix;
1492         struct ext4_extent *ex;
1493         ext4_fsblk_t block;
1494         int depth;      /* Note, NOT eh_depth; depth from top of tree */
1495         int ee_len;
1496
1497         if (unlikely(path == NULL)) {
1498                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1499                 return -EFSCORRUPTED;
1500         }
1501         depth = path->p_depth;
1502         *phys = 0;
1503
1504         if (depth == 0 && path->p_ext == NULL)
1505                 return 0;
1506
1507         /* usually the extent in the path covers blocks smaller
1508          * than *logical, but it can be that the extent is the
1509          * first one in the file */
1510
1511         ex = path[depth].p_ext;
1512         ee_len = ext4_ext_get_actual_len(ex);
1513         if (*logical < le32_to_cpu(ex->ee_block)) {
1514                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1515                         EXT4_ERROR_INODE(inode,
1516                                          "first_extent(path[%d].p_hdr) != ex",
1517                                          depth);
1518                         return -EFSCORRUPTED;
1519                 }
1520                 while (--depth >= 0) {
1521                         ix = path[depth].p_idx;
1522                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1523                                 EXT4_ERROR_INODE(inode,
1524                                                  "ix != EXT_FIRST_INDEX *logical %d!",
1525                                                  *logical);
1526                                 return -EFSCORRUPTED;
1527                         }
1528                 }
1529                 goto found_extent;
1530         }
1531
1532         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1533                 EXT4_ERROR_INODE(inode,
1534                                  "logical %d < ee_block %d + ee_len %d!",
1535                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1536                 return -EFSCORRUPTED;
1537         }
1538
1539         if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1540                 /* next allocated block in this leaf */
1541                 ex++;
1542                 goto found_extent;
1543         }
1544
1545         /* go up and search for index to the right */
1546         while (--depth >= 0) {
1547                 ix = path[depth].p_idx;
1548                 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1549                         goto got_index;
1550         }
1551
1552         /* we've gone up to the root and found no index to the right */
1553         return 0;
1554
1555 got_index:
1556         /* we've found index to the right, let's
1557          * follow it and find the closest allocated
1558          * block to the right */
1559         ix++;
1560         block = ext4_idx_pblock(ix);
1561         while (++depth < path->p_depth) {
1562                 /* subtract from p_depth to get proper eh_depth */
1563                 bh = read_extent_tree_block(inode, block,
1564                                             path->p_depth - depth, 0);
1565                 if (IS_ERR(bh))
1566                         return PTR_ERR(bh);
1567                 eh = ext_block_hdr(bh);
1568                 ix = EXT_FIRST_INDEX(eh);
1569                 block = ext4_idx_pblock(ix);
1570                 put_bh(bh);
1571         }
1572
1573         bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1574         if (IS_ERR(bh))
1575                 return PTR_ERR(bh);
1576         eh = ext_block_hdr(bh);
1577         ex = EXT_FIRST_EXTENT(eh);
1578 found_extent:
1579         *logical = le32_to_cpu(ex->ee_block);
1580         *phys = ext4_ext_pblock(ex);
1581         *ret_ex = ex;
1582         if (bh)
1583                 put_bh(bh);
1584         return 0;
1585 }
1586
1587 /*
1588  * ext4_ext_next_allocated_block:
1589  * returns the allocated block in the subsequent extent or EXT_MAX_BLOCKS.
1590  * NOTE: it considers the block number from an index entry as an
1591  * allocated block. Thus, index entries have to be consistent
1592  * with the leaves.
1593  */
1594 ext4_lblk_t
1595 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1596 {
1597         int depth;
1598
1599         BUG_ON(path == NULL);
1600         depth = path->p_depth;
1601
1602         if (depth == 0 && path->p_ext == NULL)
1603                 return EXT_MAX_BLOCKS;
1604
1605         while (depth >= 0) {
1606                 if (depth == path->p_depth) {
1607                         /* leaf */
1608                         if (path[depth].p_ext &&
1609                                 path[depth].p_ext !=
1610                                         EXT_LAST_EXTENT(path[depth].p_hdr))
1611                           return le32_to_cpu(path[depth].p_ext[1].ee_block);
1612                 } else {
1613                         /* index */
1614                         if (path[depth].p_idx !=
1615                                         EXT_LAST_INDEX(path[depth].p_hdr))
1616                           return le32_to_cpu(path[depth].p_idx[1].ei_block);
1617                 }
1618                 depth--;
1619         }
1620
1621         return EXT_MAX_BLOCKS;
1622 }
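/*
 * Illustrative example (editor's note; hypothetical numbers): if the leaf
 * in the path contains extents starting at logical blocks 100 (len 10)
 * and 300 (len 5) and p_ext points at the first one, the routine above
 * returns 300.  If p_ext is the last extent in its leaf, the answer comes
 * from the next index entry one level up, or EXT_MAX_BLOCKS when nothing
 * is allocated to the right.
 */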
1623
1624 /*
1625  * ext4_ext_next_leaf_block:
1626  * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1627  */
1628 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1629 {
1630         int depth;
1631
1632         BUG_ON(path == NULL);
1633         depth = path->p_depth;
1634
1635         /* a zero-depth tree has no leaf blocks at all */
1636         if (depth == 0)
1637                 return EXT_MAX_BLOCKS;
1638
1639         /* go to index block */
1640         depth--;
1641
1642         while (depth >= 0) {
1643                 if (path[depth].p_idx !=
1644                                 EXT_LAST_INDEX(path[depth].p_hdr))
1645                         return (ext4_lblk_t)
1646                                 le32_to_cpu(path[depth].p_idx[1].ei_block);
1647                 depth--;
1648         }
1649
1650         return EXT_MAX_BLOCKS;
1651 }
1652
1653 /*
1654  * ext4_ext_correct_indexes:
1655  * if the leaf gets modified and the modified extent is the first in the
1656  * leaf, then we have to correct all indexes above.
1657  * TODO: do we need to correct tree in all cases?
1658  */
1659 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1660                                 struct ext4_ext_path *path)
1661 {
1662         struct ext4_extent_header *eh;
1663         int depth = ext_depth(inode);
1664         struct ext4_extent *ex;
1665         __le32 border;
1666         int k, err = 0;
1667
1668         eh = path[depth].p_hdr;
1669         ex = path[depth].p_ext;
1670
1671         if (unlikely(ex == NULL || eh == NULL)) {
1672                 EXT4_ERROR_INODE(inode,
1673                                  "ex %p == NULL or eh %p == NULL", ex, eh);
1674                 return -EFSCORRUPTED;
1675         }
1676
1677         if (depth == 0) {
1678                 /* there is no tree at all */
1679                 return 0;
1680         }
1681
1682         if (ex != EXT_FIRST_EXTENT(eh)) {
1683                 /* we correct the tree only if the first extent in the leaf was modified */
1684                 return 0;
1685         }
1686
1687         /*
1688          * TODO: we need correction if border is smaller than current one
1689          */
1690         k = depth - 1;
1691         border = path[depth].p_ext->ee_block;
1692         err = ext4_ext_get_access(handle, inode, path + k);
1693         if (err)
1694                 return err;
1695         path[k].p_idx->ei_block = border;
1696         err = ext4_ext_dirty(handle, inode, path + k);
1697         if (err)
1698                 return err;
1699
1700         while (k--) {
1701                 /* change all left-side indexes */
1702                 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1703                         break;
1704                 err = ext4_ext_get_access(handle, inode, path + k);
1705                 if (err)
1706                         break;
1707                 path[k].p_idx->ei_block = border;
1708                 err = ext4_ext_dirty(handle, inode, path + k);
1709                 if (err)
1710                         break;
1711         }
1712
1713         return err;
1714 }
1715
1716 int
1717 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1718                                 struct ext4_extent *ex2)
1719 {
1720         unsigned short ext1_ee_len, ext2_ee_len;
1721
1722         if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1723                 return 0;
1724
1725         ext1_ee_len = ext4_ext_get_actual_len(ex1);
1726         ext2_ee_len = ext4_ext_get_actual_len(ex2);
1727
1728         if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1729                         le32_to_cpu(ex2->ee_block))
1730                 return 0;
1731
1732         /*
1733          * To allow future support for preallocated extents to be added
1734          * as an RO_COMPAT feature, refuse to merge two extents if
1735          * this can result in the top bit of ee_len being set.
1736          */
1737         if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1738                 return 0;
1739         if (ext4_ext_is_unwritten(ex1) &&
1740             (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1741              atomic_read(&EXT4_I(inode)->i_unwritten) ||
1742              (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
1743                 return 0;
1744 #ifdef AGGRESSIVE_TEST
1745         if (ext1_ee_len >= 4)
1746                 return 0;
1747 #endif
1748
1749         if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1750                 return 1;
1751         return 0;
1752 }
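/*
 * Illustrative example (editor's note; hypothetical numbers, and assuming
 * AGGRESSIVE_TEST is not defined): two written extents
 * ex1 = {ee_block 100, len 8, pblk 5000} and
 * ex2 = {ee_block 108, len 4, pblk 5008} are logically and physically
 * contiguous, share the same written state and have a combined length of
 * 12 <= EXT_INIT_MAX_LEN, so ext4_can_extents_be_merged() returns 1.
 * If ex2 instead started at pblk 6000, it would return 0.
 */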
1753
1754 /*
1755  * This function tries to merge the "ex" extent to the next extent in the tree.
1756  * It always tries to merge towards the right. If you want to merge
1757  * towards the left, pass "ex - 1" as the argument instead of "ex".
1758  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1759  * 1 if they got merged.
1760  */
1761 static int ext4_ext_try_to_merge_right(struct inode *inode,
1762                                  struct ext4_ext_path *path,
1763                                  struct ext4_extent *ex)
1764 {
1765         struct ext4_extent_header *eh;
1766         unsigned int depth, len;
1767         int merge_done = 0, unwritten;
1768
1769         depth = ext_depth(inode);
1770         BUG_ON(path[depth].p_hdr == NULL);
1771         eh = path[depth].p_hdr;
1772
1773         while (ex < EXT_LAST_EXTENT(eh)) {
1774                 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1775                         break;
1776                 /* merge with next extent! */
1777                 unwritten = ext4_ext_is_unwritten(ex);
1778                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1779                                 + ext4_ext_get_actual_len(ex + 1));
1780                 if (unwritten)
1781                         ext4_ext_mark_unwritten(ex);
1782
1783                 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1784                         len = (EXT_LAST_EXTENT(eh) - ex - 1)
1785                                 * sizeof(struct ext4_extent);
1786                         memmove(ex + 1, ex + 2, len);
1787                 }
1788                 le16_add_cpu(&eh->eh_entries, -1);
1789                 merge_done = 1;
1790                 WARN_ON(eh->eh_entries == 0);
1791                 if (!eh->eh_entries)
1792                         EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1793         }
1794
1795         return merge_done;
1796 }
1797
1798 /*
1799  * This function does a very simple check to see if we can collapse
1800  * an extent tree with a single extent tree leaf block into the inode.
1801  */
1802 static void ext4_ext_try_to_merge_up(handle_t *handle,
1803                                      struct inode *inode,
1804                                      struct ext4_ext_path *path)
1805 {
1806         size_t s;
1807         unsigned max_root = ext4_ext_space_root(inode, 0);
1808         ext4_fsblk_t blk;
1809
1810         if ((path[0].p_depth != 1) ||
1811             (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1812             (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1813                 return;
1814
1815         /*
1816          * We need to modify the block allocation bitmap and the block
1817          * group descriptor to release the extent tree block.  If we
1818          * can't get the journal credits, give up.
1819          */
1820         if (ext4_journal_extend(handle, 2))
1821                 return;
1822
1823         /*
1824          * Copy the extent data up to the inode
1825          */
1826         blk = ext4_idx_pblock(path[0].p_idx);
1827         s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1828                 sizeof(struct ext4_extent_idx);
1829         s += sizeof(struct ext4_extent_header);
1830
1831         path[1].p_maxdepth = path[0].p_maxdepth;
1832         memcpy(path[0].p_hdr, path[1].p_hdr, s);
1833         path[0].p_depth = 0;
1834         path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1835                 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1836         path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1837
1838         brelse(path[1].p_bh);
1839         ext4_free_blocks(handle, inode, NULL, blk, 1,
1840                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1841 }
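/*
 * Editor's note (illustrative, not part of the original source): the
 * collapse above only fires for a depth-1 tree whose root holds exactly
 * one index entry and whose single leaf has no more entries than fit in
 * the inode's i_data root (max_root).  In that case, and provided the
 * handle can be extended by two credits, the leaf contents are copied
 * into the inode, the depth drops to 0 and the former leaf block is freed.
 */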
1842
1843 /*
1844  * This function tries to merge the @ex extent with its neighbours in the
1845  * tree: first towards the left, then towards the right.
1846  */
1847 static void ext4_ext_try_to_merge(handle_t *handle,
1848                                   struct inode *inode,
1849                                   struct ext4_ext_path *path,
1850                                   struct ext4_extent *ex) {
1851         struct ext4_extent_header *eh;
1852         unsigned int depth;
1853         int merge_done = 0;
1854
1855         depth = ext_depth(inode);
1856         BUG_ON(path[depth].p_hdr == NULL);
1857         eh = path[depth].p_hdr;
1858
1859         if (ex > EXT_FIRST_EXTENT(eh))
1860                 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1861
1862         if (!merge_done)
1863                 (void) ext4_ext_try_to_merge_right(inode, path, ex);
1864
1865         ext4_ext_try_to_merge_up(handle, inode, path);
1866 }
1867
1868 /*
1869  * check if a portion of the "newext" extent overlaps with an
1870  * existing extent.
1871  *
1872  * If there is an overlap discovered, it updates the length of the newext
1873  * such that there will be no overlap, and then returns 1.
1874  * If there is no overlap found, it returns 0.
1875  */
1876 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1877                                            struct inode *inode,
1878                                            struct ext4_extent *newext,
1879                                            struct ext4_ext_path *path)
1880 {
1881         ext4_lblk_t b1, b2;
1882         unsigned int depth, len1;
1883         unsigned int ret = 0;
1884
1885         b1 = le32_to_cpu(newext->ee_block);
1886         len1 = ext4_ext_get_actual_len(newext);
1887         depth = ext_depth(inode);
1888         if (!path[depth].p_ext)
1889                 goto out;
1890         b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1891
1892         /*
1893          * get the next allocated block if the extent in the path
1894          * is before the requested block(s)
1895          */
1896         if (b2 < b1) {
1897                 b2 = ext4_ext_next_allocated_block(path);
1898                 if (b2 == EXT_MAX_BLOCKS)
1899                         goto out;
1900                 b2 = EXT4_LBLK_CMASK(sbi, b2);
1901         }
1902
1903         /* check for wrap through zero on extent logical start block */
1904         if (b1 + len1 < b1) {
1905                 len1 = EXT_MAX_BLOCKS - b1;
1906                 newext->ee_len = cpu_to_le16(len1);
1907                 ret = 1;
1908         }
1909
1910         /* check for overlap */
1911         if (b1 + len1 > b2) {
1912                 newext->ee_len = cpu_to_le16(b2 - b1);
1913                 ret = 1;
1914         }
1915 out:
1916         return ret;
1917 }
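/*
 * Illustrative example (editor's note; hypothetical numbers, cluster
 * ratio 1): if newext covers logical blocks 100..149 (len 50) and the
 * closest allocated block at or after it is b2 == 120, the function
 * above trims newext->ee_len to 120 - 100 = 20 blocks and returns 1.
 * If nothing is allocated up to EXT_MAX_BLOCKS, newext is left untouched
 * and 0 is returned.
 */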
1918
1919 /*
1920  * ext4_ext_insert_extent:
1921  * tries to merge the requested extent into an existing extent or
1922  * inserts the requested extent as a new one into the tree,
1923  * creating a new leaf in the no-space case.
1924  */
1925 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1926                                 struct ext4_ext_path **ppath,
1927                                 struct ext4_extent *newext, int gb_flags)
1928 {
1929         struct ext4_ext_path *path = *ppath;
1930         struct ext4_extent_header *eh;
1931         struct ext4_extent *ex, *fex;
1932         struct ext4_extent *nearex; /* nearest extent */
1933         struct ext4_ext_path *npath = NULL;
1934         int depth, len, err;
1935         ext4_lblk_t next;
1936         int mb_flags = 0, unwritten;
1937
1938         if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1939                 mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1940         if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1941                 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1942                 return -EFSCORRUPTED;
1943         }
1944         depth = ext_depth(inode);
1945         ex = path[depth].p_ext;
1946         eh = path[depth].p_hdr;
1947         if (unlikely(path[depth].p_hdr == NULL)) {
1948                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1949                 return -EFSCORRUPTED;
1950         }
1951
1952         /* try to insert block into found extent and return */
1953         if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1954
1955                 /*
1956                  * Try to see whether we should rather test the extent to
1957                  * the right of ex, or to the left of ex. This is because
1958                  * ext4_find_extent() can return either the extent on the
1959                  * left or on the right of the searched position. This
1960                  * will make merging more effective.
1961                  */
1962                 if (ex < EXT_LAST_EXTENT(eh) &&
1963                     (le32_to_cpu(ex->ee_block) +
1964                     ext4_ext_get_actual_len(ex) <
1965                     le32_to_cpu(newext->ee_block))) {
1966                         ex += 1;
1967                         goto prepend;
1968                 } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1969                            (le32_to_cpu(newext->ee_block) +
1970                            ext4_ext_get_actual_len(newext) <
1971                            le32_to_cpu(ex->ee_block)))
1972                         ex -= 1;
1973
1974                 /* Try to append newext to ex */
1975                 if (ext4_can_extents_be_merged(inode, ex, newext)) {
1976                         ext_debug("append [%d]%d block to %u:[%d]%d"
1977                                   "(from %llu)\n",
1978                                   ext4_ext_is_unwritten(newext),
1979                                   ext4_ext_get_actual_len(newext),
1980                                   le32_to_cpu(ex->ee_block),
1981                                   ext4_ext_is_unwritten(ex),
1982                                   ext4_ext_get_actual_len(ex),
1983                                   ext4_ext_pblock(ex));
1984                         err = ext4_ext_get_access(handle, inode,
1985                                                   path + depth);
1986                         if (err)
1987                                 return err;
1988                         unwritten = ext4_ext_is_unwritten(ex);
1989                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1990                                         + ext4_ext_get_actual_len(newext));
1991                         if (unwritten)
1992                                 ext4_ext_mark_unwritten(ex);
1993                         eh = path[depth].p_hdr;
1994                         nearex = ex;
1995                         goto merge;
1996                 }
1997
1998 prepend:
1999                 /* Try to prepend newext to ex */
2000                 if (ext4_can_extents_be_merged(inode, newext, ex)) {
2001                         ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
2002                                   "(from %llu)\n",
2003                                   le32_to_cpu(newext->ee_block),
2004                                   ext4_ext_is_unwritten(newext),
2005                                   ext4_ext_get_actual_len(newext),
2006                                   le32_to_cpu(ex->ee_block),
2007                                   ext4_ext_is_unwritten(ex),
2008                                   ext4_ext_get_actual_len(ex),
2009                                   ext4_ext_pblock(ex));
2010                         err = ext4_ext_get_access(handle, inode,
2011                                                   path + depth);
2012                         if (err)
2013                                 return err;
2014
2015                         unwritten = ext4_ext_is_unwritten(ex);
2016                         ex->ee_block = newext->ee_block;
2017                         ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2018                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2019                                         + ext4_ext_get_actual_len(newext));
2020                         if (unwritten)
2021                                 ext4_ext_mark_unwritten(ex);
2022                         eh = path[depth].p_hdr;
2023                         nearex = ex;
2024                         goto merge;
2025                 }
2026         }
2027
2028         depth = ext_depth(inode);
2029         eh = path[depth].p_hdr;
2030         if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2031                 goto has_space;
2032
2033         /* probably next leaf has space for us? */
2034         fex = EXT_LAST_EXTENT(eh);
2035         next = EXT_MAX_BLOCKS;
2036         if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2037                 next = ext4_ext_next_leaf_block(path);
2038         if (next != EXT_MAX_BLOCKS) {
2039                 ext_debug("next leaf block - %u\n", next);
2040                 BUG_ON(npath != NULL);
2041                 npath = ext4_find_extent(inode, next, NULL, 0);
2042                 if (IS_ERR(npath))
2043                         return PTR_ERR(npath);
2044                 BUG_ON(npath->p_depth != path->p_depth);
2045                 eh = npath[depth].p_hdr;
2046                 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2047                         ext_debug("next leaf isn't full(%d)\n",
2048                                   le16_to_cpu(eh->eh_entries));
2049                         path = npath;
2050                         goto has_space;
2051                 }
2052                 ext_debug("next leaf has no free space(%d,%d)\n",
2053                           le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2054         }
2055
2056         /*
2057          * There is no free space in the found leaf.
2058          * We're gonna add a new leaf in the tree.
2059          */
2060         if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2061                 mb_flags |= EXT4_MB_USE_RESERVED;
2062         err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2063                                        ppath, newext);
2064         if (err)
2065                 goto cleanup;
2066         depth = ext_depth(inode);
2067         eh = path[depth].p_hdr;
2068
2069 has_space:
2070         nearex = path[depth].p_ext;
2071
2072         err = ext4_ext_get_access(handle, inode, path + depth);
2073         if (err)
2074                 goto cleanup;
2075
2076         if (!nearex) {
2077                 /* there is no extent in this leaf, create first one */
2078                 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2079                                 le32_to_cpu(newext->ee_block),
2080                                 ext4_ext_pblock(newext),
2081                                 ext4_ext_is_unwritten(newext),
2082                                 ext4_ext_get_actual_len(newext));
2083                 nearex = EXT_FIRST_EXTENT(eh);
2084         } else {
2085                 if (le32_to_cpu(newext->ee_block)
2086                            > le32_to_cpu(nearex->ee_block)) {
2087                         /* Insert after */
2088                         ext_debug("insert %u:%llu:[%d]%d before: "
2089                                         "nearest %p\n",
2090                                         le32_to_cpu(newext->ee_block),
2091                                         ext4_ext_pblock(newext),
2092                                         ext4_ext_is_unwritten(newext),
2093                                         ext4_ext_get_actual_len(newext),
2094                                         nearex);
2095                         nearex++;
2096                 } else {
2097                         /* Insert before */
2098                         BUG_ON(newext->ee_block == nearex->ee_block);
2099                         ext_debug("insert %u:%llu:[%d]%d after: "
2100                                         "nearest %p\n",
2101                                         le32_to_cpu(newext->ee_block),
2102                                         ext4_ext_pblock(newext),
2103                                         ext4_ext_is_unwritten(newext),
2104                                         ext4_ext_get_actual_len(newext),
2105                                         nearex);
2106                 }
2107                 len = EXT_LAST_EXTENT(eh) - nearex + 1;
2108                 if (len > 0) {
2109                         ext_debug("insert %u:%llu:[%d]%d: "
2110                                         "move %d extents from 0x%p to 0x%p\n",
2111                                         le32_to_cpu(newext->ee_block),
2112                                         ext4_ext_pblock(newext),
2113                                         ext4_ext_is_unwritten(newext),
2114                                         ext4_ext_get_actual_len(newext),
2115                                         len, nearex, nearex + 1);
2116                         memmove(nearex + 1, nearex,
2117                                 len * sizeof(struct ext4_extent));
2118                 }
2119         }
2120
2121         le16_add_cpu(&eh->eh_entries, 1);
2122         path[depth].p_ext = nearex;
2123         nearex->ee_block = newext->ee_block;
2124         ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2125         nearex->ee_len = newext->ee_len;
2126
2127 merge:
2128         /* try to merge extents */
2129         if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2130                 ext4_ext_try_to_merge(handle, inode, path, nearex);
2131
2132
2133         /* time to correct all indexes above */
2134         err = ext4_ext_correct_indexes(handle, inode, path);
2135         if (err)
2136                 goto cleanup;
2137
2138         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2139
2140 cleanup:
2141         ext4_ext_drop_refs(npath);
2142         kfree(npath);
2143         return err;
2144 }
2145
2146 static int ext4_fill_fiemap_extents(struct inode *inode,
2147                                     ext4_lblk_t block, ext4_lblk_t num,
2148                                     struct fiemap_extent_info *fieinfo)
2149 {
2150         struct ext4_ext_path *path = NULL;
2151         struct ext4_extent *ex;
2152         struct extent_status es;
2153         ext4_lblk_t next, next_del, start = 0, end = 0;
2154         ext4_lblk_t last = block + num;
2155         int exists, depth = 0, err = 0;
2156         unsigned int flags = 0;
2157         unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2158
2159         while (block < last && block != EXT_MAX_BLOCKS) {
2160                 num = last - block;
2161                 /* find extent for this block */
2162                 down_read(&EXT4_I(inode)->i_data_sem);
2163
2164                 path = ext4_find_extent(inode, block, &path, 0);
2165                 if (IS_ERR(path)) {
2166                         up_read(&EXT4_I(inode)->i_data_sem);
2167                         err = PTR_ERR(path);
2168                         path = NULL;
2169                         break;
2170                 }
2171
2172                 depth = ext_depth(inode);
2173                 if (unlikely(path[depth].p_hdr == NULL)) {
2174                         up_read(&EXT4_I(inode)->i_data_sem);
2175                         EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2176                         err = -EFSCORRUPTED;
2177                         break;
2178                 }
2179                 ex = path[depth].p_ext;
2180                 next = ext4_ext_next_allocated_block(path);
2181
2182                 flags = 0;
2183                 exists = 0;
2184                 if (!ex) {
2185                         /* there is no extent yet, so try to allocate
2186                          * all requested space */
2187                         start = block;
2188                         end = block + num;
2189                 } else if (le32_to_cpu(ex->ee_block) > block) {
2190                         /* need to allocate space before found extent */
2191                         start = block;
2192                         end = le32_to_cpu(ex->ee_block);
2193                         if (block + num < end)
2194                                 end = block + num;
2195                 } else if (block >= le32_to_cpu(ex->ee_block)
2196                                         + ext4_ext_get_actual_len(ex)) {
2197                         /* need to allocate space after found extent */
2198                         start = block;
2199                         end = block + num;
2200                         if (end >= next)
2201                                 end = next;
2202                 } else if (block >= le32_to_cpu(ex->ee_block)) {
2203                         /*
2204                          * some part of requested space is covered
2205                          * by found extent
2206                          */
2207                         start = block;
2208                         end = le32_to_cpu(ex->ee_block)
2209                                 + ext4_ext_get_actual_len(ex);
2210                         if (block + num < end)
2211                                 end = block + num;
2212                         exists = 1;
2213                 } else {
2214                         BUG();
2215                 }
2216                 BUG_ON(end <= start);
2217
2218                 if (!exists) {
2219                         es.es_lblk = start;
2220                         es.es_len = end - start;
2221                         es.es_pblk = 0;
2222                 } else {
2223                         es.es_lblk = le32_to_cpu(ex->ee_block);
2224                         es.es_len = ext4_ext_get_actual_len(ex);
2225                         es.es_pblk = ext4_ext_pblock(ex);
2226                         if (ext4_ext_is_unwritten(ex))
2227                                 flags |= FIEMAP_EXTENT_UNWRITTEN;
2228                 }
2229
2230                 /*
2231                  * Find delayed extent and update es accordingly. We call
2232                  * it even in !exists case to find out whether es is the
2233                  * last existing extent or not.
2234                  */
2235                 next_del = ext4_find_delayed_extent(inode, &es);
2236                 if (!exists && next_del) {
2237                         exists = 1;
2238                         flags |= (FIEMAP_EXTENT_DELALLOC |
2239                                   FIEMAP_EXTENT_UNKNOWN);
2240                 }
2241                 up_read(&EXT4_I(inode)->i_data_sem);
2242
2243                 if (unlikely(es.es_len == 0)) {
2244                         EXT4_ERROR_INODE(inode, "es.es_len == 0");
2245                         err = -EFSCORRUPTED;
2246                         break;
2247                 }
2248
2249                 /*
2250                  * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2251                  * We need to check next == EXT_MAX_BLOCKS because an
2252                  * extent can carry both unwritten and delayed status:
2253                  * when a delayed allocated extent is later allocated by
2254                  * fallocate, the extent status tree will track both
2255                  * states in a single extent.
2256                  *
2257                  * So we could return an unwritten and delayed extent
2258                  * whose block is equal to 'next'.
2259                  */
2260                 if (next == next_del && next == EXT_MAX_BLOCKS) {
2261                         flags |= FIEMAP_EXTENT_LAST;
2262                         if (unlikely(next_del != EXT_MAX_BLOCKS ||
2263                                      next != EXT_MAX_BLOCKS)) {
2264                                 EXT4_ERROR_INODE(inode,
2265                                                  "next extent == %u, next "
2266                                                  "delalloc extent = %u",
2267                                                  next, next_del);
2268                                 err = -EFSCORRUPTED;
2269                                 break;
2270                         }
2271                 }
2272
2273                 if (exists) {
2274                         err = fiemap_fill_next_extent(fieinfo,
2275                                 (__u64)es.es_lblk << blksize_bits,
2276                                 (__u64)es.es_pblk << blksize_bits,
2277                                 (__u64)es.es_len << blksize_bits,
2278                                 flags);
2279                         if (err < 0)
2280                                 break;
2281                         if (err == 1) {
2282                                 err = 0;
2283                                 break;
2284                         }
2285                 }
2286
2287                 block = es.es_lblk + es.es_len;
2288         }
2289
2290         ext4_ext_drop_refs(path);
2291         kfree(path);
2292         return err;
2293 }
2294
2295 /*
2296  * ext4_ext_put_gap_in_cache:
2297  * calculate boundaries of the gap that the requested block fits into
2298  * and cache this gap
2299  */
2300 static void
2301 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2302                                 ext4_lblk_t block)
2303 {
2304         int depth = ext_depth(inode);
2305         ext4_lblk_t len;
2306         ext4_lblk_t lblock;
2307         struct ext4_extent *ex;
2308         struct extent_status es;
2309
2310         ex = path[depth].p_ext;
2311         if (ex == NULL) {
2312                 /* there is no extent yet, so gap is [0;-] */
2313                 lblock = 0;
2314                 len = EXT_MAX_BLOCKS;
2315                 ext_debug("cache gap(whole file):");
2316         } else if (block < le32_to_cpu(ex->ee_block)) {
2317                 lblock = block;
2318                 len = le32_to_cpu(ex->ee_block) - block;
2319                 ext_debug("cache gap(before): %u [%u:%u]",
2320                                 block,
2321                                 le32_to_cpu(ex->ee_block),
2322                                  ext4_ext_get_actual_len(ex));
2323         } else if (block >= le32_to_cpu(ex->ee_block)
2324                         + ext4_ext_get_actual_len(ex)) {
2325                 ext4_lblk_t next;
2326                 lblock = le32_to_cpu(ex->ee_block)
2327                         + ext4_ext_get_actual_len(ex);
2328
2329                 next = ext4_ext_next_allocated_block(path);
2330                 ext_debug("cache gap(after): [%u:%u] %u",
2331                                 le32_to_cpu(ex->ee_block),
2332                                 ext4_ext_get_actual_len(ex),
2333                                 block);
2334                 BUG_ON(next == lblock);
2335                 len = next - lblock;
2336         } else {
2337                 BUG();
2338         }
2339
2340         ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
2341         if (es.es_len) {
2342                 /* Is there a delayed extent containing lblock? */
2343                 if (es.es_lblk <= lblock)
2344                         return;
2345                 len = min(es.es_lblk - lblock, len);
2346         }
2347         ext_debug(" -> %u:%u\n", lblock, len);
2348         ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
2349 }
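/*
 * Illustrative example (editor's note; hypothetical numbers): if the
 * lookup for block 500 lands on an extent covering blocks 100..199 and
 * the next allocated block is 800, the gap computed above is
 * lblock = 200, len = 600, i.e. the hole [200, 799].  The gap is
 * shortened, or not cached at all, if a delayed extent overlaps that
 * range.
 */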
2350
2351 /*
2352  * ext4_ext_rm_idx:
2353  * removes index from the index block.
2354  */
2355 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2356                         struct ext4_ext_path *path, int depth)
2357 {
2358         int err;
2359         ext4_fsblk_t leaf;
2360
2361         /* free index block */
2362         depth--;
2363         path = path + depth;
2364         leaf = ext4_idx_pblock(path->p_idx);
2365         if (unlikely(path->p_hdr->eh_entries == 0)) {
2366                 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2367                 return -EFSCORRUPTED;
2368         }
2369         err = ext4_ext_get_access(handle, inode, path);
2370         if (err)
2371                 return err;
2372
2373         if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2374                 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2375                 len *= sizeof(struct ext4_extent_idx);
2376                 memmove(path->p_idx, path->p_idx + 1, len);
2377         }
2378
2379         le16_add_cpu(&path->p_hdr->eh_entries, -1);
2380         err = ext4_ext_dirty(handle, inode, path);
2381         if (err)
2382                 return err;
2383         ext_debug("index is empty, remove it, free block %llu\n", leaf);
2384         trace_ext4_ext_rm_idx(inode, leaf);
2385
2386         ext4_free_blocks(handle, inode, NULL, leaf, 1,
2387                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2388
2389         while (--depth >= 0) {
2390                 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2391                         break;
2392                 path--;
2393                 err = ext4_ext_get_access(handle, inode, path);
2394                 if (err)
2395                         break;
2396                 path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2397                 err = ext4_ext_dirty(handle, inode, path);
2398                 if (err)
2399                         break;
2400         }
2401         return err;
2402 }
2403
2404 /*
2405  * ext4_ext_calc_credits_for_single_extent:
2406  * This routine returns the max. credits needed to insert an extent
2407  * into the extent tree.
2408  * When passing the actual path, the caller should calculate the
2409  * credits under i_data_sem.
2410  */
2411 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2412                                                 struct ext4_ext_path *path)
2413 {
2414         if (path) {
2415                 int depth = ext_depth(inode);
2416                 int ret = 0;
2417
2418                 /* probably there is space in leaf? */
2419                 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2420                                 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2421
2422                         /*
2423                          *  There is some space in the leaf, so no
2424                          *  need to account for a leaf block credit.
2425                          *
2426                          *  Bitmaps, block group descriptor blocks
2427                          *  and other metadata blocks still need to
2428                          *  be accounted for.
2429                          */
2430                         /* 1 bitmap, 1 block group descriptor */
2431                         ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2432                         return ret;
2433                 }
2434         }
2435
2436         return ext4_chunk_trans_blocks(inode, nrblocks);
2437 }
2438
2439 /*
2440  * How many index/leaf blocks need to change/allocate to add @extents extents?
2441  *
2442  * If we add a single extent, then in the worse case, each tree level
2443  * If we add a single extent, then in the worst case each tree level's
2444  * index/leaf needs to be changed in case the tree splits.
2445  * If more extents are inserted, they could cause the whole tree split more
2446  * than once, but this is really rare.
2447  */
2448 int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2449 {
2450         int index;
2451         int depth;
2452
2453         /* If we are converting the inline data, only one is needed here. */
2454         if (ext4_has_inline_data(inode))
2455                 return 1;
2456
2457         depth = ext_depth(inode);
2458
2459         if (extents <= 1)
2460                 index = depth * 2;
2461         else
2462                 index = depth * 3;
2463
2464         return index;
2465 }
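/*
 * Illustrative example (editor's note, not part of the original source):
 * for an inode whose extent tree currently has depth 2, inserting a
 * single extent is charged 2 * 2 = 4 index/leaf blocks by the routine
 * above, while inserting more than one extent is charged 3 * 2 = 6
 * blocks to cover the (rare) possibility of repeated splits.
 */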
2466
2467 static inline int get_default_free_blocks_flags(struct inode *inode)
2468 {
2469         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2470                 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2471         else if (ext4_should_journal_data(inode))
2472                 return EXT4_FREE_BLOCKS_FORGET;
2473         return 0;
2474 }
2475
2476 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2477                               struct ext4_extent *ex,
2478                               long long *partial_cluster,
2479                               ext4_lblk_t from, ext4_lblk_t to)
2480 {
2481         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2482         unsigned short ee_len = ext4_ext_get_actual_len(ex);
2483         ext4_fsblk_t pblk;
2484         int flags = get_default_free_blocks_flags(inode);
2485
2486         /*
2487          * For bigalloc file systems, we never free a partial cluster
2488          * at the beginning of the extent.  Instead, we make a note
2489          * that we tried freeing the cluster, and check to see if we
2490          * need to free it on a subsequent call to ext4_remove_blocks,
2491          * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2492          */
2493         flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
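        /*
         * Editor's note (illustrative, hypothetical numbers): with a
         * cluster ratio of 16, an extent whose first block is 35 starts
         * in the middle of cluster 2 (blocks 32..47); that leading
         * partial cluster is never freed here directly
         * (EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER); whether it can be
         * freed is decided later via the *partial_cluster bookkeeping.
         */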
2494
2495         trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2496         /*
2497          * If we have a partial cluster, and it's different from the
2498          * cluster of the last block, we need to explicitly free the
2499          * partial cluster here.
2500          */
2501         pblk = ext4_ext_pblock(ex) + ee_len - 1;
2502         if (*partial_cluster > 0 &&
2503             *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2504                 ext4_free_blocks(handle, inode, NULL,
2505                                  EXT4_C2B(sbi, *partial_cluster),
2506                                  sbi->s_cluster_ratio, flags);
2507                 *partial_cluster = 0;
2508         }
2509
2510 #ifdef EXTENTS_STATS
2511         {
2512                 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2513                 spin_lock(&sbi->s_ext_stats_lock);
2514                 sbi->s_ext_blocks += ee_len;
2515                 sbi->s_ext_extents++;
2516                 if (ee_len < sbi->s_ext_min)
2517                         sbi->s_ext_min = ee_len;
2518                 if (ee_len > sbi->s_ext_max)
2519                         sbi->s_ext_max = ee_len;
2520                 if (ext_depth(inode) > sbi->s_depth_max)
2521                         sbi->s_depth_max = ext_depth(inode);
2522                 spin_unlock(&sbi->s_ext_stats_lock);
2523         }
2524 #endif
2525         if (from >= le32_to_cpu(ex->ee_block)
2526             && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2527                 /* tail removal */
2528                 ext4_lblk_t num;
2529                 long long first_cluster;
2530
2531                 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2532                 pblk = ext4_ext_pblock(ex) + ee_len - num;
2533                 /*
2534                  * Usually we want to free the partial cluster at the end of
2535                  * the extent, except when the cluster is still used by
2536                  * another extent (partial_cluster is negative).
2537                  */
2538                 if (*partial_cluster < 0 &&
2539                     *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
2540                         flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2541
2542                 ext_debug("free last %u blocks starting %llu partial %lld\n",
2543                           num, pblk, *partial_cluster);
2544                 ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2545                 /*
2546                  * If the block range to be freed didn't start at the
2547                  * beginning of a cluster, and we removed the entire
2548                  * extent and the cluster is not used by any other extent,
2549                  * save the partial cluster here, since we might need to
2550                  * delete if we determine that the truncate or punch hole
2551                  * operation has removed all of the blocks in the cluster.
2552                  * If that cluster is used by another extent, preserve its
2553                  * negative value so it isn't freed later on.
2554                  *
2555                  * If the whole extent wasn't freed, we've reached the
2556                  * start of the truncated/punched region and have finished
2557                  * removing blocks.  If there's a partial cluster here it's
2558                  * shared with the remainder of the extent and is no longer
2559                  * a candidate for removal.
2560                  */
2561                 if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
2562                         first_cluster = (long long) EXT4_B2C(sbi, pblk);
2563                         if (first_cluster != -*partial_cluster)
2564                                 *partial_cluster = first_cluster;
2565                 } else {
2566                         *partial_cluster = 0;
2567                 }
2568         } else
2569                 ext4_error(sbi->s_sb, "strange request: removal(2) "
2570                            "%u-%u from %u:%u\n",
2571                            from, to, le32_to_cpu(ex->ee_block), ee_len);
2572         return 0;
2573 }
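/*
 * Illustrative example (editor's note; hypothetical numbers): for a tail
 * removal from an extent mapping blocks 100..109 with from = 105 and
 * to = 109, the code above computes num = 100 + 10 - 105 = 5 and frees
 * the 5 blocks starting at ext4_ext_pblock(ex) + 5; any partial-cluster
 * bookkeeping for bigalloc is handled via *partial_cluster as described
 * in the comments above.
 */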
2574
2575
2576 /*
2577  * ext4_ext_rm_leaf() Removes the extents associated with the
2578  * blocks appearing between "start" and "end".  Both "start"
2579  * and "end" must appear in the same extent or EFSCORRUPTED is returned.
2580  *
2581  * @handle: The journal handle
2582  * @inode:  The file's inode
2583  * @path:   The path to the leaf
2584  * @partial_cluster: The cluster which we'll have to free if all extents
2585  *                   have been released from it.  However, if this value is
2586  *                   negative, it's a cluster just to the right of the
2587  *                   punched region and it must not be freed.
2588  * @start:  The first block to remove
2589  * @end:   The last block to remove
2590  */
2591 static int
2592 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2593                  struct ext4_ext_path *path,
2594                  long long *partial_cluster,
2595                  ext4_lblk_t start, ext4_lblk_t end)
2596 {
2597         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2598         int err = 0, correct_index = 0;
2599         int depth = ext_depth(inode), credits;
2600         struct ext4_extent_header *eh;
2601         ext4_lblk_t a, b;
2602         unsigned num;
2603         ext4_lblk_t ex_ee_block;
2604         unsigned short ex_ee_len;
2605         unsigned unwritten = 0;
2606         struct ext4_extent *ex;
2607         ext4_fsblk_t pblk;
2608
2609         /* the header must have been checked already in ext4_ext_remove_space() */
2610         ext_debug("truncate since %u in leaf to %u\n", start, end);
2611         if (!path[depth].p_hdr)
2612                 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2613         eh = path[depth].p_hdr;
2614         if (unlikely(path[depth].p_hdr == NULL)) {
2615                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2616                 return -EFSCORRUPTED;
2617         }
2618         /* find where to start removing */
2619         ex = path[depth].p_ext;
2620         if (!ex)
2621                 ex = EXT_LAST_EXTENT(eh);
2622
2623         ex_ee_block = le32_to_cpu(ex->ee_block);
2624         ex_ee_len = ext4_ext_get_actual_len(ex);
2625
2626         trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2627
2628         while (ex >= EXT_FIRST_EXTENT(eh) &&
2629                         ex_ee_block + ex_ee_len > start) {
2630
2631                 if (ext4_ext_is_unwritten(ex))
2632                         unwritten = 1;
2633                 else
2634                         unwritten = 0;
2635
2636                 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2637                           unwritten, ex_ee_len);
2638                 path[depth].p_ext = ex;
2639
2640                 a = ex_ee_block > start ? ex_ee_block : start;
2641                 b = ex_ee_block+ex_ee_len - 1 < end ?
2642                         ex_ee_block+ex_ee_len - 1 : end;
2643
2644                 ext_debug("  border %u:%u\n", a, b);
2645
2646                 /* If this extent is beyond the end of the hole, skip it */
2647                 if (end < ex_ee_block) {
2648                         /*
2649                          * We're going to skip this extent and move to another,
2650                          * so note that its first cluster is in use to avoid
2651                          * freeing it when removing blocks.  Eventually, the
2652                          * right edge of the truncated/punched region will
2653                          * be just to the left.
2654                          */
2655                         if (sbi->s_cluster_ratio > 1) {
2656                                 pblk = ext4_ext_pblock(ex);
2657                                 *partial_cluster =
2658                                         -(long long) EXT4_B2C(sbi, pblk);
2659                         }
2660                         ex--;
2661                         ex_ee_block = le32_to_cpu(ex->ee_block);
2662                         ex_ee_len = ext4_ext_get_actual_len(ex);
2663                         continue;
2664                 } else if (b != ex_ee_block + ex_ee_len - 1) {
2665                         EXT4_ERROR_INODE(inode,
2666                                          "can not handle truncate %u:%u "
2667                                          "on extent %u:%u",
2668                                          start, end, ex_ee_block,
2669                                          ex_ee_block + ex_ee_len - 1);
2670                         err = -EFSCORRUPTED;
2671                         goto out;
2672                 } else if (a != ex_ee_block) {
2673                         /* remove tail of the extent */
2674                         num = a - ex_ee_block;
2675                 } else {
2676                         /* remove whole extent: excellent! */
2677                         num = 0;
2678                 }
2679                 /*
2680                  * 3 for leaf, sb, and inode plus 2 (bmap and group
2681                  * descriptor) for each block group; assume two block
2682                  * groups plus ex_ee_len/blocks_per_block_group for
2683                  * the worst case
2684                  */
2685                 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2686                 if (ex == EXT_FIRST_EXTENT(eh)) {
2687                         correct_index = 1;
2688                         credits += (ext_depth(inode)) + 1;
2689                 }
2690                 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2691
2692                 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2693                 if (err)
2694                         goto out;
2695
2696                 err = ext4_ext_get_access(handle, inode, path + depth);
2697                 if (err)
2698                         goto out;
2699
2700                 err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2701                                          a, b);
2702                 if (err)
2703                         goto out;
2704
2705                 if (num == 0)
2706                         /* this extent is removed; mark slot entirely unused */
2707                         ext4_ext_store_pblock(ex, 0);
2708
2709                 ex->ee_len = cpu_to_le16(num);
2710                 /*
2711                  * Do not mark unwritten if all the blocks in the
2712                  * extent have been removed.
2713                  */
2714                 if (unwritten && num)
2715                         ext4_ext_mark_unwritten(ex);
2716                 /*
2717                  * If the extent was completely released,
2718                  * we need to remove it from the leaf
2719                  */
2720                 if (num == 0) {
2721                         if (end != EXT_MAX_BLOCKS - 1) {
2722                                 /*
2723                                  * For hole punching, we need to scoot all the
2724                                  * extents up when an extent is removed so that
2725                                  * we don't have blank extents in the middle
2726                                  */
2727                                 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2728                                         sizeof(struct ext4_extent));
2729
2730                                 /* Now get rid of the one at the end */
2731                                 memset(EXT_LAST_EXTENT(eh), 0,
2732                                         sizeof(struct ext4_extent));
2733                         }
2734                         le16_add_cpu(&eh->eh_entries, -1);
2735                 }
2736
2737                 err = ext4_ext_dirty(handle, inode, path + depth);
2738                 if (err)
2739                         goto out;
2740
2741                 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2742                                 ext4_ext_pblock(ex));
2743                 ex--;
2744                 ex_ee_block = le32_to_cpu(ex->ee_block);
2745                 ex_ee_len = ext4_ext_get_actual_len(ex);
2746         }
2747
2748         if (correct_index && eh->eh_entries)
2749                 err = ext4_ext_correct_indexes(handle, inode, path);
2750
2751         /*
2752          * If there's a partial cluster and at least one extent remains in
2753          * the leaf, free the partial cluster if it isn't shared with the
2754          * current extent.  If it is shared with the current extent
2755          * we zero partial_cluster because we've reached the start of the
2756          * truncated/punched region and we're done removing blocks.
2757          */
2758         if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
2759                 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2760                 if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2761                         ext4_free_blocks(handle, inode, NULL,
2762                                          EXT4_C2B(sbi, *partial_cluster),
2763                                          sbi->s_cluster_ratio,
2764                                          get_default_free_blocks_flags(inode));
2765                 }
2766                 *partial_cluster = 0;
2767         }
2768
2769         /* if this leaf is now empty, then we should
2770          * remove it from the index block above */
2771         if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2772                 err = ext4_ext_rm_idx(handle, inode, path, depth);
2773
2774 out:
2775         return err;
2776 }
2777
2778 /*
2779  * ext4_ext_more_to_rm:
2780  * returns 1 if current index has to be freed (even partial)
2781  */
2782 static int
2783 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2784 {
2785         BUG_ON(path->p_idx == NULL);
2786
2787         if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2788                 return 0;
2789
2790         /*
2791          * if truncation at a deeper level happened, it wasn't partial,
2792          * so we have to consider the current index for truncation
2793          */
2794         if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2795                 return 0;
2796         return 1;
2797 }
2798
2799 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2800                           ext4_lblk_t end)
2801 {
2802         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2803         int depth = ext_depth(inode);
2804         struct ext4_ext_path *path = NULL;
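        /*
         * partial_cluster tracks a cluster straddling the removal boundary:
         * a positive value is a candidate for freeing once we know it is
         * not shared, a negative value marks a cluster that is still in
         * use and must not be freed, and zero means none.
         */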
2805         long long partial_cluster = 0;
2806         handle_t *handle;
2807         int i = 0, err = 0;
2808
2809         ext_debug("truncate since %u to %u\n", start, end);
2810
2811         /* probably the first extent we're going to free will be the last in its block */
2812         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
2813         if (IS_ERR(handle))
2814                 return PTR_ERR(handle);
2815
2816 again:
2817         trace_ext4_ext_remove_space(inode, start, end, depth);
2818
2819         /*
2820          * Check if we are removing extents inside the extent tree. If that
2821          * is the case, we are going to punch a hole inside the extent tree
2822          * so we have to check whether we need to split the extent covering
2823          * the last block to remove so we can easily remove the part of it
2824          * in ext4_ext_rm_leaf().
2825          */
2826         if (end < EXT_MAX_BLOCKS - 1) {
2827                 struct ext4_extent *ex;
2828                 ext4_lblk_t ee_block, ex_end, lblk;
2829                 ext4_fsblk_t pblk;
2830
2831                 /* find extent for or closest extent to this block */
2832                 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2833                 if (IS_ERR(path)) {
2834                         ext4_journal_stop(handle);
2835                         return PTR_ERR(path);
2836                 }
2837                 depth = ext_depth(inode);
2838                 /* Leaf may not exist only if inode has no blocks at all */
2839                 ex = path[depth].p_ext;
2840                 if (!ex) {
2841                         if (depth) {
2842                                 EXT4_ERROR_INODE(inode,
2843                                                  "path[%d].p_hdr == NULL",
2844                                                  depth);
2845                                 err = -EFSCORRUPTED;
2846                         }
2847                         goto out;
2848                 }
2849
2850                 ee_block = le32_to_cpu(ex->ee_block);
2851                 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2852
2853                 /*
2854                  * See if the last block is inside the extent, if so split
2855                  * the extent at 'end' block so we can easily remove the
2856                  * tail of the first part of the split extent in
2857                  * ext4_ext_rm_leaf().
2858                  */
2859                 if (end >= ee_block && end < ex_end) {
2860
2861                         /*
2862                          * If we're going to split the extent, note that
2863                          * the cluster containing the block after 'end' is
2864                          * in use to avoid freeing it when removing blocks.
2865                          */
2866                         if (sbi->s_cluster_ratio > 1) {
2867                                 pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
2868                                 partial_cluster =
2869                                         -(long long) EXT4_B2C(sbi, pblk);
2870                         }
2871
2872                         /*
2873                          * Split the extent in two so that 'end' is the last
2874                          * block in the first new extent. Also we should not
2875                          * fail removing space due to ENOSPC so try to use
2876                          * reserved block if that happens.
2877                          */
2878                         err = ext4_force_split_extent_at(handle, inode, &path,
2879                                                          end + 1, 1);
2880                         if (err < 0)
2881                                 goto out;
2882
2883                 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
2884                         /*
2885                          * If there's an extent to the right its first cluster
2886                          * contains the immediate right boundary of the
2887                          * truncated/punched region.  Set partial_cluster to
2888                          * its negative value so it won't be freed if shared
2889                          * with the current extent.  The end < ee_block case
2890                          * is handled in ext4_ext_rm_leaf().
2891                          */
2892                         lblk = ex_end + 1;
2893                         err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2894                                                     &ex);
2895                         if (err)
2896                                 goto out;
2897                         if (pblk)
2898                                 partial_cluster =
2899                                         -(long long) EXT4_B2C(sbi, pblk);
2900                 }
2901         }
2902         /*
2903          * We start scanning from the right side, freeing all the blocks
2904          * after i_size and walking into the tree depth-wise.
2905          */
2906         depth = ext_depth(inode);
2907         if (path) {
2908                 int k = i = depth;
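                /*
                 * Flag each interior level of the reused path as not yet
                 * processed: eh_entries can never equal entries + 1, so
                 * ext4_ext_more_to_rm() will descend through these levels.
                 */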
2909                 while (--k > 0)
2910                         path[k].p_block =
2911                                 le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2912         } else {
2913                 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2914                                GFP_NOFS);
2915                 if (path == NULL) {
2916                         ext4_journal_stop(handle);
2917                         return -ENOMEM;
2918                 }
2919                 path[0].p_maxdepth = path[0].p_depth = depth;
2920                 path[0].p_hdr = ext_inode_hdr(inode);
2921                 i = 0;
2922
2923                 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2924                         err = -EFSCORRUPTED;
2925                         goto out;
2926                 }
2927         }
2928         err = 0;
2929
2930         while (i >= 0 && err == 0) {
2931                 if (i == depth) {
2932                         /* this is leaf block */
2933                         err = ext4_ext_rm_leaf(handle, inode, path,
2934                                                &partial_cluster, start,
2935                                                end);
2936                         /* root level has p_bh == NULL, brelse() eats this */
2937                         brelse(path[i].p_bh);
2938                         path[i].p_bh = NULL;
2939                         i--;
2940                         continue;
2941                 }
2942
2943                 /* this is index block */
2944                 if (!path[i].p_hdr) {
2945                         ext_debug("initialize header\n");
2946                         path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2947                 }
2948
2949                 if (!path[i].p_idx) {
2950                         /* this level hasn't been touched yet */
2951                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2952                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2953                         ext_debug("init index ptr: hdr 0x%p, num %d\n",
2954                                   path[i].p_hdr,
2955                                   le16_to_cpu(path[i].p_hdr->eh_entries));
2956                 } else {
2957                         /* we were already here, look at the next index */
2958                         path[i].p_idx--;
2959                 }
2960
2961                 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2962                                 i, EXT_FIRST_INDEX(path[i].p_hdr),
2963                                 path[i].p_idx);
2964                 if (ext4_ext_more_to_rm(path + i)) {
2965                         struct buffer_head *bh;
2966                         /* go to the next level */
2967                         ext_debug("move to level %d (block %llu)\n",
2968                                   i + 1, ext4_idx_pblock(path[i].p_idx));
2969                         memset(path + i + 1, 0, sizeof(*path));
2970                         bh = read_extent_tree_block(inode,
2971                                 ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2972                                 EXT4_EX_NOCACHE);
2973                         if (IS_ERR(bh)) {
2974                                 /* should we reset i_size? */
2975                                 err = PTR_ERR(bh);
2976                                 break;
2977                         }
2978                         /* Yield here to deal with large extent trees.
2979                          * Should be a no-op if we did IO above. */
2980                         cond_resched();
2981                         if (WARN_ON(i + 1 > depth)) {
2982                                 err = -EFSCORRUPTED;
2983                                 break;
2984                         }
2985                         path[i + 1].p_bh = bh;
2986
2987                         /* save actual number of indexes since this
2988                          * number is changed at the next iteration */
2989                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2990                         i++;
2991                 } else {
2992                         /* we finished processing this index, go up */
2993                         if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2994                                 /* index is empty, remove it;
2995                                  * handle must already be prepared by the
2996                                  * leaf truncation code */
2997                                 err = ext4_ext_rm_idx(handle, inode, path, i);
2998                         }
2999                         /* root level has p_bh == NULL, brelse() eats this */
3000                         brelse(path[i].p_bh);
3001                         path[i].p_bh = NULL;
3002                         i--;
3003                         ext_debug("return to level %d\n", i);
3004                 }
3005         }
3006
3007         trace_ext4_ext_remove_space_done(inode, start, end, depth,
3008                         partial_cluster, path->p_hdr->eh_entries);
3009
3010         /*
3011          * If we still have something in the partial cluster and we have removed
3012          * even the first extent, then we should free the blocks in the partial
3013          * cluster as well.  (This code will only run when there are no leaves
3014          * to the immediate left of the truncated/punched region.)
3015          */
3016         if (partial_cluster > 0 && err == 0) {
3017                 /* don't zero partial_cluster since it's not used afterwards */
3018                 ext4_free_blocks(handle, inode, NULL,
3019                                  EXT4_C2B(sbi, partial_cluster),
3020                                  sbi->s_cluster_ratio,
3021                                  get_default_free_blocks_flags(inode));
3022         }
3023
3024         /* TODO: flexible tree reduction should be here */
3025         if (path->p_hdr->eh_entries == 0) {
3026                 /*
3027                  * truncate to zero freed all the tree,
3028                  * so we need to correct eh_depth
3029                  */
3030                 err = ext4_ext_get_access(handle, inode, path);
3031                 if (err == 0) {
3032                         ext_inode_hdr(inode)->eh_depth = 0;
3033                         ext_inode_hdr(inode)->eh_max =
3034                                 cpu_to_le16(ext4_ext_space_root(inode, 0));
3035                         err = ext4_ext_dirty(handle, inode, path);
3036                 }
3037         }
3038 out:
3039         ext4_ext_drop_refs(path);
3040         kfree(path);
3041         path = NULL;
3042         if (err == -EAGAIN)
3043                 goto again;
3044         ext4_journal_stop(handle);
3045
3046         return err;
3047 }
3048
3049 /*
3050  * called at mount time
3051  */
3052 void ext4_ext_init(struct super_block *sb)
3053 {
3054         /*
3055          * possible initialization would be here
3056          */
3057
3058         if (ext4_has_feature_extents(sb)) {
3059 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3060                 printk(KERN_INFO "EXT4-fs: file extents enabled"
3061 #ifdef AGGRESSIVE_TEST
3062                        ", aggressive tests"
3063 #endif
3064 #ifdef CHECK_BINSEARCH
3065                        ", check binsearch"
3066 #endif
3067 #ifdef EXTENTS_STATS
3068                        ", stats"
3069 #endif
3070                        "\n");
3071 #endif
3072 #ifdef EXTENTS_STATS
3073                 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3074                 EXT4_SB(sb)->s_ext_min = 1 << 30;
3075                 EXT4_SB(sb)->s_ext_max = 0;
3076 #endif
3077         }
3078 }
3079
3080 /*
3081  * called at umount time
3082  */
3083 void ext4_ext_release(struct super_block *sb)
3084 {
3085         if (!ext4_has_feature_extents(sb))
3086                 return;
3087
3088 #ifdef EXTENTS_STATS
3089         if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3090                 struct ext4_sb_info *sbi = EXT4_SB(sb);
3091                 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3092                         sbi->s_ext_blocks, sbi->s_ext_extents,
3093                         sbi->s_ext_blocks / sbi->s_ext_extents);
3094                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3095                         sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3096         }
3097 #endif
3098 }
3099
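/*
 * Record a freshly zeroed-out extent as written in the extent status
 * tree so that later lookups treat it as initialized data.
 */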
3100 static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3101 {
3102         ext4_lblk_t  ee_block;
3103         ext4_fsblk_t ee_pblock;
3104         unsigned int ee_len;
3105
3106         ee_block  = le32_to_cpu(ex->ee_block);
3107         ee_len    = ext4_ext_get_actual_len(ex);
3108         ee_pblock = ext4_ext_pblock(ex);
3109
3110         if (ee_len == 0)
3111                 return 0;
3112
3113         return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3114                                      EXTENT_STATUS_WRITTEN);
3115 }
3116
3117 /* FIXME!! we need to try to merge to left or right after zero-out  */
3118 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3119 {
3120         ext4_fsblk_t ee_pblock;
3121         unsigned int ee_len;
3122         int ret;
3123
3124         ee_len    = ext4_ext_get_actual_len(ex);
3125         ee_pblock = ext4_ext_pblock(ex);
3126
3127         if (ext4_encrypted_inode(inode))
3128                 return ext4_encrypted_zeroout(inode, ex);
3129
3130         ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
3131         if (ret > 0)
3132                 ret = 0;
3133
3134         return ret;
3135 }
3136
3137 /*
3138  * ext4_split_extent_at() splits an extent at given block.
3139  *
3140  * @handle: the journal handle
3141  * @inode: the file inode
3142  * @path: the path to the extent
3143  * @split: the logical block where the extent is split.
3144  * @split_flag: indicates if the extent could be zeroed out if the split
3145  *              fails, and the states (init or unwritten) of the new extents.
3146  * @flags: flags used to insert new extent to extent tree.
3147  *
3149  * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
3150  * states of which are determined by split_flag.
3151  *
3152  * There are two cases:
3153  *  a> the extent is split into two extents.
3154  *  b> no split is needed, and the extent is just marked.
3155  *
3156  * return 0 on success.
3157  */
3158 static int ext4_split_extent_at(handle_t *handle,
3159                              struct inode *inode,
3160                              struct ext4_ext_path **ppath,
3161                              ext4_lblk_t split,
3162                              int split_flag,
3163                              int flags)
3164 {
3165         struct ext4_ext_path *path = *ppath;
3166         ext4_fsblk_t newblock;
3167         ext4_lblk_t ee_block;
3168         struct ext4_extent *ex, newex, orig_ex, zero_ex;
3169         struct ext4_extent *ex2 = NULL;
3170         unsigned int ee_len, depth;
3171         int err = 0;
3172
3173         BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3174                (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3175
3176         ext_debug("ext4_split_extent_at: inode %lu, logical "
3177                 "block %llu\n", inode->i_ino, (unsigned long long)split);
3178
3179         ext4_ext_show_leaf(inode, path);
3180
3181         depth = ext_depth(inode);
3182         ex = path[depth].p_ext;
3183         ee_block = le32_to_cpu(ex->ee_block);
3184         ee_len = ext4_ext_get_actual_len(ex);
3185         newblock = split - ee_block + ext4_ext_pblock(ex);
3186
3187         BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3188         BUG_ON(!ext4_ext_is_unwritten(ex) &&
3189                split_flag & (EXT4_EXT_MAY_ZEROOUT |
3190                              EXT4_EXT_MARK_UNWRIT1 |
3191                              EXT4_EXT_MARK_UNWRIT2));
3192
3193         err = ext4_ext_get_access(handle, inode, path + depth);
3194         if (err)
3195                 goto out;
3196
3197         if (split == ee_block) {
3198                 /*
3199                  * case b: block @split is the block that the extent begins with,
3200                  * so we just change the state of the extent, and splitting
3201                  * is not needed.
3202                  */
3203                 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3204                         ext4_ext_mark_unwritten(ex);
3205                 else
3206                         ext4_ext_mark_initialized(ex);
3207
3208                 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3209                         ext4_ext_try_to_merge(handle, inode, path, ex);
3210
3211                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3212                 goto out;
3213         }
3214
3215         /* case a */
3216         memcpy(&orig_ex, ex, sizeof(orig_ex));
3217         ex->ee_len = cpu_to_le16(split - ee_block);
3218         if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3219                 ext4_ext_mark_unwritten(ex);
3220
3221         /*
3222          * the path may lead to a new leaf, not to the original leaf
3223          * any more, after ext4_ext_insert_extent() returns
3224          */
3225         err = ext4_ext_dirty(handle, inode, path + depth);
3226         if (err)
3227                 goto fix_extent_len;
3228
3229         ex2 = &newex;
3230         ex2->ee_block = cpu_to_le32(split);
3231         ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3232         ext4_ext_store_pblock(ex2, newblock);
3233         if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3234                 ext4_ext_mark_unwritten(ex2);
3235
3236         err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
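        /*
         * If inserting the second half failed only because we ran out of
         * space and the caller allows it, fall back to zeroing out the
         * relevant range on disk and restoring the original extent length,
         * so the split degrades to an in-place zeroout instead of failing.
         */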
3237         if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3238                 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3239                         if (split_flag & EXT4_EXT_DATA_VALID1) {
3240                                 err = ext4_ext_zeroout(inode, ex2);
3241                                 zero_ex.ee_block = ex2->ee_block;
3242                                 zero_ex.ee_len = cpu_to_le16(
3243                                                 ext4_ext_get_actual_len(ex2));
3244                                 ext4_ext_store_pblock(&zero_ex,
3245                                                       ext4_ext_pblock(ex2));
3246                         } else {
3247                                 err = ext4_ext_zeroout(inode, ex);
3248                                 zero_ex.ee_block = ex->ee_block;
3249                                 zero_ex.ee_len = cpu_to_le16(
3250                                                 ext4_ext_get_actual_len(ex));
3251                                 ext4_ext_store_pblock(&zero_ex,
3252                                                       ext4_ext_pblock(ex));
3253                         }
3254                 } else {
3255                         err = ext4_ext_zeroout(inode, &orig_ex);
3256                         zero_ex.ee_block = orig_ex.ee_block;
3257                         zero_ex.ee_len = cpu_to_le16(
3258                                                 ext4_ext_get_actual_len(&orig_ex));
3259                         ext4_ext_store_pblock(&zero_ex,
3260                                               ext4_ext_pblock(&orig_ex));
3261                 }
3262
3263                 if (err)
3264                         goto fix_extent_len;
3265                 /* update the extent length and mark as initialized */
3266                 ex->ee_len = cpu_to_le16(ee_len);
3267                 ext4_ext_try_to_merge(handle, inode, path, ex);
3268                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3269                 if (err)
3270                         goto fix_extent_len;
3271
3272                 /* update extent status tree */
3273                 err = ext4_zeroout_es(inode, &zero_ex);
3274
3275                 goto out;
3276         } else if (err)
3277                 goto fix_extent_len;
3278
3279 out:
3280         ext4_ext_show_leaf(inode, path);
3281         return err;
3282
3283 fix_extent_len:
3284         ex->ee_len = orig_ex.ee_len;
3285         ext4_ext_dirty(handle, inode, path + path->p_depth);
3286         return err;
3287 }
3288
3289 /*
3290  * ext4_split_extent() splits an extent and marks the extent which is covered
3291  * by @map as split_flag indicates
3292  *
3293  * It may result in splitting the extent into multiple extents (up to three).
3294  * There are three possibilities:
3295  *   a> There is no split required
3296  *   b> Splits in two extents: Split is happening at either end of the extent
3297  *   c> Splits in three extents: Someone is splitting in the middle of the extent
3298  *
3299  */
3300 static int ext4_split_extent(handle_t *handle,
3301                               struct inode *inode,
3302                               struct ext4_ext_path **ppath,
3303                               struct ext4_map_blocks *map,
3304                               int split_flag,
3305                               int flags)
3306 {
3307         struct ext4_ext_path *path = *ppath;
3308         ext4_lblk_t ee_block;
3309         struct ext4_extent *ex;
3310         unsigned int ee_len, depth;
3311         int err = 0;
3312         int unwritten;
3313         int split_flag1, flags1;
3314         int allocated = map->m_len;
3315
3316         depth = ext_depth(inode);
3317         ex = path[depth].p_ext;
3318         ee_block = le32_to_cpu(ex->ee_block);
3319         ee_len = ext4_ext_get_actual_len(ex);
3320         unwritten = ext4_ext_is_unwritten(ex);
3321
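        /*
         * Split at the end of the mapped range first; the split at
         * map->m_lblk is done afterwards against a freshly looked-up path,
         * since this first split may move the extent to a different leaf
         * or zero it out.
         */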
3322         if (map->m_lblk + map->m_len < ee_block + ee_len) {
3323                 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3324                 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3325                 if (unwritten)
3326                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3327                                        EXT4_EXT_MARK_UNWRIT2;
3328                 if (split_flag & EXT4_EXT_DATA_VALID2)
3329                         split_flag1 |= EXT4_EXT_DATA_VALID1;
3330                 err = ext4_split_extent_at(handle, inode, ppath,
3331                                 map->m_lblk + map->m_len, split_flag1, flags1);
3332                 if (err)
3333                         goto out;
3334         } else {
3335                 allocated = ee_len - (map->m_lblk - ee_block);
3336         }
3337         /*
3338          * Update path is required because previous ext4_split_extent_at() may
3339          * result in split of original leaf or extent zeroout.
3340          */
3341         path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3342         if (IS_ERR(path))
3343                 return PTR_ERR(path);
3344         depth = ext_depth(inode);
3345         ex = path[depth].p_ext;
3346         if (!ex) {
3347                 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3348                                  (unsigned long) map->m_lblk);
3349                 return -EFSCORRUPTED;
3350         }
3351         unwritten = ext4_ext_is_unwritten(ex);
3352         split_flag1 = 0;
3353
3354         if (map->m_lblk >= ee_block) {
3355                 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3356                 if (unwritten) {
3357                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3358                         split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3359                                                      EXT4_EXT_MARK_UNWRIT2);
3360                 }
3361                 err = ext4_split_extent_at(handle, inode, ppath,
3362                                 map->m_lblk, split_flag1, flags);
3363                 if (err)
3364                         goto out;
3365         }
3366
3367         ext4_ext_show_leaf(inode, path);
3368 out:
3369         return err ? err : allocated;
3370 }
3371
3372 /*
3373  * This function is called by ext4_ext_map_blocks() if someone tries to write
3374  * to an unwritten extent. It may result in splitting the unwritten
3375  * extent into multiple extents (up to three - one initialized and two
3376  * unwritten).
3377  * There are three possibilities:
3378  *   a> There is no split required: Entire extent should be initialized
3379  *   b> Splits in two extents: Write is happening at either end of the extent
3380  *   c> Splits in three extents: Someone is writing in the middle of the extent
3381  *
3382  * Pre-conditions:
3383  *  - The extent pointed to by 'path' is unwritten.
3384  *  - The extent pointed to by 'path' contains a superset
3385  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3386  *
3387  * Post-conditions on success:
3388  *  - the returned value is the number of blocks beyond map->m_lblk
3389  *    that are allocated and initialized.
3390  *    It is guaranteed to be >= map->m_len.
3391  */
3392 static int ext4_ext_convert_to_initialized(handle_t *handle,
3393                                            struct inode *inode,
3394                                            struct ext4_map_blocks *map,
3395                                            struct ext4_ext_path **ppath,
3396                                            int flags)
3397 {
3398         struct ext4_ext_path *path = *ppath;
3399         struct ext4_sb_info *sbi;
3400         struct ext4_extent_header *eh;
3401         struct ext4_map_blocks split_map;
3402         struct ext4_extent zero_ex;
3403         struct ext4_extent *ex, *abut_ex;
3404         ext4_lblk_t ee_block, eof_block;
3405         unsigned int ee_len, depth, map_len = map->m_len;
3406         int allocated = 0, max_zeroout = 0;
3407         int err = 0;
3408         int split_flag = 0;
3409
3410         ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3411                 "block %llu, max_blocks %u\n", inode->i_ino,
3412                 (unsigned long long)map->m_lblk, map_len);
3413
3414         sbi = EXT4_SB(inode->i_sb);
3415         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3416                 inode->i_sb->s_blocksize_bits;
3417         if (eof_block < map->m_lblk + map_len)
3418                 eof_block = map->m_lblk + map_len;
3419
3420         depth = ext_depth(inode);
3421         eh = path[depth].p_hdr;
3422         ex = path[depth].p_ext;
3423         ee_block = le32_to_cpu(ex->ee_block);
3424         ee_len = ext4_ext_get_actual_len(ex);
3425         zero_ex.ee_len = 0;
3426
3427         trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3428
3429         /* Pre-conditions */
3430         BUG_ON(!ext4_ext_is_unwritten(ex));
3431         BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3432
3433         /*
3434          * Attempt to transfer newly initialized blocks from the currently
3435          * unwritten extent to its neighbor. This is much cheaper
3436          * than an insertion followed by a merge as those involve costly
3437          * memmove() calls. Transferring to the left is the common case in
3438          * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3439          * followed by append writes.
3440          *
3441          * Limitations of the current logic:
3442          *  - L1: we do not deal with writes covering the whole extent.
3443          *    This would require removing the extent if the transfer
3444          *    is possible.
3445          *  - L2: we only attempt to merge with an extent stored in the
3446          *    same extent tree node.
3447          */
3448         if ((map->m_lblk == ee_block) &&
3449                 /* See if we can merge left */
3450                 (map_len < ee_len) &&           /*L1*/
3451                 (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3452                 ext4_lblk_t prev_lblk;
3453                 ext4_fsblk_t prev_pblk, ee_pblk;
3454                 unsigned int prev_len;
3455
3456                 abut_ex = ex - 1;
3457                 prev_lblk = le32_to_cpu(abut_ex->ee_block);
3458                 prev_len = ext4_ext_get_actual_len(abut_ex);
3459                 prev_pblk = ext4_ext_pblock(abut_ex);
3460                 ee_pblk = ext4_ext_pblock(ex);
3461
3462                 /*
3463                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3464                  * upon those conditions:
3465                  * - C1: abut_ex is initialized,
3466                  * - C2: abut_ex is logically abutting ex,
3467                  * - C3: abut_ex is physically abutting ex,
3468                  * - C4: abut_ex can receive the additional blocks without
3469                  *   overflowing the (initialized) length limit.
3470                  */
3471                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3472                         ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3473                         ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3474                         (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3475                         err = ext4_ext_get_access(handle, inode, path + depth);
3476                         if (err)
3477                                 goto out;
3478
3479                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3480                                 map, ex, abut_ex);
3481
3482                         /* Shift the start of ex by 'map_len' blocks */
3483                         ex->ee_block = cpu_to_le32(ee_block + map_len);
3484                         ext4_ext_store_pblock(ex, ee_pblk + map_len);
3485                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3486                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3487
3488                         /* Extend abut_ex by 'map_len' blocks */
3489                         abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3490
3491                         /* Result: number of initialized blocks past m_lblk */
3492                         allocated = map_len;
3493                 }
3494         } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3495                    (map_len < ee_len) &&        /*L1*/
3496                    ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3497                 /* See if we can merge right */
3498                 ext4_lblk_t next_lblk;
3499                 ext4_fsblk_t next_pblk, ee_pblk;
3500                 unsigned int next_len;
3501
3502                 abut_ex = ex + 1;
3503                 next_lblk = le32_to_cpu(abut_ex->ee_block);
3504                 next_len = ext4_ext_get_actual_len(abut_ex);
3505                 next_pblk = ext4_ext_pblock(abut_ex);
3506                 ee_pblk = ext4_ext_pblock(ex);
3507
3508                 /*
3509                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3510                  * upon those conditions:
3511                  * - C1: abut_ex is initialized,
3512                  * - C2: abut_ex is logically abutting ex,
3513                  * - C3: abut_ex is physically abutting ex,
3514                  * - C4: abut_ex can receive the additional blocks without
3515                  *   overflowing the (initialized) length limit.
3516                  */
3517                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3518                     ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3519                     ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3520                     (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3521                         err = ext4_ext_get_access(handle, inode, path + depth);
3522                         if (err)
3523                                 goto out;
3524
3525                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3526                                 map, ex, abut_ex);
3527
3528                         /* Shift the start of abut_ex by 'map_len' blocks */
3529                         abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3530                         ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3531                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3532                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3533
3534                         /* Extend abut_ex by 'map_len' blocks */
3535                         abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3536
3537                         /* Result: number of initialized blocks past m_lblk */
3538                         allocated = map_len;
3539                 }
3540         }
3541         if (allocated) {
3542                 /* Mark the block containing both extents as dirty */
3543                 ext4_ext_dirty(handle, inode, path + depth);
3544
3545                 /* Update path to point to the right extent */
3546                 path[depth].p_ext = abut_ex;
3547                 goto out;
3548         } else
3549                 allocated = ee_len - (map->m_lblk - ee_block);
3550
3551         WARN_ON(map->m_lblk < ee_block);
3552         /*
3553          * It is safe to convert extent to initialized via explicit
3554          * zeroout only if extent is fully inside i_size or new_size.
3555          */
3556         split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3557
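        /*
         * s_extent_max_zeroout_kb is expressed in KiB; shifting by
         * (blocksize_bits - 10) converts it into a number of blocks so it
         * can be compared against extent lengths below.
         */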
3558         if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3559                 max_zeroout = sbi->s_extent_max_zeroout_kb >>
3560                         (inode->i_sb->s_blocksize_bits - 10);
3561
3562         if (ext4_encrypted_inode(inode))
3563                 max_zeroout = 0;
3564
3565         /* If extent is less than s_extent_max_zeroout_kb, zeroout directly */
3566         if (max_zeroout && (ee_len <= max_zeroout)) {
3567                 err = ext4_ext_zeroout(inode, ex);
3568                 if (err)
3569                         goto out;
3570                 zero_ex.ee_block = ex->ee_block;
3571                 zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
3572                 ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
3573
3574                 err = ext4_ext_get_access(handle, inode, path + depth);
3575                 if (err)
3576                         goto out;
3577                 ext4_ext_mark_initialized(ex);
3578                 ext4_ext_try_to_merge(handle, inode, path, ex);
3579                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3580                 goto out;
3581         }
3582
3583         /*
3584          * four cases:
3585          * 1. split the extent into three extents.
3586          * 2. split the extent into two extents, zeroout the first half.
3587          * 3. split the extent into two extents, zeroout the second half.
3588          * 4. split the extent into two extents without zeroout.
3589          */
3590         split_map.m_lblk = map->m_lblk;
3591         split_map.m_len = map->m_len;
3592
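        /*
         * For cases 2 and 3 the zeroed-out range is folded into split_map
         * below, so the following ext4_split_extent() call needs fewer
         * splits to cover the request.
         */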
3593         if (max_zeroout && (allocated > map->m_len)) {
3594                 if (allocated <= max_zeroout) {
3595                         /* case 3 */
3596                         zero_ex.ee_block =
3597                                          cpu_to_le32(map->m_lblk);
3598                         zero_ex.ee_len = cpu_to_le16(allocated);
3599                         ext4_ext_store_pblock(&zero_ex,
3600                                 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3601                         err = ext4_ext_zeroout(inode, &zero_ex);
3602                         if (err)
3603                                 goto out;
3604                         split_map.m_lblk = map->m_lblk;
3605                         split_map.m_len = allocated;
3606                 } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3607                         /* case 2 */
3608                         if (map->m_lblk != ee_block) {
3609                                 zero_ex.ee_block = ex->ee_block;
3610                                 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3611                                                         ee_block);
3612                                 ext4_ext_store_pblock(&zero_ex,
3613                                                       ext4_ext_pblock(ex));
3614                                 err = ext4_ext_zeroout(inode, &zero_ex);
3615                                 if (err)
3616                                         goto out;
3617                         }
3618
3619                         split_map.m_lblk = ee_block;
3620                         split_map.m_len = map->m_lblk - ee_block + map->m_len;
3621                         allocated = map->m_len;
3622                 }
3623         }
3624
3625         err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3626                                 flags);
3627         if (err > 0)
3628                 err = 0;
3629 out:
3630         /* If we have gotten a failure, don't zero out status tree */
3631         if (!err)
3632                 err = ext4_zeroout_es(inode, &zero_ex);
3633         return err ? err : allocated;
3634 }
3635
3636 /*
3637  * This function is called by ext4_ext_map_blocks() from
3638  * ext4_get_blocks_dio_write() when DIO is used to write
3639  * to an unwritten extent.
3640  *
3641  * Writing to an unwritten extent may result in splitting the unwritten
3642  * extent into multiple initialized/unwritten extents (up to three)
3643  * There are three possibilities:
3644  *   a> There is no split required: Entire extent should be unwritten
3645  *   b> Splits in two extents: Write is happening at either end of the extent
3646  *   c> Splits in three extents: Someone is writing in the middle of the extent
3647  *
3648  * This works the same way in the case of initialized -> unwritten conversion.
3649  *
3650  * One or more index blocks may be needed if the extent tree grows after
3651  * the unwritten extent is split. To prevent ENOSPC from occurring at IO
3652  * completion, we need to split the unwritten extent before the DIO is
3653  * submitted. The unwritten extent will be split into at most three
3654  * unwritten extents. After the IO completes, the part that was filled
3655  * will be converted to initialized by the end_io callback function
3656  * via ext4_convert_unwritten_extents().
3657  *
3658  * Returns the size of unwritten extent to be written on success.
3659  */
3660 static int ext4_split_convert_extents(handle_t *handle,
3661                                         struct inode *inode,
3662                                         struct ext4_map_blocks *map,
3663                                         struct ext4_ext_path **ppath,
3664                                         int flags)
3665 {
3666         struct ext4_ext_path *path = *ppath;
3667         ext4_lblk_t eof_block;
3668         ext4_lblk_t ee_block;
3669         struct ext4_extent *ex;
3670         unsigned int ee_len;
3671         int split_flag = 0, depth;
3672
3673         ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3674                   __func__, inode->i_ino,
3675                   (unsigned long long)map->m_lblk, map->m_len);
3676
3677         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3678                 inode->i_sb->s_blocksize_bits;
3679         if (eof_block < map->m_lblk + map->m_len)
3680                 eof_block = map->m_lblk + map->m_len;
3681         /*
3682          * It is safe to convert extent to initialized via explicit
3683          * zeroout only if extent is fully inside i_size or new_size.
3684          */
3685         depth = ext_depth(inode);
3686         ex = path[depth].p_ext;
3687         ee_block = le32_to_cpu(ex->ee_block);
3688         ee_len = ext4_ext_get_actual_len(ex);
3689
3690         /* Convert to unwritten */
3691         if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3692                 split_flag |= EXT4_EXT_DATA_VALID1;
3693         /* Convert to initialized */
3694         } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3695                 split_flag |= ee_block + ee_len <= eof_block ?
3696                               EXT4_EXT_MAY_ZEROOUT : 0;
3697                 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3698         }
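        /*
         * PRE_IO suppresses merging of the split extents (see
         * ext4_split_extent_at() and ext4_ext_insert_extent()), so they
         * stay split until the pending IO completes and is converted.
         */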
3699         flags |= EXT4_GET_BLOCKS_PRE_IO;
3700         return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3701 }
3702
3703 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3704                                                 struct inode *inode,
3705                                                 struct ext4_map_blocks *map,
3706                                                 struct ext4_ext_path **ppath)
3707 {
3708         struct ext4_ext_path *path = *ppath;
3709         struct ext4_extent *ex;
3710         ext4_lblk_t ee_block;
3711         unsigned int ee_len;
3712         int depth;
3713         int err = 0;
3714
3715         depth = ext_depth(inode);
3716         ex = path[depth].p_ext;
3717         ee_block = le32_to_cpu(ex->ee_block);
3718         ee_len = ext4_ext_get_actual_len(ex);
3719
3720         ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3721                 "block %llu, max_blocks %u\n", inode->i_ino,
3722                   (unsigned long long)ee_block, ee_len);
3723
3724         /* If the extent is larger than requested, it is a clear sign that we
3725          * still have some extent state machine issues left, so an extent split
3726          * is still required.
3727          * TODO: once all related issues are fixed, this situation should be
3728          * illegal.
3729          */
3730         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3731 #ifdef EXT4_DEBUG
3732                 ext4_warning("Inode (%ld) finished: extent logical block %llu,"
3733                              " len %u; IO logical block %llu, len %u\n",
3734                              inode->i_ino, (unsigned long long)ee_block, ee_len,
3735                              (unsigned long long)map->m_lblk, map->m_len);
3736 #endif
3737                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3738                                                  EXT4_GET_BLOCKS_CONVERT);
3739                 if (err < 0)
3740                         return err;
3741                 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3742                 if (IS_ERR(path))
3743                         return PTR_ERR(path);
3744                 depth = ext_depth(inode);
3745                 ex = path[depth].p_ext;
3746         }
3747
3748         err = ext4_ext_get_access(handle, inode, path + depth);
3749         if (err)
3750                 goto out;
3751         /* first mark the extent as initialized */
3752         ext4_ext_mark_initialized(ex);
3753
3754         /* note: ext4_ext_correct_indexes() isn't needed here because
3755          * borders are not changed
3756          */
3757         ext4_ext_try_to_merge(handle, inode, path, ex);
3758
3759         /* Mark modified extent as dirty */
3760         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3761 out:
3762         ext4_ext_show_leaf(inode, path);
3763         return err;
3764 }
3765
3766 static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3767                         sector_t block, int count)
3768 {
3769         int i;
3770         for (i = 0; i < count; i++)
3771                 unmap_underlying_metadata(bdev, block + i);
3772 }
3773
3774 /*
3775  * Handle EOFBLOCKS_FL flag, clearing it if necessary
3776  */
3777 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3778                               ext4_lblk_t lblk,
3779                               struct ext4_ext_path *path,
3780                               unsigned int len)
3781 {
3782         int i, depth;
3783         struct ext4_extent_header *eh;
3784         struct ext4_extent *last_ex;
3785
3786         if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3787                 return 0;
3788
3789         depth = ext_depth(inode);
3790         eh = path[depth].p_hdr;
3791
3792         /*
3793          * We're going to remove EOFBLOCKS_FL entirely in the future, so we
3794          * do not care about this case anymore. Simply remove the flag
3795          * if there are no extents.
3796          */
3797         if (unlikely(!eh->eh_entries))
3798                 goto out;
3799         last_ex = EXT_LAST_EXTENT(eh);
3800         /*
3801          * We should clear the EOFBLOCKS_FL flag if we are writing the
3802          * last block in the last extent in the file.  We test this by
3803          * first checking to see if the caller to
3804          * ext4_ext_get_blocks() was interested in the last block (or
3805          * a block beyond the last block) in the current extent.  If
3806          * this turns out to be false, we can bail out from this
3807          * function immediately.
3808          */
3809         if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3810             ext4_ext_get_actual_len(last_ex))
3811                 return 0;
3812         /*
3813          * If the caller does appear to be planning to write at or
3814          * beyond the end of the current extent, we then test to see
3815          * if the current extent is the last extent in the file, by
3816          * checking to make sure it was reached via the rightmost node
3817          * at each level of the tree.
3818          */
3819         for (i = depth-1; i >= 0; i--)
3820                 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3821                         return 0;
3822 out:
3823         ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3824         return ext4_mark_inode_dirty(handle, inode);
3825 }
3826
3827 /**
3828  * ext4_find_delalloc_range: find delayed allocated block in the given range.
3829  *
3830  * Return 1 if there is a delalloc block in the range, otherwise 0.
3831  */
3832 int ext4_find_delalloc_range(struct inode *inode,
3833                              ext4_lblk_t lblk_start,
3834                              ext4_lblk_t lblk_end)
3835 {
3836         struct extent_status es;
3837
3838         ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
3839         if (es.es_len == 0)
3840                 return 0; /* there is no delay extent in this tree */
3841         else if (es.es_lblk <= lblk_start &&
3842                  lblk_start < es.es_lblk + es.es_len)
3843                 return 1;
3844         else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
3845                 return 1;
3846         else
3847                 return 0;
3848 }
3849
3850 int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3851 {
3852         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3853         ext4_lblk_t lblk_start, lblk_end;
3854         lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
3855         lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3856
3857         return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3858 }
3859
3860 /**
3861  * Determines how many complete clusters (out of those specified by the 'map')
3862  * are under delalloc and had quota reserved for them.
3863  * This function is called when we are writing out the blocks that were
3864  * originally written with their allocation delayed, but then the space was
3865  * allocated using fallocate() before the delayed allocation could be resolved.
3866  * The cases to look for are:
3867  * ('=' indicates delayed allocated blocks
3868  *  '-' indicates non-delayed allocated blocks)
3869  * (a) partial clusters towards beginning and/or end outside of allocated range
3870  *     are not delalloc'ed.
3871  *      Ex:
3872  *      |----c---=|====c====|====c====|===-c----|
3873  *               |++++++ allocated ++++++|
3874  *      ==> 4 complete clusters in above example
3875  *
3876  * (b) partial cluster (outside of allocated range) towards either end is
3877  *     marked for delayed allocation. In this case, we will exclude that
3878  *     cluster.
3879  *      Ex:
3880  *      |----====c========|========c========|
3881  *           |++++++ allocated ++++++|
3882  *      ==> 1 complete cluster in the above example
3883  *
3884  *      Ex:
3885  *      |================c================|
3886  *            |++++++ allocated ++++++|
3887  *      ==> 0 complete clusters in above example
3888  *
3889  * The ext4_da_update_reserve_space will be called only if we
3890  * determine here that there were some "entire" clusters that span
3891  * this 'allocated' range.
3892  * In the non-bigalloc case, this function will just end up returning num_blks
3893  * without ever calling ext4_find_delalloc_range.
3894  */
3895 static unsigned int
3896 get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3897                            unsigned int num_blks)
3898 {
3899         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3900         ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3901         ext4_lblk_t lblk_from, lblk_to, c_offset;
3902         unsigned int allocated_clusters = 0;
3903
3904         alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3905         alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3906
3907         /* max possible clusters for this allocation */
3908         allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3909
3910         trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3911
3912         /* Check towards left side */
3913         c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
3914         if (c_offset) {
3915                 lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
3916                 lblk_to = lblk_from + c_offset - 1;
3917
3918                 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3919                         allocated_clusters--;
3920         }
3921
3922         /* Now check towards right. */
3923         c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
3924         if (allocated_clusters && c_offset) {
3925                 lblk_from = lblk_start + num_blks;
3926                 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3927
3928                 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3929                         allocated_clusters--;
3930         }
3931
3932         return allocated_clusters;
3933 }
3934
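/*
 * Convert an initialized extent (or the part of it covered by @map) to
 * unwritten, splitting it first when the map does not cover the whole
 * extent.  Called when EXT4_GET_BLOCKS_CONVERT_UNWRITTEN is requested.
 */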
3935 static int
3936 convert_initialized_extent(handle_t *handle, struct inode *inode,
3937                            struct ext4_map_blocks *map,
3938                            struct ext4_ext_path **ppath, int flags,
3939                            unsigned int allocated, ext4_fsblk_t newblock)
3940 {
3941         struct ext4_ext_path *path = *ppath;
3942         struct ext4_extent *ex;
3943         ext4_lblk_t ee_block;
3944         unsigned int ee_len;
3945         int depth;
3946         int err = 0;
3947
3948         /*
3949          * Make sure that the extent is no bigger than we support with
3950          * an unwritten extent
3951          */
3952         if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3953                 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3954
3955         depth = ext_depth(inode);
3956         ex = path[depth].p_ext;
3957         ee_block = le32_to_cpu(ex->ee_block);
3958         ee_len = ext4_ext_get_actual_len(ex);
3959
3960         ext_debug("%s: inode %lu, logical "
3961                 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3962                   (unsigned long long)ee_block, ee_len);
3963
3964         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3965                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3966                                 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3967                 if (err < 0)
3968                         return err;
3969                 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3970                 if (IS_ERR(path))
3971                         return PTR_ERR(path);
3972                 depth = ext_depth(inode);
3973                 ex = path[depth].p_ext;
3974                 if (!ex) {
3975                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3976                                          (unsigned long) map->m_lblk);
3977                         return -EFSCORRUPTED;
3978                 }
3979         }
3980
3981         err = ext4_ext_get_access(handle, inode, path + depth);
3982         if (err)
3983                 return err;
3984         /* first mark the extent as unwritten */
3985         ext4_ext_mark_unwritten(ex);
3986
3987         /* note: ext4_ext_correct_indexes() isn't needed here because
3988          * borders are not changed
3989          */
3990         ext4_ext_try_to_merge(handle, inode, path, ex);
3991
3992         /* Mark modified extent as dirty */
3993         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3994         if (err)
3995                 return err;
3996         ext4_ext_show_leaf(inode, path);
3997
3998         ext4_update_inode_fsync_trans(handle, inode, 1);
3999         err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
4000         if (err)
4001                 return err;
4002         map->m_flags |= EXT4_MAP_UNWRITTEN;
4003         if (allocated > map->m_len)
4004                 allocated = map->m_len;
4005         map->m_len = allocated;
4006         return allocated;
4007 }
4008
4009 static int
4010 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4011                         struct ext4_map_blocks *map,
4012                         struct ext4_ext_path **ppath, int flags,
4013                         unsigned int allocated, ext4_fsblk_t newblock)
4014 {
4015         struct ext4_ext_path *path = *ppath;
4016         int ret = 0;
4017         int err = 0;
4018         ext4_io_end_t *io = ext4_inode_aio(inode);
4019
4020         ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4021                   "block %llu, max_blocks %u, flags %x, allocated %u\n",
4022                   inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
4023                   flags, allocated);
4024         ext4_ext_show_leaf(inode, path);
4025
4026         /*
4027          * When writing into unwritten space, we should not fail to
4028          * allocate metadata blocks for the new extent block if needed.
4029          */
4030         flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4031
4032         trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4033                                                     allocated, newblock);
4034
4035         /* get_block() called before submitting the IO: split the extent */
4036         if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4037                 ret = ext4_split_convert_extents(handle, inode, map, ppath,
4038                                          flags | EXT4_GET_BLOCKS_CONVERT);
4039                 if (ret <= 0)
4040                         goto out;
4041                 /*
4042                  * Flag the inode (non-aio case) or the end_io struct (aio
4043                  * case) so that this IO gets converted to written when the
4044                  * IO is completed
4045                  */
4046                 if (io)
4047                         ext4_set_io_unwritten_flag(inode, io);
4048                 else
4049                         ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
4050                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4051                 goto out;
4052         }
4053         /* IO end_io complete, convert the filled extent to written */
4054         if (flags & EXT4_GET_BLOCKS_CONVERT) {
4055                 ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4056                                                            ppath);
4057                 if (ret >= 0) {
4058                         ext4_update_inode_fsync_trans(handle, inode, 1);
4059                         err = check_eofblocks_fl(handle, inode, map->m_lblk,
4060                                                  path, map->m_len);
4061                 } else
4062                         err = ret;
4063                 map->m_flags |= EXT4_MAP_MAPPED;
4064                 map->m_pblk = newblock;
4065                 if (allocated > map->m_len)
4066                         allocated = map->m_len;
4067                 map->m_len = allocated;
4068                 goto out2;
4069         }
4070         /* buffered IO case */
4071         /*
4072          * Repeated fallocate creation request: we already have an
4073          * unwritten extent covering this range, so just report it.
4074          */
4075         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4076                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4077                 goto map_out;
4078         }
4079
4080         /* buffered READ or buffered write_begin() lookup */
4081         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4082                 /*
4083                  * We have blocks reserved already.  We
4084                  * return allocated blocks so that delalloc
4085                  * won't do block reservation for us.  But
4086                  * the buffer head will be unmapped so that
4087                  * a read from the block returns 0s.
4088                  */
4089                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4090                 goto out1;
4091         }
4092
4093         /* buffered write, writepage time, convert */
4094         ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
4095         if (ret >= 0)
4096                 ext4_update_inode_fsync_trans(handle, inode, 1);
4097 out:
4098         if (ret <= 0) {
4099                 err = ret;
4100                 goto out2;
4101         } else
4102                 allocated = ret;
4103         map->m_flags |= EXT4_MAP_NEW;
4104         /*
4105          * If we allocated more blocks than requested, we need to
4106          * make sure we unmap the extra blocks allocated. The
4107          * blocks that are actually needed will get unmapped
4108          * later, when we find the buffer_head marked
4109          * new.
4110          */
4111         if (allocated > map->m_len) {
4112                 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
4113                                         newblock + map->m_len,
4114                                         allocated - map->m_len);
4115                 allocated = map->m_len;
4116         }
4117         map->m_len = allocated;
4118
4119         /*
4120          * If we have done fallocate at an offset that is already
4121          * delayed allocated, we would have block and quota
4122          * reservations done in the delayed write path. But
4123          * fallocate would have already updated the quota and block
4124          * count for this offset, so cancel those reservations.
4125          */
4126         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4127                 unsigned int reserved_clusters;
4128                 reserved_clusters = get_reserved_cluster_alloc(inode,
4129                                 map->m_lblk, map->m_len);
4130                 if (reserved_clusters)
4131                         ext4_da_update_reserve_space(inode,
4132                                                      reserved_clusters,
4133                                                      0);
4134         }
4135
4136 map_out:
4137         map->m_flags |= EXT4_MAP_MAPPED;
4138         if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4139                 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4140                                          map->m_len);
4141                 if (err < 0)
4142                         goto out2;
4143         }
4144 out1:
4145         if (allocated > map->m_len)
4146                 allocated = map->m_len;
4147         ext4_ext_show_leaf(inode, path);
4148         map->m_pblk = newblock;
4149         map->m_len = allocated;
4150 out2:
4151         return err ? err : allocated;
4152 }
4153
4154 /*
4155  * get_implied_cluster_alloc - check to see if the requested
4156  * allocation (in the map structure) overlaps with a cluster already
4157  * allocated in an extent.
4158  *      @sb     The filesystem superblock structure
4159  *      @map    The requested lblk->pblk mapping
4160  *      @ex     The extent structure which might contain an implied
4161  *                      cluster allocation
4162  *
4163  * This function is called by ext4_ext_map_blocks() after we failed to
4164  * find blocks that were already in the inode's extent tree.  Hence,
4165  * we know that the beginning of the requested region cannot overlap
4166  * the extent from the inode's extent tree.  There are three cases we
4167  * want to catch.  The first is this case:
4168  *
4169  *               |--- cluster # N--|
4170  *    |--- extent ---|  |---- requested region ---|
4171  *                      |==========|
4172  *
4173  * The second case that we need to test for is this one:
4174  *
4175  *   |--------- cluster # N ----------------|
4176  *         |--- requested region --|   |------- extent ----|
4177  *         |=======================|
4178  *
4179  * The third case is when the requested region lies between two extents
4180  * within the same cluster:
4181  *          |------------- cluster # N-------------|
4182  * |----- ex -----|                  |---- ex_right ----|
4183  *                  |------ requested region ------|
4184  *                  |================|
4185  *
4186  * In each of the above cases, we need to set the map->m_pblk and
4187  * map->m_len so that they correspond to the region labelled as
4188  * "|====|" from cluster #N, since it is already in use for data in
4189  * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
4190  * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4191  * as a new "allocated" block region.  Otherwise, we will return 0 and
4192  * ext4_ext_map_blocks() will then allocate one or more new clusters
4193  * by calling ext4_mb_new_blocks().
4194  */
4195 static int get_implied_cluster_alloc(struct super_block *sb,
4196                                      struct ext4_map_blocks *map,
4197                                      struct ext4_extent *ex,
4198                                      struct ext4_ext_path *path)
4199 {
4200         struct ext4_sb_info *sbi = EXT4_SB(sb);
4201         ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4202         ext4_lblk_t ex_cluster_start, ex_cluster_end;
4203         ext4_lblk_t rr_cluster_start;
4204         ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4205         ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4206         unsigned short ee_len = ext4_ext_get_actual_len(ex);
4207
4208         /* The extent passed in that we are trying to match */
4209         ex_cluster_start = EXT4_B2C(sbi, ee_block);
4210         ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4211
4212         /* The requested region passed into ext4_map_blocks() */
4213         rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4214
4215         if ((rr_cluster_start == ex_cluster_end) ||
4216             (rr_cluster_start == ex_cluster_start)) {
4217                 if (rr_cluster_start == ex_cluster_end)
4218                         ee_start += ee_len - 1;
4219                 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4220                 map->m_len = min(map->m_len,
4221                                  (unsigned) sbi->s_cluster_ratio - c_offset);
4222                 /*
4223                  * Check for and handle this case:
4224                  *
4225                  *   |--------- cluster # N-------------|
4226                  *                     |------- extent ----|
4227                  *         |--- requested region ---|
4228                  *         |===========|
4229                  */
4230
4231                 if (map->m_lblk < ee_block)
4232                         map->m_len = min(map->m_len, ee_block - map->m_lblk);
4233
4234                 /*
4235                  * Check for the case where there is already another allocated
4236                  * block to the right of 'ex' but before the end of the cluster.
4237                  *
4238                  *          |------------- cluster # N-------------|
4239                  * |----- ex -----|                  |---- ex_right ----|
4240                  *                  |------ requested region ------|
4241                  *                  |================|
4242                  */
4243                 if (map->m_lblk > ee_block) {
4244                         ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4245                         map->m_len = min(map->m_len, next - map->m_lblk);
4246                 }
4247
4248                 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4249                 return 1;
4250         }
4251
4252         trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4253         return 0;
4254 }
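/*
 * Worked example (an illustrative sketch; it assumes s_cluster_ratio == 4
 * and an extent ex mapping logical blocks 0..5 to physical blocks 100..105):
 *
 *      map->m_lblk = 7, map->m_len = 8
 *      ex_cluster_start = EXT4_B2C(sbi, 0) = 0
 *      ex_cluster_end   = EXT4_B2C(sbi, 5) = 1
 *      rr_cluster_start = EXT4_B2C(sbi, 7) = 1   => matches ex_cluster_end
 *      c_offset    = EXT4_LBLK_COFF(sbi, 7) = 3
 *      ee_start   += ee_len - 1                  => ee_start = 105, which
 *                                                   lies in cluster 104..107
 *      map->m_pblk = EXT4_PBLK_CMASK(sbi, 105) + 3 = 107
 *      map->m_len  = min(8, 4 - 3) = 1
 *
 * Because map->m_lblk (7) is greater than ee_block (0), m_len may be capped
 * further by ext4_ext_next_allocated_block().  Logical block 7 is thus mapped
 * to physical block 107 out of the cluster ex already occupies, and the
 * function returns 1 without any new cluster allocation.
 */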
4255
4256
4257 /*
4258  * Block allocation/map/preallocation routine for extents based files
4259  *
4260  *
4261  * Needs to be called with
4262  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
4263  * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem).
4264  *
4265  * return > 0, number of blocks already mapped/allocated
4266  *          if create == 0 and these are pre-allocated blocks
4267  *              buffer head is unmapped
4268  *          otherwise blocks are mapped
4269  *
4270  * return = 0, if plain look up failed (blocks have not been allocated)
4271  *          buffer head is unmapped
4272  *
4273  * return < 0, error case.
4274  */
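/*
 * Caller-side sketch of the contract above (illustrative only; "lblk" is a
 * hypothetical logical block number and flags == 0 means a plain lookup):
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
 *	int ret = ext4_ext_map_blocks(handle, inode, &map, 0);
 *
 *	ret > 0:  ret blocks are mapped and map.m_pblk is valid (the buffer
 *		  head stays unmapped when they are pre-allocated/unwritten
 *		  and create == 0)
 *	ret == 0: plain lookup failed, nothing is allocated at lblk
 *	ret < 0:  error (negative errno)
 */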
4275 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4276                         struct ext4_map_blocks *map, int flags)
4277 {
4278         struct ext4_ext_path *path = NULL;
4279         struct ext4_extent newex, *ex, *ex2;
4280         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4281         ext4_fsblk_t newblock = 0;
4282         int free_on_err = 0, err = 0, depth, ret;
4283         unsigned int allocated = 0, offset = 0;
4284         unsigned int allocated_clusters = 0;
4285         struct ext4_allocation_request ar;
4286         ext4_io_end_t *io = ext4_inode_aio(inode);
4287         ext4_lblk_t cluster_offset;
4288         int set_unwritten = 0;
4289         bool map_from_cluster = false;
4290
4291         ext_debug("blocks %u/%u requested for inode %lu\n",
4292                   map->m_lblk, map->m_len, inode->i_ino);
4293         trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4294
4295         /* find extent for this block */
4296         path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4297         if (IS_ERR(path)) {
4298                 err = PTR_ERR(path);
4299                 path = NULL;
4300                 goto out2;
4301         }
4302
4303         depth = ext_depth(inode);
4304
4305         /*
4306          * consistent leaf must not be empty;
4307          * this situation is possible, though, _during_ tree modification;
4308          * this is why assert can't be put in ext4_find_extent()
4309          */
4310         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4311                 EXT4_ERROR_INODE(inode, "bad extent address "
4312                                  "lblock: %lu, depth: %d pblock %lld",
4313                                  (unsigned long) map->m_lblk, depth,
4314                                  path[depth].p_block);
4315                 err = -EFSCORRUPTED;
4316                 goto out2;
4317         }
4318
4319         ex = path[depth].p_ext;
4320         if (ex) {
4321                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4322                 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4323                 unsigned short ee_len;
4324
4325
4326                 /*
4327                  * unwritten extents are treated as holes, except that
4328                  * we split out initialized portions during a write.
4329                  */
4330                 ee_len = ext4_ext_get_actual_len(ex);
4331
4332                 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4333
4334                 /* if found extent covers block, simply return it */
4335                 if (in_range(map->m_lblk, ee_block, ee_len)) {
4336                         newblock = map->m_lblk - ee_block + ee_start;
4337                         /* number of remaining blocks in the extent */
4338                         allocated = ee_len - (map->m_lblk - ee_block);
4339                         ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4340                                   ee_block, ee_len, newblock);
4341
4342                         /*
4343                          * If the extent is initialized check whether the
4344                          * caller wants to convert it to unwritten.
4345                          */
4346                         if ((!ext4_ext_is_unwritten(ex)) &&
4347                             (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4348                                 allocated = convert_initialized_extent(
4349                                                 handle, inode, map, &path,
4350                                                 flags, allocated, newblock);
4351                                 goto out2;
4352                         } else if (!ext4_ext_is_unwritten(ex))
4353                                 goto out;
4354
4355                         ret = ext4_ext_handle_unwritten_extents(
4356                                 handle, inode, map, &path, flags,
4357                                 allocated, newblock);
4358                         if (ret < 0)
4359                                 err = ret;
4360                         else
4361                                 allocated = ret;
4362                         goto out2;
4363                 }
4364         }
4365
4366         /*
4367          * requested block isn't allocated yet;
4368          * we must not try to create one if the create flag is zero
4369          */
4370         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4371                 /*
4372                  * put just found gap into cache to speed up
4373                  * subsequent requests
4374                  */
4375                 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4376                 goto out2;
4377         }
4378
4379         /*
4380          * Okay, we need to do block allocation.
4381          */
4382         newex.ee_block = cpu_to_le32(map->m_lblk);
4383         cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4384
4385         /*
4386          * If we are doing bigalloc, check to see if the extent returned
4387          * by ext4_find_extent() implies a cluster we can use.
4388          */
4389         if (cluster_offset && ex &&
4390             get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4391                 ar.len = allocated = map->m_len;
4392                 newblock = map->m_pblk;
4393                 map_from_cluster = true;
4394                 goto got_allocated_blocks;
4395         }
4396
4397         /* find neighbour allocated blocks */
4398         ar.lleft = map->m_lblk;
4399         err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4400         if (err)
4401                 goto out2;
4402         ar.lright = map->m_lblk;
4403         ex2 = NULL;
4404         err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4405         if (err)
4406                 goto out2;
4407
4408         /* Check if the extent after searching to the right implies a
4409          * cluster we can use. */
4410         if ((sbi->s_cluster_ratio > 1) && ex2 &&
4411             get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4412                 ar.len = allocated = map->m_len;
4413                 newblock = map->m_pblk;
4414                 map_from_cluster = true;
4415                 goto got_allocated_blocks;
4416         }
4417
4418         /*
4419          * See if request is beyond maximum number of blocks we can have in
4420          * a single extent. For an initialized extent this limit is
4421          * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4422          * EXT_UNWRITTEN_MAX_LEN.
4423          */
4424         if (map->m_len > EXT_INIT_MAX_LEN &&
4425             !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4426                 map->m_len = EXT_INIT_MAX_LEN;
4427         else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4428                  (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4429                 map->m_len = EXT_UNWRITTEN_MAX_LEN;
4430
4431         /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4432         newex.ee_len = cpu_to_le16(map->m_len);
4433         err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4434         if (err)
4435                 allocated = ext4_ext_get_actual_len(&newex);
4436         else
4437                 allocated = map->m_len;
4438
4439         /* allocate new block */
4440         ar.inode = inode;
4441         ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4442         ar.logical = map->m_lblk;
4443         /*
4444          * We calculate the offset from the beginning of the cluster
4445          * for the logical block number, since when we allocate a
4446          * physical cluster, the physical block should start at the
4447          * same offset from the beginning of the cluster.  This is
4448          * needed so that future calls to get_implied_cluster_alloc()
4449          * work correctly.
4450          */
4451         offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4452         ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4453         ar.goal -= offset;
4454         ar.logical -= offset;
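        /*
         * For example (a sketch assuming s_cluster_ratio == 4): with
         * map->m_lblk = 10 and allocated = 4, offset = 2, so the request
         * becomes ar.logical = 8 and ar.len = EXT4_NUM_B2C(sbi, 6) = 2
         * clusters, which keeps the physical block at the same offset
         * within its cluster as the logical block.
         */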
4455         if (S_ISREG(inode->i_mode))
4456                 ar.flags = EXT4_MB_HINT_DATA;
4457         else
4458                 /* disable in-core preallocation for non-regular files */
4459                 ar.flags = 0;
4460         if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4461                 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4462         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4463                 ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4464         if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4465                 ar.flags |= EXT4_MB_USE_RESERVED;
4466         newblock = ext4_mb_new_blocks(handle, &ar, &err);
4467         if (!newblock)
4468                 goto out2;
4469         ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4470                   ar.goal, newblock, allocated);
4471         free_on_err = 1;
4472         allocated_clusters = ar.len;
4473         ar.len = EXT4_C2B(sbi, ar.len) - offset;
4474         if (ar.len > allocated)
4475                 ar.len = allocated;
4476
4477 got_allocated_blocks:
4478         /* try to insert new extent into found leaf and return */
4479         ext4_ext_store_pblock(&newex, newblock + offset);
4480         newex.ee_len = cpu_to_le16(ar.len);
4481         /* Mark unwritten */
4482         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4483                 ext4_ext_mark_unwritten(&newex);
4484                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4485                 /*
4486                  * An io_end structure is created for every IO write to an
4487                  * unwritten extent. To avoid unnecessary conversion,
4488                  * here we flag the IO that really needs the conversion.
4489                  * For the non-async direct IO case, flag the inode state
4490                  * so that we perform the conversion when IO is done.
4491                  */
4492                 if (flags & EXT4_GET_BLOCKS_PRE_IO)
4493                         set_unwritten = 1;
4494         }
4495
4496         err = 0;
4497         if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4498                 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4499                                          path, ar.len);
4500         if (!err)
4501                 err = ext4_ext_insert_extent(handle, inode, &path,
4502                                              &newex, flags);
4503
4504         if (!err && set_unwritten) {
4505                 if (io)
4506                         ext4_set_io_unwritten_flag(inode, io);
4507                 else
4508                         ext4_set_inode_state(inode,
4509                                              EXT4_STATE_DIO_UNWRITTEN);
4510         }
4511
4512         if (err && free_on_err) {
4513                 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4514                         EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4515                 /* free data blocks we just allocated */
4516                 /* not a good idea to call discard here directly,
4517                  * but otherwise we'd need to call it on every free() */
4518                 ext4_discard_preallocations(inode);
4519                 ext4_free_blocks(handle, inode, NULL, newblock,
4520                                  EXT4_C2B(sbi, allocated_clusters), fb_flags);
4521                 goto out2;
4522         }
4523
4524         /* previous routine could use block we allocated */
4525         newblock = ext4_ext_pblock(&newex);
4526         allocated = ext4_ext_get_actual_len(&newex);
4527         if (allocated > map->m_len)
4528                 allocated = map->m_len;
4529         map->m_flags |= EXT4_MAP_NEW;
4530
4531         /*
4532          * Update reserved blocks/metadata blocks after successful
4533          * block allocation which had been deferred till now.
4534          */
4535         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4536                 unsigned int reserved_clusters;
4537                 /*
4538                  * Check how many clusters we had reserved for this allocated range
4539                  */
4540                 reserved_clusters = get_reserved_cluster_alloc(inode,
4541                                                 map->m_lblk, allocated);
4542                 if (!map_from_cluster) {
4543                         BUG_ON(allocated_clusters < reserved_clusters);
4544                         if (reserved_clusters < allocated_clusters) {
4545                                 struct ext4_inode_info *ei = EXT4_I(inode);
4546                                 int reservation = allocated_clusters -
4547                                                   reserved_clusters;
4548                                 /*
4549                                  * It seems we claimed a few clusters outside
4550                                  * the range of this allocation. We should give
4551                                  * them back to the reservation pool. This can
4552                                  * happen in the following case:
4553                                  *
4554                                  * * Suppose s_cluster_ratio is 4 (i.e., each
4555                                  *   cluster has 4 blocks). Thus, the clusters
4556                                  *   are [0-3],[4-7],[8-11]...
4557                                  * * First comes delayed allocation write for
4558                                  *   logical blocks 10 & 11. Since there were no
4559                                  *   previous delayed allocated blocks in the
4560                                  *   range [8-11], we would reserve 1 cluster
4561                                  *   for this write.
4562                                  * * Next comes write for logical blocks 3 to 8.
4563                                  *   In this case, we will reserve 2 clusters
4564                                  *   (for [0-3] and [4-7]); not for [8-11], as
4565                                  *   that range already has delayed allocated blocks.
4566                                  *   Thus total reserved clusters now becomes 3.
4567                                  * * Now, during the delayed allocation writeout
4568                                  *   time, we will first write blocks [3-8] and
4569                                  *   allocate 3 clusters for writing these
4570                                  *   blocks. Also, we would claim all these
4571                                  *   three clusters above.
4572                                  * * Now when we come here to writeout the
4573                                  *   blocks [10-11], we would expect to claim
4574                                  *   the reservation of 1 cluster we had made
4575                                  *   (and we would claim it since there are no
4576                                  *   more delayed allocated blocks in the range
4577                                  *   [8-11]). But our reserved cluster count had
4578                                  *   already gone to 0.
4579                                  *
4580                                  *   Thus, in the last step above, when we determine
4581                                  *   that there are still some unwritten delayed
4582                                  *   allocated blocks outside of our current
4583                                  *   block range, we should increment the
4584                                  *   reserved clusters count so that when the
4585                                  *   remaining blocks finally get written, we
4586                                  *   could claim them.
4587                                  */
4588                                 dquot_reserve_block(inode,
4589                                                 EXT4_C2B(sbi, reservation));
4590                                 spin_lock(&ei->i_block_reservation_lock);
4591                                 ei->i_reserved_data_blocks += reservation;
4592                                 spin_unlock(&ei->i_block_reservation_lock);
4593                         }
4594                         /*
4595                          * We will claim quota for all newly allocated blocks.
4596                          * We're updating the reserved space *after* the
4597                          * correction above so we do not accidentally free
4598                          * all the metadata reservation because we might
4599                          * actually need it later on.
4600                          */
4601                         ext4_da_update_reserve_space(inode, allocated_clusters,
4602                                                         1);
4603                 }
4604         }
4605
4606         /*
4607          * Cache the extent and update transaction to commit on fdatasync only
4608          * when it is _not_ an unwritten extent.
4609          */
4610         if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4611                 ext4_update_inode_fsync_trans(handle, inode, 1);
4612         else
4613                 ext4_update_inode_fsync_trans(handle, inode, 0);
4614 out:
4615         if (allocated > map->m_len)
4616                 allocated = map->m_len;
4617         ext4_ext_show_leaf(inode, path);
4618         map->m_flags |= EXT4_MAP_MAPPED;
4619         map->m_pblk = newblock;
4620         map->m_len = allocated;
4621 out2:
4622         ext4_ext_drop_refs(path);
4623         kfree(path);
4624
4625         trace_ext4_ext_map_blocks_exit(inode, flags, map,
4626                                        err ? err : allocated);
4627         return err ? err : allocated;
4628 }
4629
4630 void ext4_ext_truncate(handle_t *handle, struct inode *inode)
4631 {
4632         struct super_block *sb = inode->i_sb;
4633         ext4_lblk_t last_block;
4634         int err = 0;
4635
4636         /*
4637          * TODO: optimization is possible here.
4638          * Probably we need not scan at all,
4639          * because page truncation is enough.
4640          */
4641
4642         /* we have to know where to truncate from in crash case */
4643         EXT4_I(inode)->i_disksize = inode->i_size;
4644         ext4_mark_inode_dirty(handle, inode);
4645
4646         last_block = (inode->i_size + sb->s_blocksize - 1)
4647                         >> EXT4_BLOCK_SIZE_BITS(sb);
4648 retry:
4649         err = ext4_es_remove_extent(inode, last_block,
4650                                     EXT_MAX_BLOCKS - last_block);
4651         if (err == -ENOMEM) {
4652                 cond_resched();
4653                 congestion_wait(BLK_RW_ASYNC, HZ/50);
4654                 goto retry;
4655         }
4656         if (err) {
4657                 ext4_std_error(inode->i_sb, err);
4658                 return;
4659         }
4660         err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4661         ext4_std_error(inode->i_sb, err);
4662 }
4663
4664 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4665                                   ext4_lblk_t len, loff_t new_size,
4666                                   int flags, int mode)
4667 {
4668         struct inode *inode = file_inode(file);
4669         handle_t *handle;
4670         int ret = 0;
4671         int ret2 = 0;
4672         int retries = 0;
4673         int depth = 0;
4674         struct ext4_map_blocks map;
4675         unsigned int credits;
4676         loff_t epos;
4677
4678         map.m_lblk = offset;
4679         map.m_len = len;
4680         /*
4681          * Don't normalize the request if it can fit in one extent so
4682          * that it doesn't get unnecessarily split into multiple
4683          * extents.
4684          */
4685         if (len <= EXT_UNWRITTEN_MAX_LEN)
4686                 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4687
4688         /* Wait for all existing dio workers; newcomers will block on i_mutex */
4689         ext4_inode_block_unlocked_dio(inode);
4690         inode_dio_wait(inode);
4691
4692         /*
4693          * credits to insert 1 extent into extent tree
4694          */
4695         credits = ext4_chunk_trans_blocks(inode, len);
4696         /*
4697          * We can only call ext_depth() on extent-based inodes
4698          */
4699         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4700                 depth = ext_depth(inode);
4701         else
4702                 depth = -1;
4703
4704 retry:
4705         while (ret >= 0 && len) {
4706                 /*
4707                  * Recalculate credits when extent tree depth changes.
4708                  */
4709                 if (depth >= 0 && depth != ext_depth(inode)) {
4710                         credits = ext4_chunk_trans_blocks(inode, len);
4711                         depth = ext_depth(inode);
4712                 }
4713
4714                 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4715                                             credits);
4716                 if (IS_ERR(handle)) {
4717                         ret = PTR_ERR(handle);
4718                         break;
4719                 }
4720                 ret = ext4_map_blocks(handle, inode, &map, flags);
4721                 if (ret <= 0) {
4722                         ext4_debug("inode #%lu: block %u: len %u: "
4723                                    "ext4_ext_map_blocks returned %d",
4724                                    inode->i_ino, map.m_lblk,
4725                                    map.m_len, ret);
4726                         ext4_mark_inode_dirty(handle, inode);
4727                         ret2 = ext4_journal_stop(handle);
4728                         break;
4729                 }
4730                 map.m_lblk += ret;
4731                 map.m_len = len = len - ret;
4732                 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4733                 inode->i_ctime = ext4_current_time(inode);
4734                 if (new_size) {
4735                         if (epos > new_size)
4736                                 epos = new_size;
4737                         if (ext4_update_inode_size(inode, epos) & 0x1)
4738                                 inode->i_mtime = inode->i_ctime;
4739                 } else {
4740                         if (epos > inode->i_size)
4741                                 ext4_set_inode_flag(inode,
4742                                                     EXT4_INODE_EOFBLOCKS);
4743                 }
4744                 ext4_mark_inode_dirty(handle, inode);
4745                 ret2 = ext4_journal_stop(handle);
4746                 if (ret2)
4747                         break;
4748         }
4749         if (ret == -ENOSPC &&
4750                         ext4_should_retry_alloc(inode->i_sb, &retries)) {
4751                 ret = 0;
4752                 goto retry;
4753         }
4754
4755         ext4_inode_resume_unlocked_dio(inode);
4756
4757         return ret > 0 ? ret2 : ret;
4758 }
4759
4760 static long ext4_zero_range(struct file *file, loff_t offset,
4761                             loff_t len, int mode)
4762 {
4763         struct inode *inode = file_inode(file);
4764         handle_t *handle = NULL;
4765         unsigned int max_blocks;
4766         loff_t new_size = 0;
4767         int ret = 0;
4768         int flags;
4769         int credits;
4770         int partial_begin, partial_end;
4771         loff_t start, end;
4772         ext4_lblk_t lblk;
4773         struct address_space *mapping = inode->i_mapping;
4774         unsigned int blkbits = inode->i_blkbits;
4775
4776         trace_ext4_zero_range(inode, offset, len, mode);
4777
4778         if (!S_ISREG(inode->i_mode))
4779                 return -EINVAL;
4780
4781         /* Call ext4_force_commit to flush all data in case of data=journal. */
4782         if (ext4_should_journal_data(inode)) {
4783                 ret = ext4_force_commit(inode->i_sb);
4784                 if (ret)
4785                         return ret;
4786         }
4787
4788         /*
4789          * Write out all dirty pages to avoid race conditions.
4790          * Then release them.
4791          */
4792         if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4793                 ret = filemap_write_and_wait_range(mapping, offset,
4794                                                    offset + len - 1);
4795                 if (ret)
4796                         return ret;
4797         }
4798
4799         /*
4800          * Round up the offset. This is not fallocate; we need to zero out
4801          * blocks, so convert the interior block-aligned part of the range
4802          * to unwritten and possibly manually zero out the unaligned parts
4803          * of the range.
4804          */
4805         start = round_up(offset, 1 << blkbits);
4806         end = round_down((offset + len), 1 << blkbits);
4807
4808         if (start < offset || end > offset + len)
4809                 return -EINVAL;
4810         partial_begin = offset & ((1 << blkbits) - 1);
4811         partial_end = (offset + len) & ((1 << blkbits) - 1);
4812
4813         lblk = start >> blkbits;
4814         max_blocks = (end >> blkbits);
4815         if (max_blocks < lblk)
4816                 max_blocks = 0;
4817         else
4818                 max_blocks -= lblk;
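        /*
         * Worked example (a sketch assuming a 4096-byte block size):
         * offset = 3000 and len = 7000 give start = 4096, end = 8192,
         * partial_begin = 3000, partial_end = 1808, lblk = 1 and
         * max_blocks = 1.  Block 1 (bytes 4096..8191) is converted to an
         * unwritten extent below, while the partial edges 3000..4095 and
         * 8192..9999 are zeroed via ext4_zero_partial_blocks().
         */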
4819
4820         mutex_lock(&inode->i_mutex);
4821
4822         /*
4823          * Indirect files do not support unwritten extents
4824          */
4825         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4826                 ret = -EOPNOTSUPP;
4827                 goto out_mutex;
4828         }
4829
4830         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4831              offset + len > i_size_read(inode)) {
4832                 new_size = offset + len;
4833                 ret = inode_newsize_ok(inode, new_size);
4834                 if (ret)
4835                         goto out_mutex;
4836         }
4837
4838         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4839         if (mode & FALLOC_FL_KEEP_SIZE)
4840                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4841
4842         /* Preallocate the range including the unaligned edges */
4843         if (partial_begin || partial_end) {
4844                 ret = ext4_alloc_file_blocks(file,
4845                                 round_down(offset, 1 << blkbits) >> blkbits,
4846                                 (round_up((offset + len), 1 << blkbits) -
4847                                  round_down(offset, 1 << blkbits)) >> blkbits,
4848                                 new_size, flags, mode);
4849                 if (ret)
4850                         goto out_mutex;
4851
4852         }
4853
4854         /* Zero range excluding the unaligned edges */
4855         if (max_blocks > 0) {
4856                 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4857                           EXT4_EX_NOCACHE);
4858
4859                 /* Now release the pages and zero the block-aligned part of the pages */
4860                 truncate_pagecache_range(inode, start, end - 1);
4861                 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4862
4863                 /* Wait for all existing dio workers; newcomers will block on i_mutex */
4864                 ext4_inode_block_unlocked_dio(inode);
4865                 inode_dio_wait(inode);
4866
4867                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4868                                              flags, mode);
4869                 if (ret)
4870                         goto out_dio;
4871         }
4872         if (!partial_begin && !partial_end)
4873                 goto out_dio;
4874
4875         /*
4876          * In worst case we have to writeout two nonadjacent unwritten
4877          * In the worst case we have to write out two nonadjacent unwritten
4878          */
4879         credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4880         if (ext4_should_journal_data(inode))
4881                 credits += 2;
4882         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4883         if (IS_ERR(handle)) {
4884                 ret = PTR_ERR(handle);
4885                 ext4_std_error(inode->i_sb, ret);
4886                 goto out_dio;
4887         }
4888
4889         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4890         if (new_size) {
4891                 ext4_update_inode_size(inode, new_size);
4892         } else {
4893                 /*
4894                  * Mark that we allocated beyond EOF so a subsequent truncate
4895                  * can proceed even if the new size is the same as i_size.
4896                  */
4897                 if ((offset + len) > i_size_read(inode))
4898                         ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4899         }
4900         ext4_mark_inode_dirty(handle, inode);
4901
4902         /* Zero out partial block at the edges of the range */
4903         ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4904
4905         if (file->f_flags & O_SYNC)
4906                 ext4_handle_sync(handle);
4907
4908         ext4_journal_stop(handle);
4909 out_dio:
4910         ext4_inode_resume_unlocked_dio(inode);
4911 out_mutex:
4912         mutex_unlock(&inode->i_mutex);
4913         return ret;
4914 }
4915
4916 /*
4917  * Preallocate space for a file. This implements ext4's fallocate file
4918  * operation, which gets called from the sys_fallocate system call.
4919  * For block-mapped files, posix_fallocate should fall back to the method
4920  * of writing zeroes to the required new blocks (the same behavior that is
4921  * expected of file systems that do not support the fallocate() system call).
4922  */
4923 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4924 {
4925         struct inode *inode = file_inode(file);
4926         loff_t new_size = 0;
4927         unsigned int max_blocks;
4928         int ret = 0;
4929         int flags;
4930         ext4_lblk_t lblk;
4931         unsigned int blkbits = inode->i_blkbits;
4932
4933         /*
4934          * Encrypted inodes can't handle collapse range or insert
4935          * range since we would need to re-encrypt blocks with a
4936          * different IV or XTS tweak (which are based on the logical
4937          * block number).
4938          *
4939          * XXX It's not clear why zero range isn't working, but we'll
4940          * leave it disabled for encrypted inodes for now.  This is a
4941          * bug we should fix....
4942          */
4943         if (ext4_encrypted_inode(inode) &&
4944             (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
4945                      FALLOC_FL_ZERO_RANGE)))
4946                 return -EOPNOTSUPP;
4947
4948         /* Return error if mode is not supported */
4949         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4950                      FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4951                      FALLOC_FL_INSERT_RANGE))
4952                 return -EOPNOTSUPP;
4953
4954         if (mode & FALLOC_FL_PUNCH_HOLE)
4955                 return ext4_punch_hole(inode, offset, len);
4956
4957         ret = ext4_convert_inline_data(inode);
4958         if (ret)
4959                 return ret;
4960
4961         if (mode & FALLOC_FL_COLLAPSE_RANGE)
4962                 return ext4_collapse_range(inode, offset, len);
4963
4964         if (mode & FALLOC_FL_INSERT_RANGE)
4965                 return ext4_insert_range(inode, offset, len);
4966
4967         if (mode & FALLOC_FL_ZERO_RANGE)
4968                 return ext4_zero_range(file, offset, len, mode);
4969
4970         trace_ext4_fallocate_enter(inode, offset, len, mode);
4971         lblk = offset >> blkbits;
4972         /*
4973          * We can't just convert len to max_blocks: with blocksize = 4096,
4974          * offset = 3072 and len = 2048 the range still spans two blocks.
4975          */
4976         max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4977                 - lblk;
4978
4979         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4980         if (mode & FALLOC_FL_KEEP_SIZE)
4981                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4982
4983         mutex_lock(&inode->i_mutex);
4984
4985         /*
4986          * We only support preallocation for extent-based files
4987          */
4988         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4989                 ret = -EOPNOTSUPP;
4990                 goto out;
4991         }
4992
4993         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4994              offset + len > i_size_read(inode)) {
4995                 new_size = offset + len;
4996                 ret = inode_newsize_ok(inode, new_size);
4997                 if (ret)
4998                         goto out;
4999         }
5000
5001         ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
5002                                      flags, mode);
5003         if (ret)
5004                 goto out;
5005
5006         if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
5007                 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
5008                                                 EXT4_I(inode)->i_sync_tid);
5009         }
5010 out:
5011         mutex_unlock(&inode->i_mutex);
5012         trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
5013         return ret;
5014 }
5015
5016 /*
5017  * This function converts a range of blocks to written extents.
5018  * The caller passes the start offset and the size; all unwritten
5019  * extents within this range will be converted to
5020  * written extents.
5021  *
5022  * This function is called from the direct IO end-io callback
5023  * function, to convert the fallocated extents after IO is completed.
5024  * Returns 0 on success.
5025  */
5026 int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
5027                                    loff_t offset, ssize_t len)
5028 {
5029         unsigned int max_blocks;
5030         int ret = 0;
5031         int ret2 = 0;
5032         struct ext4_map_blocks map;
5033         unsigned int credits, blkbits = inode->i_blkbits;
5034
5035         map.m_lblk = offset >> blkbits;
5036         /*
5037          * We can't just convert len to max_blocks: with blocksize = 4096,
5038          * offset = 3072 and len = 2048 the range still spans two blocks.
5039          */
5040         max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
5041                       map.m_lblk);
5042         /*
5043          * This is somewhat ugly but the idea is clear: When transaction is
5044          * reserved, everything goes into it. Otherwise we'd rather start several
5045          * smaller transactions for the conversion of each extent separately.
5046          */
5047         if (handle) {
5048                 handle = ext4_journal_start_reserved(handle,
5049                                                      EXT4_HT_EXT_CONVERT);
5050                 if (IS_ERR(handle))
5051                         return PTR_ERR(handle);
5052                 credits = 0;
5053         } else {
5054                 /*
5055                  * credits to insert 1 extent into extent tree
5056                  */
5057                 credits = ext4_chunk_trans_blocks(inode, max_blocks);
5058         }
5059         while (ret >= 0 && ret < max_blocks) {
5060                 map.m_lblk += ret;
5061                 map.m_len = (max_blocks -= ret);
5062                 if (credits) {
5063                         handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
5064                                                     credits);
5065                         if (IS_ERR(handle)) {
5066                                 ret = PTR_ERR(handle);
5067                                 break;
5068                         }
5069                 }
5070                 ret = ext4_map_blocks(handle, inode, &map,
5071                                       EXT4_GET_BLOCKS_IO_CONVERT_EXT);
5072                 if (ret <= 0)
5073                         ext4_warning(inode->i_sb,
5074                                      "inode #%lu: block %u: len %u: "
5075                                      "ext4_ext_map_blocks returned %d",
5076                                      inode->i_ino, map.m_lblk,
5077                                      map.m_len, ret);
5078                 ext4_mark_inode_dirty(handle, inode);
5079                 if (credits)
5080                         ret2 = ext4_journal_stop(handle);
5081                 if (ret <= 0 || ret2)
5082                         break;
5083         }
5084         if (!credits)
5085                 ret2 = ext4_journal_stop(handle);
5086         return ret > 0 ? ret2 : ret;
5087 }
5088
5089 /*
5090  * If newes is not an existing extent (newes->es_pblk equals zero), find
5091  * the delayed extent at the start of newes, update newes accordingly, and
5092  * return the start of the next delayed extent.
5093  *
5094  * If newes is an existing extent (newes->es_pblk is non-zero), return
5095  * the start of the next delayed extent, or EXT_MAX_BLOCKS if no delayed
5096  * extent is found. Leave newes unmodified.
5097  */
5098 static int ext4_find_delayed_extent(struct inode *inode,
5099                                     struct extent_status *newes)
5100 {
5101         struct extent_status es;
5102         ext4_lblk_t block, next_del;
5103
5104         if (newes->es_pblk == 0) {
5105                 ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
5106                                 newes->es_lblk + newes->es_len - 1, &es);
5107
5108                 /*
5109                  * No extent in the extent tree contains block @newes->es_lblk,
5110                  * so the block may lie in either 1) a hole or 2) a delayed extent.
5111                  */
5112                 if (es.es_len == 0)
5113                         /* A hole found. */
5114                         return 0;
5115
5116                 if (es.es_lblk > newes->es_lblk) {
5117                         /* A hole found. */
5118                         newes->es_len = min(es.es_lblk - newes->es_lblk,
5119                                             newes->es_len);
5120                         return 0;
5121                 }
5122
5123                 newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5124         }
5125
5126         block = newes->es_lblk + newes->es_len;
5127         ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
5128         if (es.es_len == 0)
5129                 next_del = EXT_MAX_BLOCKS;
5130         else
5131                 next_del = es.es_lblk;
5132
5133         return next_del;
5134 }
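/*
 * Worked example (an illustrative sketch): suppose the caller passes
 * newes->es_lblk = 100, newes->es_len = 10 and newes->es_pblk = 0, and the
 * extent status tree holds a delayed extent starting at block 104 with
 * length 20.  Then es.es_lblk (104) > newes->es_lblk (100), so newes->es_len
 * is trimmed to 4 (the hole 100..103) and 0 is returned.  A later call
 * starting at block 104 would report the delayed extent itself and return
 * the start of the following delayed extent, or EXT_MAX_BLOCKS if there is
 * none.
 */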
5135 /* fiemap flags we can handle are specified here */
5136 #define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
5137
5138 static int ext4_xattr_fiemap(struct inode *inode,
5139                                 struct fiemap_extent_info *fieinfo)
5140 {
5141         __u64 physical = 0;
5142         __u64 length;
5143         __u32 flags = FIEMAP_EXTENT_LAST;
5144         int blockbits = inode->i_sb->s_blocksize_bits;
5145         int error = 0;
5146
5147         /* in-inode? */
5148         if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5149                 struct ext4_iloc iloc;
5150                 int offset;     /* offset of xattr in inode */
5151
5152                 error = ext4_get_inode_loc(inode, &iloc);
5153                 if (error)
5154                         return error;
5155                 physical = (__u64)iloc.bh->b_blocknr << blockbits;
5156                 offset = EXT4_GOOD_OLD_INODE_SIZE +
5157                                 EXT4_I(inode)->i_extra_isize;
5158                 physical += offset;
5159                 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
5160                 flags |= FIEMAP_EXTENT_DATA_INLINE;
5161                 brelse(iloc.bh);
5162         } else { /* external block */
5163                 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5164                 length = inode->i_sb->s_blocksize;
5165         }
5166
5167         if (physical)
5168                 error = fiemap_fill_next_extent(fieinfo, 0, physical,
5169                                                 length, flags);
5170         return (error < 0 ? error : 0);
5171 }
5172
5173 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5174                 __u64 start, __u64 len)
5175 {
5176         ext4_lblk_t start_blk;
5177         int error = 0;
5178
5179         if (ext4_has_inline_data(inode)) {
5180                 int has_inline = 1;
5181
5182                 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5183                                                 start, len);
5184
5185                 if (has_inline)
5186                         return error;
5187         }
5188
5189         if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5190                 error = ext4_ext_precache(inode);
5191                 if (error)
5192                         return error;
5193         }
5194
5195         /* fallback to generic here if not in extents fmt */
5196         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5197                 return generic_block_fiemap(inode, fieinfo, start, len,
5198                         ext4_get_block);
5199
5200         if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5201                 return -EBADR;
5202
5203         if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5204                 error = ext4_xattr_fiemap(inode, fieinfo);
5205         } else {
5206                 ext4_lblk_t len_blks;
5207                 __u64 last_blk;
5208
5209                 start_blk = start >> inode->i_sb->s_blocksize_bits;
5210                 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5211                 if (last_blk >= EXT_MAX_BLOCKS)
5212                         last_blk = EXT_MAX_BLOCKS-1;
5213                 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5214
5215                 /*
5216                  * Walk the extent tree gathering extent information
5217                  * and pushing extents back to the user.
5218                  */
5219                 error = ext4_fill_fiemap_extents(inode, start_blk,
5220                                                  len_blks, fieinfo);
5221         }
5222         return error;
5223 }
5224
5225 /*
5226  * ext4_access_path:
5227  * Function to access the path buffer for marking it dirty.
5228  * It also checks if there are sufficient credits left in the journal handle
5229  * to update path.
5230  */
5231 static int
5232 ext4_access_path(handle_t *handle, struct inode *inode,
5233                 struct ext4_ext_path *path)
5234 {
5235         int credits, err;
5236
5237         if (!ext4_handle_valid(handle))
5238                 return 0;
5239
5240         /*
5241          * Check if we need to extend the journal credits:
5242          * 3 for leaf, sb, and inode plus 2 (bmap and group
5243          * descriptor) for each block group; assume two block
5244          * groups, hence the check against 3 + 2 * 2 = 7 below.
5245          */
5246         if (handle->h_buffer_credits < 7) {
5247                 credits = ext4_writepage_trans_blocks(inode);
5248                 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5249                 /* -EAGAIN just means the handle was restarted; treat it as success */
5250                 if (err && err != -EAGAIN)
5251                         return err;
5252         }
5253
5254         err = ext4_ext_get_access(handle, inode, path);
5255         return err;
5256 }
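/*
 * Illustrative note (not from the original source): the shift helpers below
 * call ext4_access_path() once per tree level before touching that level,
 * roughly
 *
 *	err = ext4_access_path(handle, inode, path + depth);
 *	if (err)
 *		goto out;
 *	...modify the extents or indexes at path[depth]...
 *	err = ext4_ext_dirty(handle, inode, path + depth);
 *
 * so journal credits are topped up and write access to the buffer is taken
 * before any on-disk entry is changed.
 */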
5257
5258 /*
5259  * ext4_ext_shift_path_extents:
5260  * Shift the extents of a path structure lying between path[depth].p_ext
5261  * and EXT_LAST_EXTENT(path[depth].p_hdr) by @shift blocks. @SHIFT tells
5262  * whether this is a left shift or a right shift operation.
5263  */
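/*
 * Worked example (illustrative numbers, not from the original source): with
 * @shift == 2 and @SHIFT == SHIFT_LEFT, a leaf extent covering logical
 * blocks 10..14 is rewritten to cover 8..12.  If the first extent in the
 * leaf was moved, the index entries above it are updated as well (tracked
 * by the 'update' flag below).
 */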
5264 static int
5265 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5266                             struct inode *inode, handle_t *handle,
5267                             enum SHIFT_DIRECTION SHIFT)
5268 {
5269         int depth, err = 0;
5270         struct ext4_extent *ex_start, *ex_last;
5271         bool update = false;
5272         depth = path->p_depth;
5273
5274         while (depth >= 0) {
5275                 if (depth == path->p_depth) {
5276                         ex_start = path[depth].p_ext;
5277                         if (!ex_start)
5278                                 return -EFSCORRUPTED;
5279
5280                         ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5281
5282                         err = ext4_access_path(handle, inode, path + depth);
5283                         if (err)
5284                                 goto out;
5285
5286                         if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5287                                 update = 1;
5288
5289                         while (ex_start <= ex_last) {
5290                                 if (SHIFT == SHIFT_LEFT) {
5291                                         le32_add_cpu(&ex_start->ee_block,
5292                                                 -shift);
5293                                         /* Try to merge to the left. */
5294                                         if ((ex_start >
5295                                             EXT_FIRST_EXTENT(path[depth].p_hdr))
5296                                             &&
5297                                             ext4_ext_try_to_merge_right(inode,
5298                                             path, ex_start - 1))
5299                                                 ex_last--;
5300                                         else
5301                                                 ex_start++;
5302                                 } else {
5303                                         le32_add_cpu(&ex_last->ee_block, shift);
5304                                         ext4_ext_try_to_merge_right(inode, path,
5305                                                 ex_last);
5306                                         ex_last--;
5307                                 }
5308                         }
5309                         err = ext4_ext_dirty(handle, inode, path + depth);
5310                         if (err)
5311                                 goto out;
5312
5313                         if (--depth < 0 || !update)
5314                                 break;
5315                 }
5316
5317                 /* Update index too */
5318                 err = ext4_access_path(handle, inode, path + depth);
5319                 if (err)
5320                         goto out;
5321
5322                 if (SHIFT == SHIFT_LEFT)
5323                         le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5324                 else
5325                         le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5326                 err = ext4_ext_dirty(handle, inode, path + depth);
5327                 if (err)
5328                         goto out;
5329
5330                 /* we are done if the current index is not the first one in its node */
5331                 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5332                         break;
5333
5334                 depth--;
5335         }
5336
5337 out:
5338         return err;
5339 }
5340
5341 /*
5342  * ext4_ext_shift_extents:
5343  * All the extents which lie in the range from @start to the last allocated
5344  * block for the @inode are shifted either to the left or to the right
5345  * (depending upon @SHIFT) by @shift blocks.
5346  * On success, 0 is returned; otherwise an error code is returned.
5347  */
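/*
 * Illustrative example (hypothetical numbers, for clarity only): collapsing
 * a two-block range that starts at logical block 8 ends up calling
 *
 *	ext4_ext_shift_extents(inode, handle, 10, 2, SHIFT_LEFT);
 *
 * so every extent at or beyond block 10 has its ee_block reduced by 2,
 * closing the hole that ext4_ext_remove_space() punched out.
 */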
5348 static int
5349 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5350                        ext4_lblk_t start, ext4_lblk_t shift,
5351                        enum SHIFT_DIRECTION SHIFT)
5352 {
5353         struct ext4_ext_path *path;
5354         int ret = 0, depth;
5355         struct ext4_extent *extent;
5356         ext4_lblk_t stop, *iterator, ex_start, ex_end;
5357
5358         /* Let path point to the last extent */
5359         path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5360         if (IS_ERR(path))
5361                 return PTR_ERR(path);
5362
5363         depth = path->p_depth;
5364         extent = path[depth].p_ext;
5365         if (!extent)
5366                 goto out;
5367
5368         stop = le32_to_cpu(extent->ee_block) +
5369                         ext4_ext_get_actual_len(extent);
5370
5371         /*
5372          * In case of a left shift, don't start shifting extents until we
5373          * make sure the hole is big enough to accommodate the shift.
5374          */
5375         if (SHIFT == SHIFT_LEFT) {
5376                 path = ext4_find_extent(inode, start - 1, &path, 0);
5377                 if (IS_ERR(path))
5378                         return PTR_ERR(path);
5379                 depth = path->p_depth;
5380                 extent =  path[depth].p_ext;
5381                 if (extent) {
5382                         ex_start = le32_to_cpu(extent->ee_block);
5383                         ex_end = le32_to_cpu(extent->ee_block) +
5384                                 ext4_ext_get_actual_len(extent);
5385                 } else {
5386                         ex_start = 0;
5387                         ex_end = 0;
5388                 }
5389
5390                 if ((start == ex_start && shift > ex_start) ||
5391                     (shift > start - ex_end)) {
5392                         ext4_ext_drop_refs(path);
5393                         kfree(path);
5394                         return -EINVAL;
5395                 }
5396         }
5397
5398         /*
5399          * In case of left shift, iterator points to start and it is increased
5400          * till we reach stop. In case of right shift, iterator points to stop
5401          * and it is decreased till we reach start.
5402          */
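        /*
         * Example (illustrative only): with start == 10 and stop == 50, a
         * left shift walks *iterator from 10 up towards 50 and shifts each
         * leaf in place; a right shift walks *iterator from 50 back down
         * towards 10, so the rightmost extents move first and never collide
         * with their neighbours.
         */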
5403         if (SHIFT == SHIFT_LEFT)
5404                 iterator = &start;
5405         else
5406                 iterator = &stop;
5407
5408         /* It's safe to start updating extents */
5409         while (start < stop) {
5410                 path = ext4_find_extent(inode, *iterator, &path, 0);
5411                 if (IS_ERR(path))
5412                         return PTR_ERR(path);
5413                 depth = path->p_depth;
5414                 extent = path[depth].p_ext;
5415                 if (!extent) {
5416                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5417                                          (unsigned long) *iterator);
5418                         ret = -EFSCORRUPTED;
                             break;  /* exit the loop so that @path is freed at out: */
5419                 }
5420                 if (SHIFT == SHIFT_LEFT && *iterator >
5421                     le32_to_cpu(extent->ee_block)) {
5422                         /* Hole, move to the next extent */
5423                         if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5424                                 path[depth].p_ext++;
5425                         } else {
5426                                 *iterator = ext4_ext_next_allocated_block(path);
5427                                 continue;
5428                         }
5429                 }
5430
5431                 if (SHIFT == SHIFT_LEFT) {
5432                         extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5433                         *iterator = le32_to_cpu(extent->ee_block) +
5434                                         ext4_ext_get_actual_len(extent);
5435                 } else {
5436                         extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5437                         *iterator =  le32_to_cpu(extent->ee_block) > 0 ?
5438                                 le32_to_cpu(extent->ee_block) - 1 : 0;
5439                         /* Update path extent in case we need to stop */
5440                         while (le32_to_cpu(extent->ee_block) < start)
5441                                 extent++;
5442                         path[depth].p_ext = extent;
5443                 }
5444                 ret = ext4_ext_shift_path_extents(path, shift, inode,
5445                                 handle, SHIFT);
5446                 if (ret)
5447                         break;
5448         }
5449 out:
5450         ext4_ext_drop_refs(path);
5451         kfree(path);
5452         return ret;
5453 }
5454
5455 /*
5456  * ext4_collapse_range:
5457  * This implements the collapse-range functionality of fallocate for ext4.
5458  * Returns 0 on success and a negative error code on failure.
5459  */
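/*
 * Hedged userspace sketch (illustrative only): this path is reached via
 * fallocate(2) with FALLOC_FL_COLLAPSE_RANGE, roughly
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	// drop the byte range [offset, offset + len) and shift the rest of
 *	// the file left; offset and len must be multiples of the filesystem
 *	// block size and the range must end before EOF, as checked below
 *	int err = fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, offset, len);
 */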
5460 int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5461 {
5462         struct super_block *sb = inode->i_sb;
5463         ext4_lblk_t punch_start, punch_stop;
5464         handle_t *handle;
5465         unsigned int credits;
5466         loff_t new_size, ioffset;
5467         int ret;
5468
5469         /*
5470          * We need to test this early because xfstests assumes that a
5471          * collapse range of (0, 1) will return EOPNOTSUPP if the file
5472          * system does not support collapse range.
5473          */
5474         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5475                 return -EOPNOTSUPP;
5476
5477         /* Collapse range works only on fs block size aligned offsets. */
5478         if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5479             len & (EXT4_CLUSTER_SIZE(sb) - 1))
5480                 return -EINVAL;
5481
5482         if (!S_ISREG(inode->i_mode))
5483                 return -EINVAL;
5484
5485         trace_ext4_collapse_range(inode, offset, len);
5486
5487         punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5488         punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5489
5490         /* Call ext4_force_commit to flush all data in case of data=journal. */
5491         if (ext4_should_journal_data(inode)) {
5492                 ret = ext4_force_commit(inode->i_sb);
5493                 if (ret)
5494                         return ret;
5495         }
5496
5497         /*
5498          * Need to round down offset to be aligned with page size boundary
5499          * for page size > block size.
5500          */
5501         ioffset = round_down(offset, PAGE_SIZE);
5502
5503         /* Write out all dirty pages */
5504         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5505                                            LLONG_MAX);
5506         if (ret)
5507                 return ret;
5508
5509         /* Take mutex lock */
5510         mutex_lock(&inode->i_mutex);
5511
5512         /*
5513          * The collapse range must not reach or extend past EOF; collapsing
5514          * up to EOF would effectively be a truncate operation, so reject it.
5515          */
5516         if (offset + len >= i_size_read(inode)) {
5517                 ret = -EINVAL;
5518                 goto out_mutex;
5519         }
5520
5521         /* Currently just for extent based files */
5522         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5523                 ret = -EOPNOTSUPP;
5524                 goto out_mutex;
5525         }
5526
5527         truncate_pagecache(inode, ioffset);
5528
5529         /* Wait for existing dio to complete */
5530         ext4_inode_block_unlocked_dio(inode);
5531         inode_dio_wait(inode);
5532
5533         credits = ext4_writepage_trans_blocks(inode);
5534         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5535         if (IS_ERR(handle)) {
5536                 ret = PTR_ERR(handle);
5537                 goto out_dio;
5538         }
5539
5540         down_write(&EXT4_I(inode)->i_data_sem);
5541         ext4_discard_preallocations(inode);
5542
5543         ret = ext4_es_remove_extent(inode, punch_start,
5544                                     EXT_MAX_BLOCKS - punch_start);
5545         if (ret) {
5546                 up_write(&EXT4_I(inode)->i_data_sem);
5547                 goto out_stop;
5548         }
5549
5550         ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5551         if (ret) {
5552                 up_write(&EXT4_I(inode)->i_data_sem);
5553                 goto out_stop;
5554         }
5555         ext4_discard_preallocations(inode);
5556
5557         ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5558                                      punch_stop - punch_start, SHIFT_LEFT);
5559         if (ret) {
5560                 up_write(&EXT4_I(inode)->i_data_sem);
5561                 goto out_stop;
5562         }
5563
5564         new_size = i_size_read(inode) - len;
5565         i_size_write(inode, new_size);
5566         EXT4_I(inode)->i_disksize = new_size;
5567
5568         up_write(&EXT4_I(inode)->i_data_sem);
5569         if (IS_SYNC(inode))
5570                 ext4_handle_sync(handle);
5571         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5572         ext4_mark_inode_dirty(handle, inode);
5573
5574 out_stop:
5575         ext4_journal_stop(handle);
5576 out_dio:
5577         ext4_inode_resume_unlocked_dio(inode);
5578 out_mutex:
5579         mutex_unlock(&inode->i_mutex);
5580         return ret;
5581 }
5582
5583 /*
5584  * ext4_insert_range:
5585  * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5586  * The data blocks starting from @offset to the EOF are shifted by @len
5587  * towards right to create a hole in the @inode. Inode size is increased
5588  * by len bytes.
5589  * Returns 0 on success, error otherwise.
5590  */
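/*
 * Hedged userspace sketch (illustrative only): the counterpart of the
 * collapse example above, reached via fallocate(2) with
 * FALLOC_FL_INSERT_RANGE:
 *
 *	// insert a block-aligned hole of len bytes at offset, shifting the
 *	// existing data to the right and growing i_size by len; offset must
 *	// lie below EOF, as checked below
 *	int err = fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len);
 */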
5591 int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5592 {
5593         struct super_block *sb = inode->i_sb;
5594         handle_t *handle;
5595         struct ext4_ext_path *path;
5596         struct ext4_extent *extent;
5597         ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5598         unsigned int credits, ee_len;
5599         int ret = 0, depth, split_flag = 0;
5600         loff_t ioffset;
5601
5602         /*
5603          * We need to test this early because xfstests assumes that an
5604          * insert range of (0, 1) will return EOPNOTSUPP if the file
5605          * system does not support insert range.
5606          */
5607         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5608                 return -EOPNOTSUPP;
5609
5610         /* Insert range works only on fs block size aligned offsets. */
5611         if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5612                         len & (EXT4_CLUSTER_SIZE(sb) - 1))
5613                 return -EINVAL;
5614
5615         if (!S_ISREG(inode->i_mode))
5616                 return -EOPNOTSUPP;
5617
5618         trace_ext4_insert_range(inode, offset, len);
5619
5620         offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5621         len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5622
5623         /* Call ext4_force_commit to flush all data in case of data=journal */
5624         if (ext4_should_journal_data(inode)) {
5625                 ret = ext4_force_commit(inode->i_sb);
5626                 if (ret)
5627                         return ret;
5628         }
5629
5630         /*
5631          * Need to round down to align start offset to page size boundary
5632          * for page size > block size.
5633          */
5634         ioffset = round_down(offset, PAGE_SIZE);
5635
5636         /* Write out all dirty pages */
5637         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5638                         LLONG_MAX);
5639         if (ret)
5640                 return ret;
5641
5642         /* Take mutex lock */
5643         mutex_lock(&inode->i_mutex);
5644
5645         /* Currently just for extent based files */
5646         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5647                 ret = -EOPNOTSUPP;
5648                 goto out_mutex;
5649         }
5650
5651         /* Check that the expanded file size does not exceed the maximum file size */
5652         if (inode->i_size + len > inode->i_sb->s_maxbytes) {
5653                 ret = -EFBIG;
5654                 goto out_mutex;
5655         }
5656
5657         /* Offset should be less than i_size */
5658         if (offset >= i_size_read(inode)) {
5659                 ret = -EINVAL;
5660                 goto out_mutex;
5661         }
5662
5663         truncate_pagecache(inode, ioffset);
5664
5665         /* Wait for existing dio to complete */
5666         ext4_inode_block_unlocked_dio(inode);
5667         inode_dio_wait(inode);
5668
5669         credits = ext4_writepage_trans_blocks(inode);
5670         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5671         if (IS_ERR(handle)) {
5672                 ret = PTR_ERR(handle);
5673                 goto out_dio;
5674         }
5675
5676         /* Expand the file first to avoid data loss if there is an error while shifting */
5677         inode->i_size += len;
5678         EXT4_I(inode)->i_disksize += len;
5679         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5680         ret = ext4_mark_inode_dirty(handle, inode);
5681         if (ret)
5682                 goto out_stop;
5683
5684         down_write(&EXT4_I(inode)->i_data_sem);
5685         ext4_discard_preallocations(inode);
5686
5687         path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5688         if (IS_ERR(path)) {
5689                 up_write(&EXT4_I(inode)->i_data_sem);
                     ret = PTR_ERR(path);    /* don't report success when the lookup failed */
5690                 goto out_stop;
5691         }
5692
5693         depth = ext_depth(inode);
5694         extent = path[depth].p_ext;
5695         if (extent) {
5696                 ee_start_lblk = le32_to_cpu(extent->ee_block);
5697                 ee_len = ext4_ext_get_actual_len(extent);
5698
5699                 /*
5700                  * If offset_lblk is not the starting block of the extent, split
5701                  * the extent at @offset_lblk
5702                  */
5703                 if ((offset_lblk > ee_start_lblk) &&
5704                                 (offset_lblk < (ee_start_lblk + ee_len))) {
5705                         if (ext4_ext_is_unwritten(extent))
5706                                 split_flag = EXT4_EXT_MARK_UNWRIT1 |
5707                                         EXT4_EXT_MARK_UNWRIT2;
5708                         ret = ext4_split_extent_at(handle, inode, &path,
5709                                         offset_lblk, split_flag,
5710                                         EXT4_EX_NOCACHE |
5711                                         EXT4_GET_BLOCKS_PRE_IO |
5712                                         EXT4_GET_BLOCKS_METADATA_NOFAIL);
5713                 }
5714
5715                 ext4_ext_drop_refs(path);
5716                 kfree(path);
5717                 if (ret < 0) {
5718                         up_write(&EXT4_I(inode)->i_data_sem);
5719                         goto out_stop;
5720                 }
5721         }
5722
5723         ret = ext4_es_remove_extent(inode, offset_lblk,
5724                         EXT_MAX_BLOCKS - offset_lblk);
5725         if (ret) {
5726                 up_write(&EXT4_I(inode)->i_data_sem);
5727                 goto out_stop;
5728         }
5729
5730         /*
5731          * If offset_lblk lies in a hole at the start of the file, use
5732          * ee_start_lblk to shift extents
5733          */
5734         ret = ext4_ext_shift_extents(inode, handle,
5735                 ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5736                 len_lblk, SHIFT_RIGHT);
5737
5738         up_write(&EXT4_I(inode)->i_data_sem);
5739         if (IS_SYNC(inode))
5740                 ext4_handle_sync(handle);
5741
5742 out_stop:
5743         ext4_journal_stop(handle);
5744 out_dio:
5745         ext4_inode_resume_unlocked_dio(inode);
5746 out_mutex:
5747         mutex_unlock(&inode->i_mutex);
5748         return ret;
5749 }
5750
5751 /**
5752  * ext4_swap_extents - Swap extents between two inodes
5753  *
5754  * @inode1:     First inode
5755  * @inode2:     Second inode
5756  * @lblk1:      Start block for first inode
5757  * @lblk2:      Start block for second inode
5758  * @count:      Number of blocks to swap
5759  * @unwritten:  Mark second inode's extents as unwritten after swap
5760  * @erp:        Pointer to save error value
5761  *
5762  * This helper routine does exactly what its name promises: it swaps extents.
5763  * All other work, such as page-cache locking consistency, bh mapping
5764  * consistency or copying the extents' data, must be performed by the caller.
5765  * Locking:
5766  *              i_mutex is held for both inodes
5767  *              i_data_sem is locked for write for both inodes
5768  * Assumptions:
5769  *              All pages from requested range are locked for both inodes
5770  */
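/*
 * Illustrative sketch (assumption, not taken from this file): the expected
 * caller is the online-defrag path in fs/ext4/move_extent.c, which, with
 * both inodes locked as described above, does roughly
 *
 *	int err = 0;
 *	int swapped = ext4_swap_extents(handle, orig_inode, donor_inode,
 *					orig_blk, donor_blk, count,
 *					1, &err);
 *
 * The return value is the number of blocks actually swapped and *err holds
 * any error hit part-way through; passing a non-zero @unwritten marks the
 * second (donor) inode's extents unwritten after the swap.
 */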
5771 int
5772 ext4_swap_extents(handle_t *handle, struct inode *inode1,
5773                      struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5774                   ext4_lblk_t count, int unwritten, int *erp)
5775 {
5776         struct ext4_ext_path *path1 = NULL;
5777         struct ext4_ext_path *path2 = NULL;
5778         int replaced_count = 0;
5779
5780         BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5781         BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5782         BUG_ON(!mutex_is_locked(&inode1->i_mutex));
5783         BUG_ON(!mutex_is_locked(&inode2->i_mutex));
5784
5785         *erp = ext4_es_remove_extent(inode1, lblk1, count);
5786         if (unlikely(*erp))
5787                 return 0;
5788         *erp = ext4_es_remove_extent(inode2, lblk2, count);
5789         if (unlikely(*erp))
5790                 return 0;
5791
5792         while (count) {
5793                 struct ext4_extent *ex1, *ex2, tmp_ex;
5794                 ext4_lblk_t e1_blk, e2_blk;
5795                 int e1_len, e2_len, len;
5796                 int split = 0;
5797
5798                 path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5799                 if (IS_ERR(path1)) {
5800                         *erp = PTR_ERR(path1);
5801                         path1 = NULL;
5802                 finish:
5803                         count = 0;
5804                         goto repeat;
5805                 }
5806                 path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5807                 if (IS_ERR(path2)) {
5808                         *erp = PTR_ERR(path2);
5809                         path2 = NULL;
5810                         goto finish;
5811                 }
5812                 ex1 = path1[path1->p_depth].p_ext;
5813                 ex2 = path2[path2->p_depth].p_ext;
5814                 /* Do we have something to swap? */
5815                 if (unlikely(!ex2 || !ex1))
5816                         goto finish;
5817
5818                 e1_blk = le32_to_cpu(ex1->ee_block);
5819                 e2_blk = le32_to_cpu(ex2->ee_block);
5820                 e1_len = ext4_ext_get_actual_len(ex1);
5821                 e2_len = ext4_ext_get_actual_len(ex2);
5822
5823                 /* Hole handling */
5824                 if (!in_range(lblk1, e1_blk, e1_len) ||
5825                     !in_range(lblk2, e2_blk, e2_len)) {
5826                         ext4_lblk_t next1, next2;
5827
5828                         /* if hole after extent, then go to next extent */
5829                         next1 = ext4_ext_next_allocated_block(path1);
5830                         next2 = ext4_ext_next_allocated_block(path2);
5831                         /* If hole before extent, then shift to that extent */
5832                         if (e1_blk > lblk1)
5833                                 next1 = e1_blk;
5834                         if (e2_blk > lblk2)
5835                                 next2 = e2_blk;
5836                         /* Do we have something to swap */
5837                         if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5838                                 goto finish;
5839                         /* Move to the rightmost boundary */
5840                         len = next1 - lblk1;
5841                         if (len < next2 - lblk2)
5842                                 len = next2 - lblk2;
5843                         if (len > count)
5844                                 len = count;
5845                         lblk1 += len;
5846                         lblk2 += len;
5847                         count -= len;
5848                         goto repeat;
5849                 }
5850
5851                 /* Prepare left boundary */
5852                 if (e1_blk < lblk1) {
5853                         split = 1;
5854                         *erp = ext4_force_split_extent_at(handle, inode1,
5855                                                 &path1, lblk1, 0);
5856                         if (unlikely(*erp))
5857                                 goto finish;
5858                 }
5859                 if (e2_blk < lblk2) {
5860                         split = 1;
5861                         *erp = ext4_force_split_extent_at(handle, inode2,
5862                                                 &path2,  lblk2, 0);
5863                         if (unlikely(*erp))
5864                                 goto finish;
5865                 }
5866                 /* ext4_split_extent_at() may result in leaf extent split,
5867                  * path must be revalidated. */
5868                 if (split)
5869                         goto repeat;
5870
5871                 /* Prepare right boundary */
5872                 len = count;
5873                 if (len > e1_blk + e1_len - lblk1)
5874                         len = e1_blk + e1_len - lblk1;
5875                 if (len > e2_blk + e2_len - lblk2)
5876                         len = e2_blk + e2_len - lblk2;
5877
5878                 if (len != e1_len) {
5879                         split = 1;
5880                         *erp = ext4_force_split_extent_at(handle, inode1,
5881                                                 &path1, lblk1 + len, 0);
5882                         if (unlikely(*erp))
5883                                 goto finish;
5884                 }
5885                 if (len != e2_len) {
5886                         split = 1;
5887                         *erp = ext4_force_split_extent_at(handle, inode2,
5888                                                 &path2, lblk2 + len, 0);
5889                         if (*erp)
5890                                 goto finish;
5891                 }
5892                 /* ext4_split_extent_at() may result in leaf extent split,
5893                  * path must be revalidated. */
5894                 if (split)
5895                         goto repeat;
5896
5897                 BUG_ON(e2_len != e1_len);
5898                 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5899                 if (unlikely(*erp))
5900                         goto finish;
5901                 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5902                 if (unlikely(*erp))
5903                         goto finish;
5904
5905                 /* Both extents are fully inside boundaries. Swap them now. */
5906                 tmp_ex = *ex1;
5907                 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5908                 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5909                 ex1->ee_len = cpu_to_le16(e2_len);
5910                 ex2->ee_len = cpu_to_le16(e1_len);
5911                 if (unwritten)
5912                         ext4_ext_mark_unwritten(ex2);
5913                 if (ext4_ext_is_unwritten(&tmp_ex))
5914                         ext4_ext_mark_unwritten(ex1);
5915
5916                 ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5917                 ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5918                 *erp = ext4_ext_dirty(handle, inode2, path2 +
5919                                       path2->p_depth);
5920                 if (unlikely(*erp))
5921                         goto finish;
5922                 *erp = ext4_ext_dirty(handle, inode1, path1 +
5923                                       path1->p_depth);
5924                 /*
5925                  * This looks scary: the second inode already points to the new
5926                  * blocks, and it was successfully dirtied. Fortunately an error
5927                  * here can only be a journal error, so the full transaction will
5928                  * be aborted anyway.
5929                  */
5930                 if (unlikely(*erp))
5931                         goto finish;
5932                 lblk1 += len;
5933                 lblk2 += len;
5934                 replaced_count += len;
5935                 count -= len;
5936
5937         repeat:
5938                 ext4_ext_drop_refs(path1);
5939                 kfree(path1);
5940                 ext4_ext_drop_refs(path2);
5941                 kfree(path2);
5942                 path1 = path2 = NULL;
5943         }
5944         return replaced_count;
5945 }