]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - fs/ext4/inode.c
7a1d2e590cb410d05a9e6b082db6711f0a46cdd7
[karo-tx-linux.git] / fs / ext4 / inode.c
1 /*
2  *  linux/fs/ext4/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *      (sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *      (jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
23  */
24
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/jbd2.h>
29 #include <linux/highuid.h>
30 #include <linux/pagemap.h>
31 #include <linux/quotaops.h>
32 #include <linux/string.h>
33 #include <linux/buffer_head.h>
34 #include <linux/writeback.h>
35 #include <linux/pagevec.h>
36 #include <linux/mpage.h>
37 #include <linux/uio.h>
38 #include <linux/bio.h>
39 #include "ext4_jbd2.h"
40 #include "xattr.h"
41 #include "acl.h"
42 #include "ext4_extents.h"
43
44 #define MPAGE_DA_EXTENT_TAIL 0x01
45
46 static inline int ext4_begin_ordered_truncate(struct inode *inode,
47                                               loff_t new_size)
48 {
49         return jbd2_journal_begin_ordered_truncate(
50                                         EXT4_SB(inode->i_sb)->s_journal,
51                                         &EXT4_I(inode)->jinode,
52                                         new_size);
53 }
54
55 static void ext4_invalidatepage(struct page *page, unsigned long offset);
56
57 /*
58  * Test whether an inode is a fast symlink.
59  */
60 static int ext4_inode_is_fast_symlink(struct inode *inode)
61 {
62         int ea_blocks = EXT4_I(inode)->i_file_acl ?
63                 (inode->i_sb->s_blocksize >> 9) : 0;
64
65         return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
66 }
67
68 /*
69  * The ext4 forget function must perform a revoke if we are freeing data
70  * which has been journaled.  Metadata (eg. indirect blocks) must be
71  * revoked in all cases.
72  *
73  * "bh" may be NULL: a metadata block may have been freed from memory
74  * but there may still be a record of it in the journal, and that record
75  * still needs to be revoked.
76  */
77 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
78                         struct buffer_head *bh, ext4_fsblk_t blocknr)
79 {
80         int err;
81
82         might_sleep();
83
84         BUFFER_TRACE(bh, "enter");
85
86         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
87                   "data mode %lx\n",
88                   bh, is_metadata, inode->i_mode,
89                   test_opt(inode->i_sb, DATA_FLAGS));
90
91         /* Never use the revoke function if we are doing full data
92          * journaling: there is no need to, and a V1 superblock won't
93          * support it.  Otherwise, only skip the revoke on un-journaled
94          * data blocks. */
95
96         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
97             (!is_metadata && !ext4_should_journal_data(inode))) {
98                 if (bh) {
99                         BUFFER_TRACE(bh, "call jbd2_journal_forget");
100                         return ext4_journal_forget(handle, bh);
101                 }
102                 return 0;
103         }
104
105         /*
106          * data!=journal && (is_metadata || should_journal_data(inode))
107          */
108         BUFFER_TRACE(bh, "call ext4_journal_revoke");
109         err = ext4_journal_revoke(handle, blocknr, bh);
110         if (err)
111                 ext4_abort(inode->i_sb, __func__,
112                            "error %d when attempting revoke", err);
113         BUFFER_TRACE(bh, "exit");
114         return err;
115 }
116
117 /*
118  * Work out how many blocks we need to proceed with the next chunk of a
119  * truncate transaction.
120  */
121 static unsigned long blocks_for_truncate(struct inode *inode)
122 {
123         ext4_lblk_t needed;
124
125         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
126
127         /* Give ourselves just enough room to cope with inodes in which
128          * i_blocks is corrupt: we've seen disk corruptions in the past
129          * which resulted in random data in an inode which looked enough
130          * like a regular file for ext4 to try to delete it.  Things
131          * will go a bit crazy if that happens, but at least we should
132          * try not to panic the whole kernel. */
133         if (needed < 2)
134                 needed = 2;
135
136         /* But we need to bound the transaction so we don't overflow the
137          * journal. */
138         if (needed > EXT4_MAX_TRANS_DATA)
139                 needed = EXT4_MAX_TRANS_DATA;
140
141         return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
142 }
143
144 /*
145  * Truncate transactions can be complex and absolutely huge.  So we need to
146  * be able to restart the transaction at a conventient checkpoint to make
147  * sure we don't overflow the journal.
148  *
149  * start_transaction gets us a new handle for a truncate transaction,
150  * and extend_transaction tries to extend the existing one a bit.  If
151  * extend fails, we need to propagate the failure up and restart the
152  * transaction in the top-level truncate loop. --sct
153  */
154 static handle_t *start_transaction(struct inode *inode)
155 {
156         handle_t *result;
157
158         result = ext4_journal_start(inode, blocks_for_truncate(inode));
159         if (!IS_ERR(result))
160                 return result;
161
162         ext4_std_error(inode->i_sb, PTR_ERR(result));
163         return result;
164 }
165
166 /*
167  * Try to extend this transaction for the purposes of truncation.
168  *
169  * Returns 0 if we managed to create more room.  If we can't create more
170  * room, and the transaction must be restarted we return 1.
171  */
172 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
173 {
174         if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
175                 return 0;
176         if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
177                 return 0;
178         return 1;
179 }
180
181 /*
182  * Restart the transaction associated with *handle.  This does a commit,
183  * so before we call here everything must be consistently dirtied against
184  * this transaction.
185  */
186 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
187 {
188         jbd_debug(2, "restarting handle %p\n", handle);
189         return ext4_journal_restart(handle, blocks_for_truncate(inode));
190 }
191
192 /*
193  * Called at the last iput() if i_nlink is zero.
194  */
195 void ext4_delete_inode (struct inode * inode)
196 {
197         handle_t *handle;
198         int err;
199
200         if (ext4_should_order_data(inode))
201                 ext4_begin_ordered_truncate(inode, 0);
202         truncate_inode_pages(&inode->i_data, 0);
203
204         if (is_bad_inode(inode))
205                 goto no_delete;
206
207         handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
208         if (IS_ERR(handle)) {
209                 ext4_std_error(inode->i_sb, PTR_ERR(handle));
210                 /*
211                  * If we're going to skip the normal cleanup, we still need to
212                  * make sure that the in-core orphan linked list is properly
213                  * cleaned up.
214                  */
215                 ext4_orphan_del(NULL, inode);
216                 goto no_delete;
217         }
218
219         if (IS_SYNC(inode))
220                 handle->h_sync = 1;
221         inode->i_size = 0;
222         err = ext4_mark_inode_dirty(handle, inode);
223         if (err) {
224                 ext4_warning(inode->i_sb, __func__,
225                              "couldn't mark inode dirty (err %d)", err);
226                 goto stop_handle;
227         }
228         if (inode->i_blocks)
229                 ext4_truncate(inode);
230
231         /*
232          * ext4_ext_truncate() doesn't reserve any slop when it
233          * restarts journal transactions; therefore there may not be
234          * enough credits left in the handle to remove the inode from
235          * the orphan list and set the dtime field.
236          */
237         if (handle->h_buffer_credits < 3) {
238                 err = ext4_journal_extend(handle, 3);
239                 if (err > 0)
240                         err = ext4_journal_restart(handle, 3);
241                 if (err != 0) {
242                         ext4_warning(inode->i_sb, __func__,
243                                      "couldn't extend journal (err %d)", err);
244                 stop_handle:
245                         ext4_journal_stop(handle);
246                         goto no_delete;
247                 }
248         }
249
250         /*
251          * Kill off the orphan record which ext4_truncate created.
252          * AKPM: I think this can be inside the above `if'.
253          * Note that ext4_orphan_del() has to be able to cope with the
254          * deletion of a non-existent orphan - this is because we don't
255          * know if ext4_truncate() actually created an orphan record.
256          * (Well, we could do this if we need to, but heck - it works)
257          */
258         ext4_orphan_del(handle, inode);
259         EXT4_I(inode)->i_dtime  = get_seconds();
260
261         /*
262          * One subtle ordering requirement: if anything has gone wrong
263          * (transaction abort, IO errors, whatever), then we can still
264          * do these next steps (the fs will already have been marked as
265          * having errors), but we can't free the inode if the mark_dirty
266          * fails.
267          */
268         if (ext4_mark_inode_dirty(handle, inode))
269                 /* If that failed, just do the required in-core inode clear. */
270                 clear_inode(inode);
271         else
272                 ext4_free_inode(handle, inode);
273         ext4_journal_stop(handle);
274         return;
275 no_delete:
276         clear_inode(inode);     /* We must guarantee clearing of inode... */
277 }
278
/*
 * One link of a block-pointer chain, filled in by add_chain(): the location
 * a block number was read from, the value found there, and the buffer_head
 * (if any) that hosts that location.
 */
279 typedef struct {
280         __le32  *p;	/* address the block number was taken from */
281         __le32  key;	/* value read from *p (little-endian, as stored) */
282         struct buffer_head *bh;	/* buffer handed to add_chain(); NULL when p points into the inode */
283 } Indirect;
284
285 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
286 {
287         p->key = *(p->p = v);
288         p->bh = bh;
289 }
290
291 /**
292  *      ext4_block_to_path - parse the block number into array of offsets
293  *      @inode: inode in question (we are only interested in its superblock)
294  *      @i_block: block number to be parsed
295  *      @offsets: array to store the offsets in
296  *      @boundary: set this non-zero if the referred-to block is likely to be
297  *             followed (on disk) by an indirect block.
298  *
299  *      To store the locations of file's data ext4 uses a data structure common
300  *      for UNIX filesystems - tree of pointers anchored in the inode, with
301  *      data blocks at leaves and indirect blocks in intermediate nodes.
302  *      This function translates the block number into path in that tree -
303  *      return value is the path length and @offsets[n] is the offset of
304  *      pointer to (n+1)th node in the nth one. If @block is out of range
305  *      (negative or too large) warning is printed and zero returned.
306  *
307  *      Note: function doesn't find node addresses, so no IO is needed. All
308  *      we need to know is the capacity of indirect blocks (taken from the
309  *      inode->i_sb).
310  */
311
312 /*
313  * Portability note: the last comparison (check that we fit into triple
314  * indirect block) is spelled differently, because otherwise on an
315  * architecture with 32-bit longs and 8Kb pages we might get into trouble
316  * if our filesystem had 8Kb blocks. We might use long long, but that would
317  * kill us on x86. Oh, well, at least the sign propagation does not matter -
318  * i_block would have to be negative in the very beginning, so we would not
319  * get there at all.
320  */
321
322 static int ext4_block_to_path(struct inode *inode,
323                         ext4_lblk_t i_block,
324                         ext4_lblk_t offsets[4], int *boundary)
325 {
326         int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
327         int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
328         const long direct_blocks = EXT4_NDIR_BLOCKS,
329                 indirect_blocks = ptrs,
330                 double_blocks = (1 << (ptrs_bits * 2));
331         int n = 0;
332         int final = 0;
333
334         if (i_block < 0) {
335                 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
336         } else if (i_block < direct_blocks) {
337                 offsets[n++] = i_block;
338                 final = direct_blocks;
339         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
340                 offsets[n++] = EXT4_IND_BLOCK;
341                 offsets[n++] = i_block;
342                 final = ptrs;
343         } else if ((i_block -= indirect_blocks) < double_blocks) {
344                 offsets[n++] = EXT4_DIND_BLOCK;
345                 offsets[n++] = i_block >> ptrs_bits;
346                 offsets[n++] = i_block & (ptrs - 1);
347                 final = ptrs;
348         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
349                 offsets[n++] = EXT4_TIND_BLOCK;
350                 offsets[n++] = i_block >> (ptrs_bits * 2);
351                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
352                 offsets[n++] = i_block & (ptrs - 1);
353                 final = ptrs;
354         } else {
355                 ext4_warning(inode->i_sb, "ext4_block_to_path",
356                                 "block %lu > max in inode %lu",
357                                 i_block + direct_blocks +
358                                 indirect_blocks + double_blocks, inode->i_ino);
359         }
360         if (boundary)
361                 *boundary = final - 1 - (i_block & (ptrs - 1));
362         return n;
363 }
364
365 /**
366  *      ext4_get_branch - read the chain of indirect blocks leading to data
367  *      @inode: inode in question
368  *      @depth: depth of the chain (1 - direct pointer, etc.)
369  *      @offsets: offsets of pointers in inode/indirect blocks
370  *      @chain: place to store the result
371  *      @err: here we store the error value
372  *
373  *      Function fills the array of triples <key, p, bh> and returns %NULL
374  *      if everything went OK or the pointer to the last filled triple
375  *      (incomplete one) otherwise. Upon the return chain[i].key contains
376  *      the number of (i+1)-th block in the chain (as it is stored in memory,
377  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
378  *      number (it points into struct inode for i==0 and into the bh->b_data
379  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
380  *      block for i>0 and NULL for i==0. In other words, it holds the block
381  *      numbers of the chain, addresses they were taken from (and where we can
382  *      verify that chain did not change) and buffer_heads hosting these
383  *      numbers.
384  *
385  *      Function stops when it stumbles upon zero pointer (absent block)
386  *              (pointer to last triple returned, *@err == 0)
387  *      or when it gets an IO error reading an indirect block
388  *              (ditto, *@err == -EIO)
389  *      or when it reads all @depth-1 indirect blocks successfully and finds
390  *      the whole chain, all way to the data (returns %NULL, *err == 0).
391  *
392  *      Need to be called with
393  *      down_read(&EXT4_I(inode)->i_data_sem)
394  */
395 static Indirect *ext4_get_branch(struct inode *inode, int depth,
396                                  ext4_lblk_t  *offsets,
397                                  Indirect chain[4], int *err)
398 {
399         struct super_block *sb = inode->i_sb;
400         Indirect *p = chain;
401         struct buffer_head *bh;
402
403         *err = 0;
404         /* i_data is not going away, no lock needed */
405         add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
406         if (!p->key)
407                 goto no_block;
408         while (--depth) {
409                 bh = sb_bread(sb, le32_to_cpu(p->key));
410                 if (!bh)
411                         goto failure;
412                 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
413                 /* Reader: end */
414                 if (!p->key)
415                         goto no_block;
416         }
417         return NULL;
418
419 failure:
420         *err = -EIO;
421 no_block:
422         return p;
423 }
424
425 /**
426  *      ext4_find_near - find a place for allocation with sufficient locality
427  *      @inode: owner
428  *      @ind: descriptor of indirect block.
429  *
430  *      This function returns the preferred place for block allocation.
431  *      It is used when heuristic for sequential allocation fails.
432  *      Rules are:
433  *        + if there is a block to the left of our position - allocate near it.
434  *        + if pointer will live in indirect block - allocate near that block.
435  *        + if pointer will live in inode - allocate in the same
436  *          cylinder group.
437  *
438  * In the latter case we colour the starting block by the callers PID to
439  * prevent it from clashing with concurrent allocations for a different inode
440  * in the same block group.   The PID is used here so that functionally related
441  * files will be close-by on-disk.
442  *
443  *      Caller must make sure that @ind is valid and will stay that way.
444  */
445 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
446 {
447         struct ext4_inode_info *ei = EXT4_I(inode);
448         __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
449         __le32 *p;
450         ext4_fsblk_t bg_start;
451         ext4_fsblk_t last_block;
452         ext4_grpblk_t colour;
453
454         /* Try to find previous block */
455         for (p = ind->p - 1; p >= start; p--) {
456                 if (*p)
457                         return le32_to_cpu(*p);
458         }
459
460         /* No such thing, so let's try location of indirect block */
461         if (ind->bh)
462                 return ind->bh->b_blocknr;
463
464         /*
465          * It is going to be referred to from the inode itself? OK, just put it
466          * into the same cylinder group then.
467          */
468         bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
469         last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
470
471         if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
472                 colour = (current->pid % 16) *
473                         (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
474         else
475                 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
476         return bg_start + colour;
477 }
478
479 /**
480  *      ext4_find_goal - find a preferred place for allocation.
481  *      @inode: owner
482  *      @block:  block we want
483  *      @partial: pointer to the last triple within a chain
484  *
485  *      Normally this function find the preferred place for block allocation,
486  *      returns it.
487  */
488 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
489                 Indirect *partial)
490 {
491         struct ext4_block_alloc_info *block_i;
492
493         block_i =  EXT4_I(inode)->i_block_alloc_info;
494
495         /*
496          * try the heuristic for sequential allocation,
497          * failing that at least try to get decent locality.
498          */
499         if (block_i && (block == block_i->last_alloc_logical_block + 1)
500                 && (block_i->last_alloc_physical_block != 0)) {
501                 return block_i->last_alloc_physical_block + 1;
502         }
503
504         return ext4_find_near(inode, partial);
505 }
506
507 /**
508  *      ext4_blks_to_allocate: Look up the block map and count the number
509  *      of direct blocks need to be allocated for the given branch.
510  *
511  *      @branch: chain of indirect blocks
512  *      @k: number of blocks need for indirect blocks
513  *      @blks: number of data blocks to be mapped.
514  *      @blocks_to_boundary:  the offset in the indirect block
515  *
516  *      return the total number of blocks to be allocate, including the
517  *      direct and indirect blocks.
518  */
519 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
520                 int blocks_to_boundary)
521 {
522         unsigned long count = 0;
523
524         /*
525          * Simple case, [t,d]Indirect block(s) has not allocated yet
526          * then it's clear blocks on that path have not allocated
527          */
528         if (k > 0) {
529                 /* right now we don't handle cross boundary allocation */
530                 if (blks < blocks_to_boundary + 1)
531                         count += blks;
532                 else
533                         count += blocks_to_boundary + 1;
534                 return count;
535         }
536
537         count++;
538         while (count < blks && count <= blocks_to_boundary &&
539                 le32_to_cpu(*(branch[0].p + count)) == 0) {
540                 count++;
541         }
542         return count;
543 }
544
545 /**
546  *      ext4_alloc_blocks: multiple allocate blocks needed for a branch
547  *      @indirect_blks: the number of blocks need to allocate for indirect
548  *                      blocks
549  *
550  *      @new_blocks: on return it will store the new block numbers for
551  *      the indirect blocks(if needed) and the first direct block,
552  *      @blks:  on return it will store the total number of allocated
553  *              direct blocks
554  */
555 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
556                                 ext4_lblk_t iblock, ext4_fsblk_t goal,
557                                 int indirect_blks, int blks,
558                                 ext4_fsblk_t new_blocks[4], int *err)
559 {
560         int target, i;
561         unsigned long count = 0, blk_allocated = 0;
562         int index = 0;
563         ext4_fsblk_t current_block = 0;
564         int ret = 0;
565
566         /*
567          * Here we try to allocate the requested multiple blocks at once,
568          * on a best-effort basis.
569          * To build a branch, we should allocate blocks for
570          * the indirect blocks(if not allocated yet), and at least
571          * the first direct block of this branch.  That's the
572          * minimum number of blocks need to allocate(required)
573          */
574         /* first we try to allocate the indirect blocks */
575         target = indirect_blks;
576         while (target > 0) {
577                 count = target;
578                 /* allocating blocks for indirect blocks and direct blocks */
579                 current_block = ext4_new_meta_blocks(handle, inode,
580                                                         goal, &count, err);
581                 if (*err)
582                         goto failed_out;
583
584                 target -= count;
585                 /* allocate blocks for indirect blocks */
586                 while (index < indirect_blks && count) {
587                         new_blocks[index++] = current_block++;
588                         count--;
589                 }
590                 if (count > 0) {
591                         /*
592                          * save the new block number
593                          * for the first direct block
594                          */
595                         new_blocks[index] = current_block;
596                         printk(KERN_INFO "%s returned more blocks than "
597                                                 "requested\n", __func__);
598                         WARN_ON(1);
599                         break;
600                 }
601         }
602
603         target = blks - count ;
604         blk_allocated = count;
605         if (!target)
606                 goto allocated;
607         /* Now allocate data blocks */
608         count = target;
609         /* allocating blocks for data blocks */
610         current_block = ext4_new_blocks(handle, inode, iblock,
611                                                 goal, &count, err);
612         if (*err && (target == blks)) {
613                 /*
614                  * if the allocation failed and we didn't allocate
615                  * any blocks before
616                  */
617                 goto failed_out;
618         }
619         if (!*err) {
620                 if (target == blks) {
621                 /*
622                  * save the new block number
623                  * for the first direct block
624                  */
625                         new_blocks[index] = current_block;
626                 }
627                 blk_allocated += count;
628         }
629 allocated:
630         /* total number of blocks allocated for direct blocks */
631         ret = blk_allocated;
632         *err = 0;
633         return ret;
634 failed_out:
635         for (i = 0; i <index; i++)
636                 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
637         return ret;
638 }
639
640 /**
641  *      ext4_alloc_branch - allocate and set up a chain of blocks.
642  *      @inode: owner
643  *      @indirect_blks: number of allocated indirect blocks
644  *      @blks: number of allocated direct blocks
645  *      @offsets: offsets (in the blocks) to store the pointers to next.
646  *      @branch: place to store the chain in.
647  *
648  *      This function allocates blocks, zeroes out all but the last one,
649  *      links them into chain and (if we are synchronous) writes them to disk.
650  *      In other words, it prepares a branch that can be spliced onto the
651  *      inode. It stores the information about that chain in the branch[], in
652  *      the same format as ext4_get_branch() would do. We are calling it after
653  *      we had read the existing part of chain and partial points to the last
654  *      triple of that (one with zero ->key). Upon the exit we have the same
655  *      picture as after the successful ext4_get_block(), except that in one
656  *      place chain is disconnected - *branch->p is still zero (we did not
657  *      set the last link), but branch->key contains the number that should
658  *      be placed into *branch->p to fill that gap.
659  *
660  *      If allocation fails we free all blocks we've allocated (and forget
661  *      their buffer_heads) and return the error value the from failed
662  *      ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
663  *      as described above and return 0.
664  */
665 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
666                                 ext4_lblk_t iblock, int indirect_blks,
667                                 int *blks, ext4_fsblk_t goal,
668                                 ext4_lblk_t *offsets, Indirect *branch)
669 {
670         int blocksize = inode->i_sb->s_blocksize;
671         int i, n = 0;
672         int err = 0;
673         struct buffer_head *bh;
674         int num;
675         ext4_fsblk_t new_blocks[4];
676         ext4_fsblk_t current_block;
677
678         num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
679                                 *blks, new_blocks, &err);
680         if (err)
681                 return err;
682
683         branch[0].key = cpu_to_le32(new_blocks[0]);
684         /*
685          * metadata blocks and data blocks are allocated.
686          */
687         for (n = 1; n <= indirect_blks;  n++) {
688                 /*
689                  * Get buffer_head for parent block, zero it out
690                  * and set the pointer to new one, then send
691                  * parent to disk.
692                  */
693                 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
694                 branch[n].bh = bh;
695                 lock_buffer(bh);
696                 BUFFER_TRACE(bh, "call get_create_access");
697                 err = ext4_journal_get_create_access(handle, bh);
698                 if (err) {
699                         unlock_buffer(bh);
700                         brelse(bh);
701                         goto failed;
702                 }
703
704                 memset(bh->b_data, 0, blocksize);
705                 branch[n].p = (__le32 *) bh->b_data + offsets[n];
706                 branch[n].key = cpu_to_le32(new_blocks[n]);
707                 *branch[n].p = branch[n].key;
708                 if ( n == indirect_blks) {
709                         current_block = new_blocks[n];
710                         /*
711                          * End of chain, update the last new metablock of
712                          * the chain to point to the new allocated
713                          * data blocks numbers
714                          */
715                         for (i=1; i < num; i++)
716                                 *(branch[n].p + i) = cpu_to_le32(++current_block);
717                 }
718                 BUFFER_TRACE(bh, "marking uptodate");
719                 set_buffer_uptodate(bh);
720                 unlock_buffer(bh);
721
722                 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
723                 err = ext4_journal_dirty_metadata(handle, bh);
724                 if (err)
725                         goto failed;
726         }
727         *blks = num;
728         return err;
729 failed:
730         /* Allocation failed, free what we already allocated */
731         for (i = 1; i <= n ; i++) {
732                 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
733                 ext4_journal_forget(handle, branch[i].bh);
734         }
735         for (i = 0; i <indirect_blks; i++)
736                 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
737
738         ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
739
740         return err;
741 }
742
743 /**
744  * ext4_splice_branch - splice the allocated branch onto inode.
745  * @inode: owner
746  * @block: (logical) number of block we are adding
747  * @chain: chain of indirect blocks (with a missing link - see
748  *      ext4_alloc_branch)
749  * @where: location of missing link
750  * @num:   number of indirect blocks we are adding
751  * @blks:  number of direct blocks we are adding
752  *
753  * This function fills the missing link and does all housekeeping needed in
754  * inode (->i_blocks, etc.). In case of success we end up with the full
755  * chain to new block and return 0.
756  */
757 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
758                         ext4_lblk_t block, Indirect *where, int num, int blks)
759 {
760         int i;
761         int err = 0;
762         struct ext4_block_alloc_info *block_i;
763         ext4_fsblk_t current_block;
764
765         block_i = EXT4_I(inode)->i_block_alloc_info;
766         /*
767          * If we're splicing into a [td]indirect block (as opposed to the
768          * inode) then we need to get write access to the [td]indirect block
769          * before the splice.
770          */
771         if (where->bh) {
772                 BUFFER_TRACE(where->bh, "get_write_access");
773                 err = ext4_journal_get_write_access(handle, where->bh);
774                 if (err)
775                         goto err_out;
776         }
777         /* That's it */
778
779         *where->p = where->key;
780
781         /*
782          * Update the host buffer_head or inode to point to more just allocated
783          * direct blocks blocks
784          */
785         if (num == 0 && blks > 1) {
786                 current_block = le32_to_cpu(where->key) + 1;
787                 for (i = 1; i < blks; i++)
788                         *(where->p + i ) = cpu_to_le32(current_block++);
789         }
790
791         /*
792          * update the most recently allocated logical & physical block
793          * in i_block_alloc_info, to assist find the proper goal block for next
794          * allocation
795          */
796         if (block_i) {
797                 block_i->last_alloc_logical_block = block + blks - 1;
798                 block_i->last_alloc_physical_block =
799                                 le32_to_cpu(where[num].key) + blks - 1;
800         }
801
802         /* We are done with atomic stuff, now do the rest of housekeeping */
803
804         inode->i_ctime = ext4_current_time(inode);
805         ext4_mark_inode_dirty(handle, inode);
806
807         /* had we spliced it onto indirect block? */
808         if (where->bh) {
809                 /*
810                  * If we spliced it onto an indirect block, we haven't
811                  * altered the inode.  Note however that if it is being spliced
812                  * onto an indirect block at the very end of the file (the
813                  * file is growing) then we *will* alter the inode to reflect
814                  * the new i_size.  But that is not done here - it is done in
815                  * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
816                  */
817                 jbd_debug(5, "splicing indirect only\n");
818                 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
819                 err = ext4_journal_dirty_metadata(handle, where->bh);
820                 if (err)
821                         goto err_out;
822         } else {
823                 /*
824                  * OK, we spliced it into the inode itself on a direct block.
825                  * Inode was dirtied above.
826                  */
827                 jbd_debug(5, "splicing direct\n");
828         }
829         return err;
830
831 err_out:
832         for (i = 1; i <= num; i++) {
833                 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
834                 ext4_journal_forget(handle, where[i].bh);
835                 ext4_free_blocks(handle, inode,
836                                         le32_to_cpu(where[i-1].key), 1, 0);
837         }
838         ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
839
840         return err;
841 }
842
843 /*
844  * Allocation strategy is simple: if we have to allocate something, we will
845  * have to go the whole way to leaf. So let's do it before attaching anything
846  * to tree, set linkage between the newborn blocks, write them if sync is
847  * required, recheck the path, free and repeat if check fails, otherwise
848  * set the last missing link (that will protect us from any truncate-generated
849  * removals - all blocks on the path are immune now) and possibly force the
850  * write on the parent block.
851  * That has a nice additional property: no special recovery from the failed
852  * allocations is needed - we simply release blocks and do not touch anything
853  * reachable from inode.
854  *
855  * `handle' can be NULL if create == 0.
856  *
857  * return > 0, # of blocks mapped or allocated.
858  * return = 0, if plain lookup failed.
859  * return < 0, error case.
860  *
861  *
862  * Need to be called with
863  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
864  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
865  */
866 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
867                 ext4_lblk_t iblock, unsigned long maxblocks,
868                 struct buffer_head *bh_result,
869                 int create, int extend_disksize)
870 {
871         int err = -EIO;
872         ext4_lblk_t offsets[4];
873         Indirect chain[4];
874         Indirect *partial;
875         ext4_fsblk_t goal;
876         int indirect_blks;
877         int blocks_to_boundary = 0;
878         int depth;
879         struct ext4_inode_info *ei = EXT4_I(inode);
880         int count = 0;
881         ext4_fsblk_t first_block = 0;
882         loff_t disksize;
883
884
885         J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
886         J_ASSERT(handle != NULL || create == 0);
887         depth = ext4_block_to_path(inode, iblock, offsets,
888                                         &blocks_to_boundary);
889
890         if (depth == 0)
891                 goto out;
892
893         partial = ext4_get_branch(inode, depth, offsets, chain, &err);
894
895         /* Simplest case - block found, no allocation needed */
896         if (!partial) {
897                 first_block = le32_to_cpu(chain[depth - 1].key);
898                 clear_buffer_new(bh_result);
899                 count++;
900                 /*map more blocks*/
901                 while (count < maxblocks && count <= blocks_to_boundary) {
902                         ext4_fsblk_t blk;
903
904                         blk = le32_to_cpu(*(chain[depth-1].p + count));
905
906                         if (blk == first_block + count)
907                                 count++;
908                         else
909                                 break;
910                 }
911                 goto got_it;
912         }
913
914         /* Next simple case - plain lookup or failed read of indirect block */
915         if (!create || err == -EIO)
916                 goto cleanup;
917
918         /*
919          * Okay, we need to do block allocation.  Lazily initialize the block
920          * allocation info here if necessary
921         */
922         if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
923                 ext4_init_block_alloc_info(inode);
924
925         goal = ext4_find_goal(inode, iblock, partial);
926
927         /* the number of blocks need to allocate for [d,t]indirect blocks */
928         indirect_blks = (chain + depth) - partial - 1;
929
930         /*
931          * Next look up the indirect map to count the totoal number of
932          * direct blocks to allocate for this branch.
933          */
934         count = ext4_blks_to_allocate(partial, indirect_blks,
935                                         maxblocks, blocks_to_boundary);
936         /*
937          * Block out ext4_truncate while we alter the tree
938          */
939         err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
940                                         &count, goal,
941                                         offsets + (partial - chain), partial);
942
943         /*
944          * The ext4_splice_branch call will free and forget any buffers
945          * on the new chain if there is a failure, but that risks using
946          * up transaction credits, especially for bitmaps where the
947          * credits cannot be returned.  Can we handle this somehow?  We
948          * may need to return -EAGAIN upwards in the worst case.  --sct
949          */
950         if (!err)
951                 err = ext4_splice_branch(handle, inode, iblock,
952                                         partial, indirect_blks, count);
953         /*
954          * i_disksize growing is protected by i_data_sem.  Don't forget to
955          * protect it if you're about to implement concurrent
956          * ext4_get_block() -bzzz
957         */
958         if (!err && extend_disksize) {
959                 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
960                 if (disksize > i_size_read(inode))
961                         disksize = i_size_read(inode);
962                 if (disksize > ei->i_disksize)
963                         ei->i_disksize = disksize;
964         }
965         if (err)
966                 goto cleanup;
967
968         set_buffer_new(bh_result);
969 got_it:
970         map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
971         if (count > blocks_to_boundary)
972                 set_buffer_boundary(bh_result);
973         err = count;
974         /* Clean up and exit */
975         partial = chain + depth - 1;    /* the whole chain */
976 cleanup:
977         while (partial > chain) {
978                 BUFFER_TRACE(partial->bh, "call brelse");
979                 brelse(partial->bh);
980                 partial--;
981         }
982         BUFFER_TRACE(bh_result, "returned");
983 out:
984         return err;
985 }
986
987 /*
988  * Calculate the number of metadata blocks need to reserve
989  * to allocate @blocks for non extent file based file
990  */
991 static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
992 {
993         int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
994         int ind_blks, dind_blks, tind_blks;
995
996         /* number of new indirect blocks needed */
997         ind_blks = (blocks + icap - 1) / icap;
998
999         dind_blks = (ind_blks + icap - 1) / icap;
1000
1001         tind_blks = 1;
1002
1003         return ind_blks + dind_blks + tind_blks;
1004 }
1005
1006 /*
1007  * Calculate the number of metadata blocks need to reserve
1008  * to allocate given number of blocks
1009  */
1010 static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1011 {
1012         if (!blocks)
1013                 return 0;
1014
1015         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1016                 return ext4_ext_calc_metadata_amount(inode, blocks);
1017
1018         return ext4_indirect_calc_metadata_amount(inode, blocks);
1019 }
1020
1021 static void ext4_da_update_reserve_space(struct inode *inode, int used)
1022 {
1023         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1024         int total, mdb, mdb_free;
1025
1026         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1027         /* recalculate the number of metablocks still need to be reserved */
1028         total = EXT4_I(inode)->i_reserved_data_blocks - used;
1029         mdb = ext4_calc_metadata_amount(inode, total);
1030
1031         /* figure out how many metablocks to release */
1032         BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1033         mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1034
1035         /* Account for allocated meta_blocks */
1036         mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1037
1038         /* update fs free blocks counter for truncate case */
1039         percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
1040
1041         /* update per-inode reservations */
1042         BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
1043         EXT4_I(inode)->i_reserved_data_blocks -= used;
1044
1045         BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1046         EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1047         EXT4_I(inode)->i_allocated_meta_blocks = 0;
1048         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1049
1050         /*
1051          * If we have done all the pending block allocations and if
1052          * there aren't any writers on the inode, we can discard the
1053          * inode's preallocations.
1054          */
1055         if (!total && (atomic_read(&inode->i_writecount) == 0))
1056                 ext4_discard_reservation(inode);
1057 }
1058
1059 /*
1060  * The ext4_get_blocks_wrap() function try to look up the requested blocks,
1061  * and returns if the blocks are already mapped.
1062  *
1063  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
1064  * and store the allocated blocks in the result buffer head and mark it
1065  * mapped.
1066  *
1067  * If file type is extents based, it will call ext4_ext_get_blocks(),
1068  * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
1069  * based files
1070  *
1071  * On success, it returns the number of blocks being mapped or allocate.
1072  * if create==0 and the blocks are pre-allocated and uninitialized block,
1073  * the result buffer head is unmapped. If the create ==1, it will make sure
1074  * the buffer head is mapped.
1075  *
1076  * It returns 0 if plain look up failed (blocks have not been allocated), in
1077  * that casem, buffer head is unmapped
1078  *
1079  * It returns the error in case of allocation failure.
1080  */
1081 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1082                         unsigned long max_blocks, struct buffer_head *bh,
1083                         int create, int extend_disksize, int flag)
1084 {
1085         int retval;
1086
1087         clear_buffer_mapped(bh);
1088         clear_buffer_unwritten(bh);
1089
1090         /*
1091          * Try to see if we can get  the block without requesting
1092          * for new file system block.
1093          */
1094         down_read((&EXT4_I(inode)->i_data_sem));
1095         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
1096                 retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
1097                                 bh, 0, 0);
1098         } else {
1099                 retval = ext4_get_blocks_handle(handle,
1100                                 inode, block, max_blocks, bh, 0, 0);
1101         }
1102         up_read((&EXT4_I(inode)->i_data_sem));
1103
1104         /* If it is only a block(s) look up */
1105         if (!create)
1106                 return retval;
1107
1108         /*
1109          * Returns if the blocks have already allocated
1110          *
1111          * Note that if blocks have been preallocated
1112          * ext4_ext_get_block() returns th create = 0
1113          * with buffer head unmapped.
1114          */
1115         if (retval > 0 && buffer_mapped(bh))
1116                 return retval;
1117
1118         /*
1119          * When we call get_blocks without the create flag, the
1120          * BH_Unwritten flag could have gotten set if the blocks
1121          * requested were part of a uninitialized extent.  We need to
1122          * clear this flag now that we are committed to convert all or
1123          * part of the uninitialized extent to be an initialized
1124          * extent.  This is because we need to avoid the combination
1125          * of BH_Unwritten and BH_Mapped flags being simultaneously
1126          * set on the buffer_head.
1127          */
1128         clear_buffer_unwritten(bh);
1129
1130         /*
1131          * New blocks allocate and/or writing to uninitialized extent
1132          * will possibly result in updating i_data, so we take
1133          * the write lock of i_data_sem, and call get_blocks()
1134          * with create == 1 flag.
1135          */
1136         down_write((&EXT4_I(inode)->i_data_sem));
1137
1138         /*
1139          * if the caller is from delayed allocation writeout path
1140          * we have already reserved fs blocks for allocation
1141          * let the underlying get_block() function know to
1142          * avoid double accounting
1143          */
1144         if (flag)
1145                 EXT4_I(inode)->i_delalloc_reserved_flag = 1;
1146         /*
1147          * We need to check for EXT4 here because migrate
1148          * could have changed the inode type in between
1149          */
1150         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
1151                 retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
1152                                 bh, create, extend_disksize);
1153         } else {
1154                 retval = ext4_get_blocks_handle(handle, inode, block,
1155                                 max_blocks, bh, create, extend_disksize);
1156
1157                 if (retval > 0 && buffer_new(bh)) {
1158                         /*
1159                          * We allocated new blocks which will result in
1160                          * i_data's format changing.  Force the migrate
1161                          * to fail by clearing migrate flags
1162                          */
1163                         EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
1164                                                         ~EXT4_EXT_MIGRATE;
1165                 }
1166         }
1167
1168         if (flag) {
1169                 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1170                 /*
1171                  * Update reserved blocks/metadata blocks
1172                  * after successful block allocation
1173                  * which were deferred till now
1174                  */
1175                 if ((retval > 0) && buffer_delay(bh))
1176                         ext4_da_update_reserve_space(inode, retval);
1177         }
1178
1179         up_write((&EXT4_I(inode)->i_data_sem));
1180         return retval;
1181 }
1182
1183 /* Maximum number of blocks we map for direct IO at once. */
1184 #define DIO_MAX_BLOCKS 4096
1185
1186 static int ext4_get_block(struct inode *inode, sector_t iblock,
1187                         struct buffer_head *bh_result, int create)
1188 {
1189         handle_t *handle = ext4_journal_current_handle();
1190         int ret = 0, started = 0;
1191         unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1192         int dio_credits;
1193
1194         if (create && !handle) {
1195                 /* Direct IO write... */
1196                 if (max_blocks > DIO_MAX_BLOCKS)
1197                         max_blocks = DIO_MAX_BLOCKS;
1198                 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
1199                 handle = ext4_journal_start(inode, dio_credits);
1200                 if (IS_ERR(handle)) {
1201                         ret = PTR_ERR(handle);
1202                         goto out;
1203                 }
1204                 started = 1;
1205         }
1206
1207         ret = ext4_get_blocks_wrap(handle, inode, iblock,
1208                                         max_blocks, bh_result, create, 0, 0);
1209         if (ret > 0) {
1210                 bh_result->b_size = (ret << inode->i_blkbits);
1211                 ret = 0;
1212         }
1213         if (started)
1214                 ext4_journal_stop(handle);
1215 out:
1216         return ret;
1217 }
1218
1219 /*
1220  * `handle' can be NULL if create is zero
1221  */
1222 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1223                                 ext4_lblk_t block, int create, int *errp)
1224 {
1225         struct buffer_head dummy;
1226         int fatal = 0, err;
1227
1228         J_ASSERT(handle != NULL || create == 0);
1229
1230         dummy.b_state = 0;
1231         dummy.b_blocknr = -1000;
1232         buffer_trace_init(&dummy.b_history);
1233         err = ext4_get_blocks_wrap(handle, inode, block, 1,
1234                                         &dummy, create, 1, 0);
1235         /*
1236          * ext4_get_blocks_handle() returns number of blocks
1237          * mapped. 0 in case of a HOLE.
1238          */
1239         if (err > 0) {
1240                 if (err > 1)
1241                         WARN_ON(1);
1242                 err = 0;
1243         }
1244         *errp = err;
1245         if (!err && buffer_mapped(&dummy)) {
1246                 struct buffer_head *bh;
1247                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1248                 if (!bh) {
1249                         *errp = -EIO;
1250                         goto err;
1251                 }
1252                 if (buffer_new(&dummy)) {
1253                         J_ASSERT(create != 0);
1254                         J_ASSERT(handle != NULL);
1255
1256                         /*
1257                          * Now that we do not always journal data, we should
1258                          * keep in mind whether this should always journal the
1259                          * new buffer as metadata.  For now, regular file
1260                          * writes use ext4_get_block instead, so it's not a
1261                          * problem.
1262                          */
1263                         lock_buffer(bh);
1264                         BUFFER_TRACE(bh, "call get_create_access");
1265                         fatal = ext4_journal_get_create_access(handle, bh);
1266                         if (!fatal && !buffer_uptodate(bh)) {
1267                                 memset(bh->b_data,0,inode->i_sb->s_blocksize);
1268                                 set_buffer_uptodate(bh);
1269                         }
1270                         unlock_buffer(bh);
1271                         BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1272                         err = ext4_journal_dirty_metadata(handle, bh);
1273                         if (!fatal)
1274                                 fatal = err;
1275                 } else {
1276                         BUFFER_TRACE(bh, "not a new buffer");
1277                 }
1278                 if (fatal) {
1279                         *errp = fatal;
1280                         brelse(bh);
1281                         bh = NULL;
1282                 }
1283                 return bh;
1284         }
1285 err:
1286         return NULL;
1287 }
1288
1289 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1290                                ext4_lblk_t block, int create, int *err)
1291 {
1292         struct buffer_head * bh;
1293
1294         bh = ext4_getblk(handle, inode, block, create, err);
1295         if (!bh)
1296                 return bh;
1297         if (buffer_uptodate(bh))
1298                 return bh;
1299         ll_rw_block(READ_META, 1, &bh);
1300         wait_on_buffer(bh);
1301         if (buffer_uptodate(bh))
1302                 return bh;
1303         put_bh(bh);
1304         *err = -EIO;
1305         return NULL;
1306 }
1307
1308 static int walk_page_buffers(   handle_t *handle,
1309                                 struct buffer_head *head,
1310                                 unsigned from,
1311                                 unsigned to,
1312                                 int *partial,
1313                                 int (*fn)(      handle_t *handle,
1314                                                 struct buffer_head *bh))
1315 {
1316         struct buffer_head *bh;
1317         unsigned block_start, block_end;
1318         unsigned blocksize = head->b_size;
1319         int err, ret = 0;
1320         struct buffer_head *next;
1321
1322         for (   bh = head, block_start = 0;
1323                 ret == 0 && (bh != head || !block_start);
1324                 block_start = block_end, bh = next)
1325         {
1326                 next = bh->b_this_page;
1327                 block_end = block_start + blocksize;
1328                 if (block_end <= from || block_start >= to) {
1329                         if (partial && !buffer_uptodate(bh))
1330                                 *partial = 1;
1331                         continue;
1332                 }
1333                 err = (*fn)(handle, bh);
1334                 if (!ret)
1335                         ret = err;
1336         }
1337         return ret;
1338 }
1339
1340 /*
1341  * To preserve ordering, it is essential that the hole instantiation and
1342  * the data write be encapsulated in a single transaction.  We cannot
1343  * close off a transaction and start a new one between the ext4_get_block()
1344  * and the commit_write().  So doing the jbd2_journal_start at the start of
1345  * prepare_write() is the right place.
1346  *
1347  * Also, this function can nest inside ext4_writepage() ->
1348  * block_write_full_page(). In that case, we *know* that ext4_writepage()
1349  * has generated enough buffer credits to do the whole page.  So we won't
1350  * block on the journal in that case, which is good, because the caller may
1351  * be PF_MEMALLOC.
1352  *
1353  * By accident, ext4 can be reentered when a transaction is open via
1354  * quota file writes.  If we were to commit the transaction while thus
1355  * reentered, there can be a deadlock - we would be holding a quota
1356  * lock, and the commit would never complete if another thread had a
1357  * transaction open and was blocking on the quota lock - a ranking
1358  * violation.
1359  *
1360  * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1361  * will _not_ run commit under these circumstances because handle->h_ref
1362  * is elevated.  We'll still have enough credits for the tiny quotafile
1363  * write.
1364  */
1365 static int do_journal_get_write_access(handle_t *handle,
1366                                         struct buffer_head *bh)
1367 {
1368         if (!buffer_mapped(bh) || buffer_freed(bh))
1369                 return 0;
1370         return ext4_journal_get_write_access(handle, bh);
1371 }
1372
1373 static int ext4_write_begin(struct file *file, struct address_space *mapping,
1374                                 loff_t pos, unsigned len, unsigned flags,
1375                                 struct page **pagep, void **fsdata)
1376 {
1377         struct inode *inode = mapping->host;
1378         int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1379         handle_t *handle;
1380         int retries = 0;
1381         struct page *page;
1382         pgoff_t index;
1383         unsigned from, to;
1384
1385         index = pos >> PAGE_CACHE_SHIFT;
1386         from = pos & (PAGE_CACHE_SIZE - 1);
1387         to = from + len;
1388
1389 retry:
1390         handle = ext4_journal_start(inode, needed_blocks);
1391         if (IS_ERR(handle)) {
1392                 ret = PTR_ERR(handle);
1393                 goto out;
1394         }
1395
1396         /* We cannot recurse into the filesystem as the transaction is already
1397          * started */
1398         flags |= AOP_FLAG_NOFS;
1399
1400         page = grab_cache_page_write_begin(mapping, index, flags);
1401         if (!page) {
1402                 ext4_journal_stop(handle);
1403                 ret = -ENOMEM;
1404                 goto out;
1405         }
1406         *pagep = page;
1407
1408         ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1409                                 ext4_get_block);
1410
1411         if (!ret && ext4_should_journal_data(inode)) {
1412                 ret = walk_page_buffers(handle, page_buffers(page),
1413                                 from, to, NULL, do_journal_get_write_access);
1414         }
1415
1416         if (ret) {
1417                 unlock_page(page);
1418                 ext4_journal_stop(handle);
1419                 page_cache_release(page);
1420         }
1421
1422         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1423                 goto retry;
1424 out:
1425         return ret;
1426 }
1427
1428 /* For write_end() in data=journal mode */
1429 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1430 {
1431         if (!buffer_mapped(bh) || buffer_freed(bh))
1432                 return 0;
1433         set_buffer_uptodate(bh);
1434         return ext4_journal_dirty_metadata(handle, bh);
1435 }
1436
1437 /*
1438  * We need to pick up the new inode size which generic_commit_write gave us
1439  * `file' can be NULL - eg, when called from page_symlink().
1440  *
1441  * ext4 never places buffers on inode->i_mapping->private_list.  metadata
1442  * buffers are managed internally.
1443  */
1444 static int ext4_ordered_write_end(struct file *file,
1445                                 struct address_space *mapping,
1446                                 loff_t pos, unsigned len, unsigned copied,
1447                                 struct page *page, void *fsdata)
1448 {
1449         handle_t *handle = ext4_journal_current_handle();
1450         struct inode *inode = mapping->host;
1451         int ret = 0, ret2;
1452
1453         ret = ext4_jbd2_file_inode(handle, inode);
1454
1455         if (ret == 0) {
1456                 /*
1457                  * generic_write_end() will run mark_inode_dirty() if i_size
1458                  * changes.  So let's piggyback the i_disksize mark_inode_dirty
1459                  * into that.
1460                  */
1461                 loff_t new_i_size;
1462
1463                 new_i_size = pos + copied;
1464                 if (new_i_size > EXT4_I(inode)->i_disksize)
1465                         EXT4_I(inode)->i_disksize = new_i_size;
1466                 ret2 = generic_write_end(file, mapping, pos, len, copied,
1467                                                         page, fsdata);
1468                 copied = ret2;
1469                 if (ret2 < 0)
1470                         ret = ret2;
1471         }
1472         ret2 = ext4_journal_stop(handle);
1473         if (!ret)
1474                 ret = ret2;
1475
1476         return ret ? ret : copied;
1477 }
1478
1479 static int ext4_writeback_write_end(struct file *file,
1480                                 struct address_space *mapping,
1481                                 loff_t pos, unsigned len, unsigned copied,
1482                                 struct page *page, void *fsdata)
1483 {
1484         handle_t *handle = ext4_journal_current_handle();
1485         struct inode *inode = mapping->host;
1486         int ret = 0, ret2;
1487         loff_t new_i_size;
1488
1489         new_i_size = pos + copied;
1490         if (new_i_size > EXT4_I(inode)->i_disksize)
1491                 EXT4_I(inode)->i_disksize = new_i_size;
1492
1493         ret2 = generic_write_end(file, mapping, pos, len, copied,
1494                                                         page, fsdata);
1495         copied = ret2;
1496         if (ret2 < 0)
1497                 ret = ret2;
1498
1499         ret2 = ext4_journal_stop(handle);
1500         if (!ret)
1501                 ret = ret2;
1502
1503         return ret ? ret : copied;
1504 }
1505
1506 static int ext4_journalled_write_end(struct file *file,
1507                                 struct address_space *mapping,
1508                                 loff_t pos, unsigned len, unsigned copied,
1509                                 struct page *page, void *fsdata)
1510 {
1511         handle_t *handle = ext4_journal_current_handle();
1512         struct inode *inode = mapping->host;
1513         int ret = 0, ret2;
1514         int partial = 0;
1515         unsigned from, to;
1516
1517         from = pos & (PAGE_CACHE_SIZE - 1);
1518         to = from + len;
1519
1520         if (copied < len) {
1521                 if (!PageUptodate(page))
1522                         copied = 0;
1523                 page_zero_new_buffers(page, from+copied, to);
1524         }
1525
1526         ret = walk_page_buffers(handle, page_buffers(page), from,
1527                                 to, &partial, write_end_fn);
1528         if (!partial)
1529                 SetPageUptodate(page);
1530         if (pos+copied > inode->i_size)
1531                 i_size_write(inode, pos+copied);
1532         EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1533         if (inode->i_size > EXT4_I(inode)->i_disksize) {
1534                 EXT4_I(inode)->i_disksize = inode->i_size;
1535                 ret2 = ext4_mark_inode_dirty(handle, inode);
1536                 if (!ret)
1537                         ret = ret2;
1538         }
1539
1540         unlock_page(page);
1541         ret2 = ext4_journal_stop(handle);
1542         if (!ret)
1543                 ret = ret2;
1544         page_cache_release(page);
1545
1546         return ret ? ret : copied;
1547 }
1548
1549 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1550 {
1551        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1552        unsigned long md_needed, mdblocks, total = 0;
1553
1554         /*
1555          * recalculate the amount of metadata blocks to reserve
1556          * in order to allocate nrblocks
1557          * worse case is one extent per block
1558          */
1559         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1560         total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1561         mdblocks = ext4_calc_metadata_amount(inode, total);
1562         BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
1563
1564         md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1565         total = md_needed + nrblocks;
1566
1567         if (ext4_claim_free_blocks(sbi, total)) {
1568                 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1569                 return -ENOSPC;
1570         }
1571         EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1572         EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1573
1574         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1575         return 0;       /* success */
1576 }
1577
1578 static void ext4_da_release_space(struct inode *inode, int to_free)
1579 {
1580         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1581         int total, mdb, mdb_free, release;
1582
1583         if (!to_free)
1584                 return;         /* Nothing to release, exit */
1585
1586         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1587
1588         if (!EXT4_I(inode)->i_reserved_data_blocks) {
1589                 /*
1590                  * if there is no reserved blocks, but we try to free some
1591                  * then the counter is messed up somewhere.
1592                  * but since this function is called from invalidate
1593                  * page, it's harmless to return without any action
1594                  */
1595                 printk(KERN_INFO "ext4 delalloc try to release %d reserved "
1596                             "blocks for inode %lu, but there is no reserved "
1597                             "data blocks\n", to_free, inode->i_ino);
1598                 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1599                 return;
1600         }
1601
1602         /* recalculate the number of metablocks still need to be reserved */
1603         total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
1604         mdb = ext4_calc_metadata_amount(inode, total);
1605
1606         /* figure out how many metablocks to release */
1607         BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1608         mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1609
1610         release = to_free + mdb_free;
1611
1612         /* update fs free blocks counter for truncate case */
1613         percpu_counter_add(&sbi->s_freeblocks_counter, release);
1614
1615         /* update per-inode reservations */
1616         BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
1617         EXT4_I(inode)->i_reserved_data_blocks -= to_free;
1618
1619         BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1620         EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1621         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1622 }
1623
1624 static void ext4_da_page_release_reservation(struct page *page,
1625                                                 unsigned long offset)
1626 {
1627         int to_release = 0;
1628         struct buffer_head *head, *bh;
1629         unsigned int curr_off = 0;
1630
1631         head = page_buffers(page);
1632         bh = head;
1633         do {
1634                 unsigned int next_off = curr_off + bh->b_size;
1635
1636                 if ((offset <= curr_off) && (buffer_delay(bh))) {
1637                         to_release++;
1638                         clear_buffer_delay(bh);
1639                 }
1640                 curr_off = next_off;
1641         } while ((bh = bh->b_this_page) != head);
1642         ext4_da_release_space(page->mapping->host, to_release);
1643 }
1644
1645 /*
1646  * Delayed allocation stuff
1647  */
1648
1649 struct mpage_da_data {
1650         struct inode *inode;
1651         struct buffer_head lbh;                 /* extent of blocks */
1652         unsigned long first_page, next_page;    /* extent of pages */
1653         get_block_t *get_block;
1654         struct writeback_control *wbc;
1655         int io_done;
1656         long pages_written;
1657 };
1658
1659 /*
1660  * mpage_da_submit_io - walks through extent of pages and try to write
1661  * them with writepage() call back
1662  *
1663  * @mpd->inode: inode
1664  * @mpd->first_page: first page of the extent
1665  * @mpd->next_page: page after the last page of the extent
1666  * @mpd->get_block: the filesystem's block mapper function
1667  *
1668  * By the time mpage_da_submit_io() is called we expect all blocks
1669  * to be allocated. this may be wrong if allocation failed.
1670  *
1671  * As pages are already locked by write_cache_pages(), we can't use it
1672  */
1673 static int mpage_da_submit_io(struct mpage_da_data *mpd)
1674 {
1675         long pages_skipped;
1676         struct pagevec pvec;
1677         unsigned long index, end;
1678         int ret = 0, err, nr_pages, i;
1679         struct inode *inode = mpd->inode;
1680         struct address_space *mapping = inode->i_mapping;
1681
1682         BUG_ON(mpd->next_page <= mpd->first_page);
1683         /*
1684          * We need to start from the first_page to the next_page - 1
1685          * to make sure we also write the mapped dirty buffer_heads.
1686          * If we look at mpd->lbh.b_blocknr we would only be looking
1687          * at the currently mapped buffer_heads.
1688          */
1689         index = mpd->first_page;
1690         end = mpd->next_page - 1;
1691
1692         pagevec_init(&pvec, 0);
1693         while (index <= end) {
1694                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1695                 if (nr_pages == 0)
1696                         break;
1697                 for (i = 0; i < nr_pages; i++) {
1698                         struct page *page = pvec.pages[i];
1699
1700                         index = page->index;
1701                         if (index > end)
1702                                 break;
1703                         index++;
1704
1705                         BUG_ON(!PageLocked(page));
1706                         BUG_ON(PageWriteback(page));
1707
1708                         pages_skipped = mpd->wbc->pages_skipped;
1709                         err = mapping->a_ops->writepage(page, mpd->wbc);
1710                         if (!err)
1711                                 mpd->pages_written++;
1712                         /*
1713                          * In error case, we have to continue because
1714                          * remaining pages are still locked
1715                          * XXX: unlock and re-dirty them?
1716                          */
1717                         if (ret == 0)
1718                                 ret = err;
1719                 }
1720                 pagevec_release(&pvec);
1721         }
1722         return ret;
1723 }
1724
1725 /*
1726  * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
1727  *
1728  * @mpd->inode - inode to walk through
1729  * @exbh->b_blocknr - first block on a disk
1730  * @exbh->b_size - amount of space in bytes
1731  * @logical - first logical block to start assignment with
1732  *
1733  * the function goes through all passed space and put actual disk
1734  * block numbers into buffer heads, dropping BH_Delay
1735  */
1736 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1737                                  struct buffer_head *exbh)
1738 {
1739         struct inode *inode = mpd->inode;
1740         struct address_space *mapping = inode->i_mapping;
1741         int blocks = exbh->b_size >> inode->i_blkbits;
1742         sector_t pblock = exbh->b_blocknr, cur_logical;
1743         struct buffer_head *head, *bh;
1744         pgoff_t index, end;
1745         struct pagevec pvec;
1746         int nr_pages, i;
1747
1748         index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1749         end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1750         cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1751
1752         pagevec_init(&pvec, 0);
1753
1754         while (index <= end) {
1755                 /* XXX: optimize tail */
1756                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1757                 if (nr_pages == 0)
1758                         break;
1759                 for (i = 0; i < nr_pages; i++) {
1760                         struct page *page = pvec.pages[i];
1761
1762                         index = page->index;
1763                         if (index > end)
1764                                 break;
1765                         index++;
1766
1767                         BUG_ON(!PageLocked(page));
1768                         BUG_ON(PageWriteback(page));
1769                         BUG_ON(!page_has_buffers(page));
1770
1771                         bh = page_buffers(page);
1772                         head = bh;
1773
1774                         /* skip blocks out of the range */
1775                         do {
1776                                 if (cur_logical >= logical)
1777                                         break;
1778                                 cur_logical++;
1779                         } while ((bh = bh->b_this_page) != head);
1780
1781                         do {
1782                                 if (cur_logical >= logical + blocks)
1783                                         break;
1784                                 if (buffer_delay(bh)) {
1785                                         bh->b_blocknr = pblock;
1786                                         clear_buffer_delay(bh);
1787                                         bh->b_bdev = inode->i_sb->s_bdev;
1788                                 } else if (buffer_unwritten(bh)) {
1789                                         bh->b_blocknr = pblock;
1790                                         clear_buffer_unwritten(bh);
1791                                         set_buffer_mapped(bh);
1792                                         set_buffer_new(bh);
1793                                         bh->b_bdev = inode->i_sb->s_bdev;
1794                                 } else if (buffer_mapped(bh))
1795                                         BUG_ON(bh->b_blocknr != pblock);
1796
1797                                 cur_logical++;
1798                                 pblock++;
1799                         } while ((bh = bh->b_this_page) != head);
1800                 }
1801                 pagevec_release(&pvec);
1802         }
1803 }
1804
1805
1806 /*
1807  * __unmap_underlying_blocks - just a helper function to unmap
1808  * set of blocks described by @bh
1809  */
1810 static inline void __unmap_underlying_blocks(struct inode *inode,
1811                                              struct buffer_head *bh)
1812 {
1813         struct block_device *bdev = inode->i_sb->s_bdev;
1814         int blocks, i;
1815
1816         blocks = bh->b_size >> inode->i_blkbits;
1817         for (i = 0; i < blocks; i++)
1818                 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1819 }
1820
1821 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
1822                                         sector_t logical, long blk_cnt)
1823 {
1824         int nr_pages, i;
1825         pgoff_t index, end;
1826         struct pagevec pvec;
1827         struct inode *inode = mpd->inode;
1828         struct address_space *mapping = inode->i_mapping;
1829
1830         index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1831         end   = (logical + blk_cnt - 1) >>
1832                                 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1833         while (index <= end) {
1834                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1835                 if (nr_pages == 0)
1836                         break;
1837                 for (i = 0; i < nr_pages; i++) {
1838                         struct page *page = pvec.pages[i];
1839                         index = page->index;
1840                         if (index > end)
1841                                 break;
1842                         index++;
1843
1844                         BUG_ON(!PageLocked(page));
1845                         BUG_ON(PageWriteback(page));
1846                         block_invalidatepage(page, 0);
1847                         ClearPageUptodate(page);
1848                         unlock_page(page);
1849                 }
1850         }
1851         return;
1852 }
1853
1854 /*
1855  * mpage_da_map_blocks - go through given space
1856  *
1857  * @mpd->lbh - bh describing space
1858  * @mpd->get_block - the filesystem's block mapper function
1859  *
1860  * The function skips space we know is already mapped to disk blocks.
1861  *
1862  */
1863 static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
1864 {
1865         int err = 0;
1866         struct buffer_head *lbh = &mpd->lbh;
1867         sector_t next = lbh->b_blocknr;
1868         struct buffer_head new;
1869
1870         /*
1871          * We consider only non-mapped and non-allocated blocks
1872          */
1873         if (buffer_mapped(lbh) && !buffer_delay(lbh))
1874                 return 0;
1875
1876         new.b_state = lbh->b_state;
1877         new.b_blocknr = 0;
1878         new.b_size = lbh->b_size;
1879
1880         /*
1881          * If we didn't accumulate anything
1882          * to write simply return
1883          */
1884         if (!new.b_size)
1885                 return 0;
1886         err = mpd->get_block(mpd->inode, next, &new, 1);
1887         if (err) {
1888
1889                 /* If get block returns with error
1890                  * we simply return. Later writepage
1891                  * will redirty the page and writepages
1892                  * will find the dirty page again
1893                  */
1894                 if (err == -EAGAIN)
1895                         return 0;
1896                 /*
1897                  * get block failure will cause us
1898                  * to loop in writepages. Because
1899                  * a_ops->writepage won't be able to
1900                  * make progress. The page will be redirtied
1901                  * by writepage and writepages will again
1902                  * try to write the same.
1903                  */
1904                 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1905                                   "at logical offset %llu with max blocks "
1906                                   "%zd with error %d\n",
1907                                   __func__, mpd->inode->i_ino,
1908                                   (unsigned long long)next,
1909                                   lbh->b_size >> mpd->inode->i_blkbits, err);
1910                 printk(KERN_EMERG "This should not happen.!! "
1911                                         "Data will be lost\n");
1912                 /* invlaidate all the pages */
1913                 ext4_da_block_invalidatepages(mpd, next,
1914                                 lbh->b_size >> mpd->inode->i_blkbits);
1915                 return err;
1916         }
1917         BUG_ON(new.b_size == 0);
1918
1919         if (buffer_new(&new))
1920                 __unmap_underlying_blocks(mpd->inode, &new);
1921
1922         /*
1923          * If blocks are delayed marked, we need to
1924          * put actual blocknr and drop delayed bit
1925          */
1926         if (buffer_delay(lbh) || buffer_unwritten(lbh))
1927                 mpage_put_bnr_to_bhs(mpd, next, &new);
1928
1929         return 0;
1930 }
1931
1932 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1933                 (1 << BH_Delay) | (1 << BH_Unwritten))
1934
1935 /*
1936  * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1937  *
1938  * @mpd->lbh - extent of blocks
1939  * @logical - logical number of the block in the file
1940  * @bh - bh of the block (used to access block's state)
1941  *
1942  * the function is used to collect contig. blocks in same state
1943  */
1944 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1945                                    sector_t logical, struct buffer_head *bh)
1946 {
1947         sector_t next;
1948         size_t b_size = bh->b_size;
1949         struct buffer_head *lbh = &mpd->lbh;
1950         int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
1951
1952         /* check if thereserved journal credits might overflow */
1953         if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
1954                 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1955                         /*
1956                          * With non-extent format we are limited by the journal
1957                          * credit available.  Total credit needed to insert
1958                          * nrblocks contiguous blocks is dependent on the
1959                          * nrblocks.  So limit nrblocks.
1960                          */
1961                         goto flush_it;
1962                 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
1963                                 EXT4_MAX_TRANS_DATA) {
1964                         /*
1965                          * Adding the new buffer_head would make it cross the
1966                          * allowed limit for which we have journal credit
1967                          * reserved. So limit the new bh->b_size
1968                          */
1969                         b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
1970                                                 mpd->inode->i_blkbits;
1971                         /* we will do mpage_da_submit_io in the next loop */
1972                 }
1973         }
1974         /*
1975          * First block in the extent
1976          */
1977         if (lbh->b_size == 0) {
1978                 lbh->b_blocknr = logical;
1979                 lbh->b_size = b_size;
1980                 lbh->b_state = bh->b_state & BH_FLAGS;
1981                 return;
1982         }
1983
1984         next = lbh->b_blocknr + nrblocks;
1985         /*
1986          * Can we merge the block to our big extent?
1987          */
1988         if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1989                 lbh->b_size += b_size;
1990                 return;
1991         }
1992
1993 flush_it:
1994         /*
1995          * We couldn't merge the block to our extent, so we
1996          * need to flush current  extent and start new one
1997          */
1998         if (mpage_da_map_blocks(mpd) == 0)
1999                 mpage_da_submit_io(mpd);
2000         mpd->io_done = 1;
2001         return;
2002 }
2003
2004 /*
2005  * __mpage_da_writepage - finds extent of pages and blocks
2006  *
2007  * @page: page to consider
2008  * @wbc: not used, we just follow rules
2009  * @data: context
2010  *
2011  * The function finds extents of pages and scan them for all blocks.
2012  */
2013 static int __mpage_da_writepage(struct page *page,
2014                                 struct writeback_control *wbc, void *data)
2015 {
2016         struct mpage_da_data *mpd = data;
2017         struct inode *inode = mpd->inode;
2018         struct buffer_head *bh, *head, fake;
2019         sector_t logical;
2020
2021         if (mpd->io_done) {
2022                 /*
2023                  * Rest of the page in the page_vec
2024                  * redirty then and skip then. We will
2025                  * try to to write them again after
2026                  * starting a new transaction
2027                  */
2028                 redirty_page_for_writepage(wbc, page);
2029                 unlock_page(page);
2030                 return MPAGE_DA_EXTENT_TAIL;
2031         }
2032         /*
2033          * Can we merge this page to current extent?
2034          */
2035         if (mpd->next_page != page->index) {
2036                 /*
2037                  * Nope, we can't. So, we map non-allocated blocks
2038                  * and start IO on them using writepage()
2039                  */
2040                 if (mpd->next_page != mpd->first_page) {
2041                         if (mpage_da_map_blocks(mpd) == 0)
2042                                 mpage_da_submit_io(mpd);
2043                         /*
2044                          * skip rest of the page in the page_vec
2045                          */
2046                         mpd->io_done = 1;
2047                         redirty_page_for_writepage(wbc, page);
2048                         unlock_page(page);
2049                         return MPAGE_DA_EXTENT_TAIL;
2050                 }
2051
2052                 /*
2053                  * Start next extent of pages ...
2054                  */
2055                 mpd->first_page = page->index;
2056
2057                 /*
2058                  * ... and blocks
2059                  */
2060                 mpd->lbh.b_size = 0;
2061                 mpd->lbh.b_state = 0;
2062                 mpd->lbh.b_blocknr = 0;
2063         }
2064
2065         mpd->next_page = page->index + 1;
2066         logical = (sector_t) page->index <<
2067                   (PAGE_CACHE_SHIFT - inode->i_blkbits);
2068
2069         if (!page_has_buffers(page)) {
2070                 /*
2071                  * There is no attached buffer heads yet (mmap?)
2072                  * we treat the page asfull of dirty blocks
2073                  */
2074                 bh = &fake;
2075                 bh->b_size = PAGE_CACHE_SIZE;
2076                 bh->b_state = 0;
2077                 set_buffer_dirty(bh);
2078                 set_buffer_uptodate(bh);
2079                 mpage_add_bh_to_extent(mpd, logical, bh);
2080                 if (mpd->io_done)
2081                         return MPAGE_DA_EXTENT_TAIL;
2082         } else {
2083                 /*
2084                  * Page with regular buffer heads, just add all dirty ones
2085                  */
2086                 head = page_buffers(page);
2087                 bh = head;
2088                 do {
2089                         BUG_ON(buffer_locked(bh));
2090                         /*
2091                          * We need to try to allocate
2092                          * unmapped blocks in the same page.
2093                          * Otherwise we won't make progress
2094                          * with the page in ext4_da_writepage
2095                          */
2096                         if (buffer_dirty(bh) &&
2097                                 (!buffer_mapped(bh) || buffer_delay(bh))) {
2098                                 mpage_add_bh_to_extent(mpd, logical, bh);
2099                                 if (mpd->io_done)
2100                                         return MPAGE_DA_EXTENT_TAIL;
2101                         } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2102                                 /*
2103                                  * mapped dirty buffer. We need to update
2104                                  * the b_state because we look at
2105                                  * b_state in mpage_da_map_blocks. We don't
2106                                  * update b_size because if we find an
2107                                  * unmapped buffer_head later we need to
2108                                  * use the b_state flag of that buffer_head.
2109                                  */
2110                                 if (mpd->lbh.b_size == 0)
2111                                         mpd->lbh.b_state =
2112                                                 bh->b_state & BH_FLAGS;
2113                         }
2114                         logical++;
2115                 } while ((bh = bh->b_this_page) != head);
2116         }
2117
2118         return 0;
2119 }
2120
2121 /*
2122  * mpage_da_writepages - walk the list of dirty pages of the given
2123  * address space, allocates non-allocated blocks, maps newly-allocated
2124  * blocks to existing bhs and issue IO them
2125  *
2126  * @mapping: address space structure to write
2127  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2128  * @get_block: the filesystem's block mapper function.
2129  *
2130  * This is a library function, which implements the writepages()
2131  * address_space_operation.
2132  */
2133 static int mpage_da_writepages(struct address_space *mapping,
2134                                struct writeback_control *wbc,
2135                                get_block_t get_block)
2136 {
2137         struct mpage_da_data mpd;
2138         long to_write;
2139         int ret;
2140
2141         if (!get_block)
2142                 return generic_writepages(mapping, wbc);
2143
2144         mpd.wbc = wbc;
2145         mpd.inode = mapping->host;
2146         mpd.lbh.b_size = 0;
2147         mpd.lbh.b_state = 0;
2148         mpd.lbh.b_blocknr = 0;
2149         mpd.first_page = 0;
2150         mpd.next_page = 0;
2151         mpd.get_block = get_block;
2152         mpd.io_done = 0;
2153         mpd.pages_written = 0;
2154
2155         to_write = wbc->nr_to_write;
2156
2157         ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
2158
2159         /*
2160          * Handle last extent of pages
2161          */
2162         if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2163                 if (mpage_da_map_blocks(&mpd) == 0)
2164                         mpage_da_submit_io(&mpd);
2165         }
2166
2167         wbc->nr_to_write = to_write - mpd.pages_written;
2168         return ret;
2169 }
2170
2171 /*
2172  * this is a special callback for ->write_begin() only
2173  * it's intention is to return mapped block or reserve space
2174  */
2175 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2176                                   struct buffer_head *bh_result, int create)
2177 {
2178         int ret = 0;
2179         sector_t invalid_block = ~((sector_t) 0xffff);
2180
2181         if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2182                 invalid_block = ~0;
2183
2184         BUG_ON(create == 0);
2185         BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2186
2187         /*
2188          * first, we need to know whether the block is allocated already
2189          * preallocated blocks are unmapped but should treated
2190          * the same as allocated blocks.
2191          */
2192         ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
2193         if ((ret == 0) && !buffer_delay(bh_result)) {
2194                 /* the block isn't (pre)allocated yet, let's reserve space */
2195                 /*
2196                  * XXX: __block_prepare_write() unmaps passed block,
2197                  * is it OK?
2198                  */
2199                 ret = ext4_da_reserve_space(inode, 1);
2200                 if (ret)
2201                         /* not enough space to reserve */
2202                         return ret;
2203
2204                 map_bh(bh_result, inode->i_sb, invalid_block);
2205                 set_buffer_new(bh_result);
2206                 set_buffer_delay(bh_result);
2207         } else if (ret > 0) {
2208                 bh_result->b_size = (ret << inode->i_blkbits);
2209                 /*
2210                  * With sub-block writes into unwritten extents
2211                  * we also need to mark the buffer as new so that
2212                  * the unwritten parts of the buffer gets correctly zeroed.
2213                  */
2214                 if (buffer_unwritten(bh_result))
2215                         set_buffer_new(bh_result);
2216                 ret = 0;
2217         }
2218
2219         return ret;
2220 }
2221 #define         EXT4_DELALLOC_RSVED     1
2222 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2223                                    struct buffer_head *bh_result, int create)
2224 {
2225         int ret;
2226         unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2227         loff_t disksize = EXT4_I(inode)->i_disksize;
2228         handle_t *handle = NULL;
2229
2230         handle = ext4_journal_current_handle();
2231         if (!handle) {
2232                 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2233                                    bh_result, 0, 0, 0);
2234                 BUG_ON(!ret);
2235         } else {
2236                 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2237                                    bh_result, create, 0, EXT4_DELALLOC_RSVED);
2238         }
2239
2240         if (ret > 0) {
2241                 bh_result->b_size = (ret << inode->i_blkbits);
2242
2243                 /*
2244                  * Update on-disk size along with block allocation
2245                  * we don't use 'extend_disksize' as size may change
2246                  * within already allocated block -bzzz
2247                  */
2248                 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2249                 if (disksize > i_size_read(inode))
2250                         disksize = i_size_read(inode);
2251                 if (disksize > EXT4_I(inode)->i_disksize) {
2252                         /*
2253                          * XXX: replace with spinlock if seen contended -bzzz
2254                          */
2255                         down_write(&EXT4_I(inode)->i_data_sem);
2256                         if (disksize > EXT4_I(inode)->i_disksize)
2257                                 EXT4_I(inode)->i_disksize = disksize;
2258                         up_write(&EXT4_I(inode)->i_data_sem);
2259
2260                         if (EXT4_I(inode)->i_disksize == disksize) {
2261                                 ret = ext4_mark_inode_dirty(handle, inode);
2262                                 return ret;
2263                         }
2264                 }
2265                 ret = 0;
2266         }
2267         return ret;
2268 }
2269
2270 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2271 {
2272         /*
2273          * unmapped buffer is possible for holes.
2274          * delay buffer is possible with delayed allocation
2275          */
2276         return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
2277 }
2278
2279 static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
2280                                    struct buffer_head *bh_result, int create)
2281 {
2282         int ret = 0;
2283         unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2284
2285         /*
2286          * we don't want to do block allocation in writepage
2287          * so call get_block_wrap with create = 0
2288          */
2289         ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
2290                                    bh_result, 0, 0, 0);
2291         if (ret > 0) {
2292                 bh_result->b_size = (ret << inode->i_blkbits);
2293                 ret = 0;
2294         }
2295         return ret;
2296 }
2297
2298 /*
2299  * get called via ext4_da_writepages after taking page lock (have journal handle)
2300  * get called via journal_submit_inode_data_buffers (no journal handle)
2301  * get called via shrink_page_list via pdflush (no journal handle)
2302  * or grab_page_cache when doing write_begin (have journal handle)
2303  */
2304 static int ext4_da_writepage(struct page *page,
2305                                 struct writeback_control *wbc)
2306 {
2307         int ret = 0;
2308         loff_t size;
2309         unsigned long len;
2310         struct buffer_head *page_bufs;
2311         struct inode *inode = page->mapping->host;
2312
2313         size = i_size_read(inode);
2314         if (page->index == size >> PAGE_CACHE_SHIFT)
2315                 len = size & ~PAGE_CACHE_MASK;
2316         else
2317                 len = PAGE_CACHE_SIZE;
2318
2319         if (page_has_buffers(page)) {
2320                 page_bufs = page_buffers(page);
2321                 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2322                                         ext4_bh_unmapped_or_delay)) {
2323                         /*
2324                          * We don't want to do  block allocation
2325                          * So redirty the page and return
2326                          * We may reach here when we do a journal commit
2327                          * via journal_submit_inode_data_buffers.
2328                          * If we don't have mapping block we just ignore
2329                          * them. We can also reach here via shrink_page_list
2330                          */
2331                         redirty_page_for_writepage(wbc, page);
2332                         unlock_page(page);
2333                         return 0;
2334                 }
2335         } else {
2336                 /*
2337                  * The test for page_has_buffers() is subtle:
2338                  * We know the page is dirty but it lost buffers. That means
2339                  * that at some moment in time after write_begin()/write_end()
2340                  * has been called all buffers have been clean and thus they
2341                  * must have been written at least once. So they are all
2342                  * mapped and we can happily proceed with mapping them
2343                  * and writing the page.
2344                  *
2345                  * Try to initialize the buffer_heads and check whether
2346                  * all are mapped and non delay. We don't want to
2347                  * do block allocation here.
2348                  */
2349                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2350                                                 ext4_normal_get_block_write);
2351                 if (!ret) {
2352                         page_bufs = page_buffers(page);
2353                         /* check whether all are mapped and non delay */
2354                         if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2355                                                 ext4_bh_unmapped_or_delay)) {
2356                                 redirty_page_for_writepage(wbc, page);
2357                                 unlock_page(page);
2358                                 return 0;
2359                         }
2360                 } else {
2361                         /*
2362                          * We can't do block allocation here
2363                          * so just redity the page and unlock
2364                          * and return
2365                          */
2366                         redirty_page_for_writepage(wbc, page);
2367                         unlock_page(page);
2368                         return 0;
2369                 }
2370                 /* now mark the buffer_heads as dirty and uptodate */
2371                 block_commit_write(page, 0, PAGE_CACHE_SIZE);
2372         }
2373
2374         if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2375                 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
2376         else
2377                 ret = block_write_full_page(page,
2378                                                 ext4_normal_get_block_write,
2379                                                 wbc);
2380
2381         return ret;
2382 }
2383
2384 /*
2385  * This is called via ext4_da_writepages() to
2386  * calulate the total number of credits to reserve to fit
2387  * a single extent allocation into a single transaction,
2388  * ext4_da_writpeages() will loop calling this before
2389  * the block allocation.
2390  */
2391
2392 static int ext4_da_writepages_trans_blocks(struct inode *inode)
2393 {
2394         int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2395
2396         /*
2397          * With non-extent format the journal credit needed to
2398          * insert nrblocks contiguous block is dependent on
2399          * number of contiguous block. So we will limit
2400          * number of contiguous block to a sane value
2401          */
2402         if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
2403             (max_blocks > EXT4_MAX_TRANS_DATA))
2404                 max_blocks = EXT4_MAX_TRANS_DATA;
2405
2406         return ext4_chunk_trans_blocks(inode, max_blocks);
2407 }
2408
/*
 * ->writepages for delayed allocation.
 *
 * Repeatedly starts a transaction sized for one extent allocation, lets
 * mpage_da_writepages() write (and allocate) with
 * ext4_da_get_block_write, and stops the transaction again, until the
 * page budget in wbc->nr_to_write is used up, an error occurs, or the
 * callee leaves budget unused.
 *
 * Returns 0 on success, -EROFS if the filesystem has aborted, or the
 * error from starting the transaction, filing the inode, or
 * mpage_da_writepages().  On exit wbc->nr_to_write is restored minus any
 * bump applied here and wbc->range_start is reset to its entry value.
 */
static int ext4_da_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
{
        handle_t *handle = NULL;
        loff_t range_start = 0;
        struct inode *inode = mapping->host;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
        long to_write, pages_skipped = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
         * because that could violate lock ordering on umount
         */
        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;

        /*
         * If the filesystem has aborted, it is read-only, so return
         * right away instead of dumping stack traces later on that
         * will obscure the real source of the problem.  We test
         * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
         * the latter could be true if the filesystem is mounted
         * read-only, and in that case, ext4_da_writepages should
         * *never* be called, so if that ever happens, we would want
         * the stack trace.
         */
        if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
                return -EROFS;

        /*
         * Make sure nr_to_write is >= sbi->s_mb_stream_request.
         * This makes sure the blocks of small files are allocated in a
         * single attempt, so that small files get less fragmented.
         * The amount added is remembered in nr_to_writebump and
         * subtracted again before returning.
         */
        if (wbc->nr_to_write < sbi->s_mb_stream_request) {
                nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
                wbc->nr_to_write = sbi->s_mb_stream_request;
        }

        if (!wbc->range_cyclic)
                /*
                 * If range_cyclic is not set force range_cont;
                 * the original range_start is saved just below
                 */
                wbc->range_cont = 1;

        /* snapshot the values that are restored / compared at the end */
        range_start =  wbc->range_start;
        pages_skipped = wbc->pages_skipped;

restart_loop:
        to_write = wbc->nr_to_write;
        while (!ret && to_write > 0) {

                /*
                 * We insert one extent at a time, so we need the
                 * credits for a single extent allocation.
                 * Journalled mode is currently not supported
                 * by delalloc.
                 */
                BUG_ON(ext4_should_journal_data(inode));
                needed_blocks = ext4_da_writepages_trans_blocks(inode);

                /* start a new transaction */
                handle = ext4_journal_start(inode, needed_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        printk(KERN_CRIT "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d\n", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
                        dump_stack();
                        goto out_writepages;
                }
                if (ext4_should_order_data(inode)) {
                        /*
                         * With ordered mode we need to add
                         * the inode to the journal handle
                         * when we do block allocation.
                         */
                        ret = ext4_jbd2_file_inode(handle, inode);
                        if (ret) {
                                ext4_journal_stop(handle);
                                goto out_writepages;
                        }
                }

                /*
                 * Subtract the budget handed to the callee here and add
                 * back what is left in wbc->nr_to_write afterwards, so
                 * to_write drops by the difference.
                 * NOTE(review): presumably mpage_da_writepages()
                 * decrements wbc->nr_to_write per page written - confirm.
                 */
                to_write -= wbc->nr_to_write;
                ret = mpage_da_writepages(mapping, wbc,
                                          ext4_da_get_block_write);
                ext4_journal_stop(handle);
                if (ret == MPAGE_DA_EXTENT_TAIL) {
                        /*
                         * got one extent now try with
                         * rest of the pages
                         */
                        to_write += wbc->nr_to_write;
                        ret = 0;
                } else if (wbc->nr_to_write) {
                        /*
                         * There is no more writeout needed
                         * or we requested a nonblocking writeout
                         * and we found the device congested
                         */
                        to_write += wbc->nr_to_write;
                        break;
                }
                wbc->nr_to_write = to_write;
        }

        if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
                /*
                 * We skipped pages in this loop: rewind to the original
                 * range start, extend the budget by the number of pages
                 * skipped, and run the loop again.
                 */
                wbc->range_start = range_start;
                wbc->nr_to_write = to_write +
                                wbc->pages_skipped - pages_skipped;
                wbc->pages_skipped = pages_skipped;
                goto restart_loop;
        }

out_writepages:
        /* undo the s_mb_stream_request bump and restore the range start */
        wbc->nr_to_write = to_write - nr_to_writebump;
        wbc->range_start = range_start;
        return ret;
}
2534
2535 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2536                                 loff_t pos, unsigned len, unsigned flags,
2537                                 struct page **pagep, void **fsdata)
2538 {
2539         int ret, retries = 0;
2540         struct page *page;
2541         pgoff_t index;
2542         unsigned from, to;
2543         struct inode *inode = mapping->host;
2544         handle_t *handle;
2545
2546         index = pos >> PAGE_CACHE_SHIFT;
2547         from = pos & (PAGE_CACHE_SIZE - 1);
2548         to = from + len;
2549
2550 retry:
2551         /*
2552          * With delayed allocation, we don't log the i_disksize update
2553          * if there is delayed block allocation. But we still need
2554          * to journalling the i_disksize update if writes to the end
2555          * of file which has an already mapped buffer.
2556          */
2557         handle = ext4_journal_start(inode, 1);
2558         if (IS_ERR(handle)) {
2559                 ret = PTR_ERR(handle);
2560                 goto out;
2561         }
2562         /* We cannot recurse into the filesystem as the transaction is already
2563          * started */
2564         flags |= AOP_FLAG_NOFS;
2565
2566         page = grab_cache_page_write_begin(mapping, index, flags);
2567         if (!page) {
2568                 ext4_journal_stop(handle);
2569                 ret = -ENOMEM;
2570                 goto out;
2571         }
2572         *pagep = page;
2573
2574         ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2575                                                         ext4_da_get_block_prep);
2576         if (ret < 0) {
2577                 unlock_page(page);
2578                 ext4_journal_stop(handle);
2579                 page_cache_release(page);
2580         }
2581
2582         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2583                 goto retry;
2584 out:
2585         return ret;
2586 }
2587
2588 /*
2589  * Check if we should update i_disksize
2590  * when write to the end of file but not require block allocation
2591  */
2592 static int ext4_da_should_update_i_disksize(struct page *page,
2593                                          unsigned long offset)
2594 {
2595         struct buffer_head *bh;
2596         struct inode *inode = page->mapping->host;
2597         unsigned int idx;
2598         int i;
2599
2600         bh = page_buffers(page);
2601         idx = offset >> inode->i_blkbits;
2602
2603         for (i=0; i < idx; i++)
2604                 bh = bh->b_this_page;
2605
2606         if (!buffer_mapped(bh) || (buffer_delay(bh)))
2607                 return 0;
2608         return 1;
2609 }
2610
2611 static int ext4_da_write_end(struct file *file,
2612                                 struct address_space *mapping,
2613                                 loff_t pos, unsigned len, unsigned copied,
2614                                 struct page *page, void *fsdata)
2615 {
2616         struct inode *inode = mapping->host;
2617         int ret = 0, ret2;
2618         handle_t *handle = ext4_journal_current_handle();
2619         loff_t new_i_size;
2620         unsigned long start, end;
2621
2622         start = pos & (PAGE_CACHE_SIZE - 1);
2623         end = start + copied -1;
2624
2625         /*
2626          * generic_write_end() will run mark_inode_dirty() if i_size
2627          * changes.  So let's piggyback the i_disksize mark_inode_dirty
2628          * into that.
2629          */
2630
2631         new_i_size = pos + copied;
2632         if (new_i_size > EXT4_I(inode)->i_disksize) {
2633                 if (ext4_da_should_update_i_disksize(page, end)) {
2634                         down_write(&EXT4_I(inode)->i_data_sem);
2635                         if (new_i_size > EXT4_I(inode)->i_disksize) {
2636                                 /*
2637                                  * Updating i_disksize when extending file
2638                                  * without needing block allocation
2639                                  */
2640                                 if (ext4_should_order_data(inode))
2641                                         ret = ext4_jbd2_file_inode(handle,
2642                                                                    inode);
2643
2644                                 EXT4_I(inode)->i_disksize = new_i_size;
2645                         }
2646                         up_write(&EXT4_I(inode)->i_data_sem);
2647                 }
2648         }
2649         ret2 = generic_write_end(file, mapping, pos, len, copied,
2650                                                         page, fsdata);
2651         copied = ret2;
2652         if (ret2 < 0)
2653                 ret = ret2;
2654         ret2 = ext4_journal_stop(handle);
2655         if (!ret)
2656                 ret = ret2;
2657
2658         return ret ? ret : copied;
2659 }
2660
/*
 * ->invalidatepage for delayed allocation: drop the block reservations
 * held by the page's buffers (if it has any), then fall through to the
 * regular ext4_invalidatepage().  The page must be locked.
 */
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
        BUG_ON(!PageLocked(page));

        /* Drop reserved blocks */
        if (page_has_buffers(page))
                ext4_da_page_release_reservation(page, offset);

        ext4_invalidatepage(page, offset);
}
2677
2678 /*
2679  * Force all delayed allocation blocks to be allocated for a given inode.
2680  */
2681 int ext4_alloc_da_blocks(struct inode *inode)
2682 {
2683         if (!EXT4_I(inode)->i_reserved_data_blocks &&
2684             !EXT4_I(inode)->i_reserved_meta_blocks)
2685                 return 0;
2686
2687         /*
2688          * We do something simple for now.  The filemap_flush() will
2689          * also start triggering a write of the data blocks, which is
2690          * not strictly speaking necessary (and for users of
2691          * laptop_mode, not even desirable).  However, to do otherwise
2692          * would require replicating code paths in:
2693          *
2694          * ext4_da_writepages() ->
2695          *    write_cache_pages() ---> (via passed in callback function)
2696          *        __mpage_da_writepage() -->
2697          *           mpage_add_bh_to_extent()
2698          *           mpage_da_map_blocks()
2699          *
2700          * The problem is that write_cache_pages(), located in
2701          * mm/page-writeback.c, marks pages clean in preparation for
2702          * doing I/O, which is not desirable if we're not planning on
2703          * doing I/O at all.
2704          *
2705          * We could call write_cache_pages(), and then redirty all of
2706          * the pages by calling redirty_page_for_writeback() but that
2707          * would be ugly in the extreme.  So instead we would need to
2708          * replicate parts of the code in the above functions,
2709          * simplifying them becuase we wouldn't actually intend to
2710          * write out the pages, but rather only collect contiguous
2711          * logical block extents, call the multi-block allocator, and
2712          * then update the buffer heads with the block allocations.
2713          *
2714          * For now, though, we'll cheat by calling filemap_flush(),
2715          * which will map the blocks, and start the I/O, but not
2716          * actually wait for the I/O to complete.
2717          */
2718         return filemap_flush(inode->i_mapping);
2719 }
2720
2721 /*
2722  * bmap() is special.  It gets used by applications such as lilo and by
2723  * the swapper to find the on-disk block of a specific piece of data.
2724  *
2725  * Naturally, this is dangerous if the block concerned is still in the
2726  * journal.  If somebody makes a swapfile on an ext4 data-journaling
2727  * filesystem and enables swap, then they may get a nasty shock when the
2728  * data getting swapped to that swapfile suddenly gets overwritten by
2729  * the original zero's written out previously to the journal and
2730  * awaiting writeback in the kernel's buffer cache.
2731  *
2732  * So, if we see any bmap calls here on a modified, data-journaled file,
2733  * take extra steps to flush any blocks which might be in the cache.
2734  */
2735 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2736 {
2737         struct inode *inode = mapping->host;
2738         journal_t *journal;
2739         int err;
2740
2741         if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2742                         test_opt(inode->i_sb, DELALLOC)) {
2743                 /*
2744                  * With delalloc we want to sync the file
2745                  * so that we can make sure we allocate
2746                  * blocks for file
2747                  */
2748                 filemap_write_and_wait(mapping);
2749         }
2750
2751         if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2752                 /*
2753                  * This is a REALLY heavyweight approach, but the use of
2754                  * bmap on dirty files is expected to be extremely rare:
2755                  * only if we run lilo or swapon on a freshly made file
2756                  * do we expect this to happen.
2757                  *
2758                  * (bmap requires CAP_SYS_RAWIO so this does not
2759                  * represent an unprivileged user DOS attack --- we'd be
2760                  * in trouble if mortal users could trigger this path at
2761                  * will.)
2762                  *
2763                  * NB. EXT4_STATE_JDATA is not set on files other than
2764                  * regular files.  If somebody wants to bmap a directory
2765                  * or symlink and gets confused because the buffer
2766                  * hasn't yet been flushed to disk, they deserve
2767                  * everything they get.
2768                  */
2769
2770                 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
2771                 journal = EXT4_JOURNAL(inode);
2772                 jbd2_journal_lock_updates(journal);
2773                 err = jbd2_journal_flush(journal);
2774                 jbd2_journal_unlock_updates(journal);
2775
2776                 if (err)
2777                         return 0;
2778         }
2779
2780         return generic_block_bmap(mapping,block,ext4_get_block);
2781 }
2782
2783 static int bget_one(handle_t *handle, struct buffer_head *bh)
2784 {
2785         get_bh(bh);
2786         return 0;
2787 }
2788
2789 static int bput_one(handle_t *handle, struct buffer_head *bh)
2790 {
2791         put_bh(bh);
2792         return 0;
2793 }
2794
2795 /*
2796  * Note that we don't need to start a transaction unless we're journaling data
2797  * because we should have holes filled from ext4_page_mkwrite(). We even don't
2798  * need to file the inode to the transaction's list in ordered mode because if
2799  * we are writing back data added by write(), the inode is already there and if
2800  * we are writing back data modified via mmap(), noone guarantees in which
2801  * transaction the data will hit the disk. In case we are journaling data, we
2802  * cannot start transaction directly because transaction start ranks above page
2803  * lock so we have to do some magic.
2804  *
2805  * In all journaling modes block_write_full_page() will start the I/O.
2806  *
2807  * Problem:
2808  *
2809  *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2810  *              ext4_writepage()
2811  *
2812  * Similar for:
2813  *
2814  *      ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
2815  *
2816  * Same applies to ext4_get_block().  We will deadlock on various things like
2817  * lock_journal and i_data_sem
2818  *
2819  * Setting PF_MEMALLOC here doesn't work - too many internal memory
2820  * allocations fail.
2821  *
2822  * 16May01: If we're reentered then journal_current_handle() will be
2823  *          non-zero. We simply *return*.
2824  *
2825  * 1 July 2001: @@@ FIXME:
2826  *   In journalled data mode, a data buffer may be metadata against the
2827  *   current transaction.  But the same file is part of a shared mapping
2828  *   and someone does a writepage() on it.
2829  *
2830  *   We will move the buffer onto the async_data list, but *after* it has
2831  *   been dirtied. So there's a small window where we have dirty data on
2832  *   BJ_Metadata.
2833  *
2834  *   Note that this only applies to the last partial page in the file.  The
2835  *   bit which block_write_full_page() uses prepare/commit for.  (That's
2836  *   broken code anyway: it's wrong for msync()).
2837  *
2838  *   It's a rare case: affects the final partial page, for journalled data
2839  *   where the file is subject to bith write() and writepage() in the same
2840  *   transction.  To fix it we'll need a custom block_write_full_page().
2841  *   We'll probably need that anyway for journalling writepage() output.
2842  *
2843  * We don't honour synchronous mounts for writepage().  That would be
2844  * disastrous.  Any write() or metadata operation will sync the fs for
2845  * us.
2846  *
2847  */
2848 static int __ext4_normal_writepage(struct page *page,
2849                                 struct writeback_control *wbc)
2850 {
2851         struct inode *inode = page->mapping->host;
2852
2853         if (test_opt(inode->i_sb, NOBH))
2854                 return nobh_writepage(page,
2855                                         ext4_normal_get_block_write, wbc);
2856         else
2857                 return block_write_full_page(page,
2858                                                 ext4_normal_get_block_write,
2859                                                 wbc);
2860 }
2861
2862 static int ext4_normal_writepage(struct page *page,
2863                                 struct writeback_control *wbc)
2864 {
2865         struct inode *inode = page->mapping->host;
2866         loff_t size = i_size_read(inode);
2867         loff_t len;
2868
2869         J_ASSERT(PageLocked(page));
2870         if (page->index == size >> PAGE_CACHE_SHIFT)
2871                 len = size & ~PAGE_CACHE_MASK;
2872         else
2873                 len = PAGE_CACHE_SIZE;
2874
2875         if (page_has_buffers(page)) {
2876                 /* if page has buffers it should all be mapped
2877                  * and allocated. If there are not buffers attached
2878                  * to the page we know the page is dirty but it lost
2879                  * buffers. That means that at some moment in time
2880                  * after write_begin() / write_end() has been called
2881                  * all buffers have been clean and thus they must have been
2882                  * written at least once. So they are all mapped and we can
2883                  * happily proceed with mapping them and writing the page.
2884                  */
2885                 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2886                                         ext4_bh_unmapped_or_delay));
2887         }
2888
2889         if (!ext4_journal_current_handle())
2890                 return __ext4_normal_writepage(page, wbc);
2891
2892         redirty_page_for_writepage(wbc, page);
2893         unlock_page(page);
2894         return 0;
2895 }
2896
2897 static int __ext4_journalled_writepage(struct page *page,
2898                                 struct writeback_control *wbc)
2899 {
2900         struct address_space *mapping = page->mapping;
2901         struct inode *inode = mapping->host;
2902         struct buffer_head *page_bufs;
2903         handle_t *handle = NULL;
2904         int ret = 0;
2905         int err;
2906
2907         ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2908                                         ext4_normal_get_block_write);
2909         if (ret != 0)
2910                 goto out_unlock;
2911
2912         page_bufs = page_buffers(page);
2913         walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
2914                                                                 bget_one);
2915         /* As soon as we unlock the page, it can go away, but we have
2916          * references to buffers so we are safe */
2917         unlock_page(page);
2918
2919         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2920         if (IS_ERR(handle)) {
2921                 ret = PTR_ERR(handle);
2922                 goto out;
2923         }
2924
2925         ret = walk_page_buffers(handle, page_bufs, 0,
2926                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
2927
2928         err = walk_page_buffers(handle, page_bufs, 0,
2929                                 PAGE_CACHE_SIZE, NULL, write_end_fn);
2930         if (ret == 0)
2931                 ret = err;
2932         err = ext4_journal_stop(handle);
2933         if (!ret)
2934                 ret = err;
2935
2936         walk_page_buffers(handle, page_bufs, 0,
2937                                 PAGE_CACHE_SIZE, NULL, bput_one);
2938         EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
2939         goto out;
2940
2941 out_unlock:
2942         unlock_page(page);
2943 out:
2944         return ret;
2945 }
2946
2947 static int ext4_journalled_writepage(struct page *page,
2948                                 struct writeback_control *wbc)
2949 {
2950         struct inode *inode = page->mapping->host;
2951         loff_t size = i_size_read(inode);
2952         loff_t len;
2953
2954         J_ASSERT(PageLocked(page));
2955         if (page->index == size >> PAGE_CACHE_SHIFT)
2956                 len = size & ~PAGE_CACHE_MASK;
2957         else
2958                 len = PAGE_CACHE_SIZE;
2959
2960         if (page_has_buffers(page)) {
2961                 /* if page has buffers it should all be mapped
2962                  * and allocated. If there are not buffers attached
2963                  * to the page we know the page is dirty but it lost
2964                  * buffers. That means that at some moment in time
2965                  * after write_begin() / write_end() has been called
2966                  * all buffers have been clean and thus they must have been
2967                  * written at least once. So they are all mapped and we can
2968                  * happily proceed with mapping them and writing the page.
2969                  */
2970                 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2971                                         ext4_bh_unmapped_or_delay));
2972         }
2973
2974         if (ext4_journal_current_handle())
2975                 goto no_write;
2976
2977         if (PageChecked(page)) {
2978                 /*
2979                  * It's mmapped pagecache.  Add buffers and journal it.  There
2980                  * doesn't seem much point in redirtying the page here.
2981                  */
2982                 ClearPageChecked(page);
2983                 return __ext4_journalled_writepage(page, wbc);
2984         } else {
2985                 /*
2986                  * It may be a page full of checkpoint-mode buffers.  We don't
2987                  * really know unless we go poke around in the buffer_heads.
2988                  * But block_write_full_page will do the right thing.
2989                  */
2990                 return block_write_full_page(page,
2991                                                 ext4_normal_get_block_write,
2992                                                 wbc);
2993         }
2994 no_write:
2995         redirty_page_for_writepage(wbc, page);
2996         unlock_page(page);
2997         return 0;
2998 }
2999
3000 static int ext4_readpage(struct file *file, struct page *page)
3001 {
3002         return mpage_readpage(page, ext4_get_block);
3003 }
3004
3005 static int
3006 ext4_readpages(struct file *file, struct address_space *mapping,
3007                 struct list_head *pages, unsigned nr_pages)
3008 {
3009         return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3010 }
3011
3012 static void ext4_invalidatepage(struct page *page, unsigned long offset)
3013 {
3014         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3015
3016         /*
3017          * If it's a full truncate we just forget about the pending dirtying
3018          */
3019         if (offset == 0)
3020                 ClearPageChecked(page);
3021
3022         jbd2_journal_invalidatepage(journal, page, offset);
3023 }
3024
3025 static int ext4_releasepage(struct page *page, gfp_t wait)
3026 {
3027         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3028
3029         WARN_ON(PageChecked(page));
3030         if (!page_has_buffers(page))
3031                 return 0;
3032         return jbd2_journal_try_to_free_buffers(journal, page, wait);
3033 }
3034
3035 /*
3036  * If the O_DIRECT write will extend the file then add this inode to the
3037  * orphan list.  So recovery will truncate it back to the original size
3038  * if the machine crashes during the write.
3039  *
3040  * If the O_DIRECT write is instantiating holes inside i_size and the machine
3041  * crashes then stale disk data _may_ be exposed inside the file. But current
3042  * VFS code falls back into buffered path in that case so we are safe.
3043  */
3044 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3045                         const struct iovec *iov, loff_t offset,
3046                         unsigned long nr_segs)
3047 {
3048         struct file *file = iocb->ki_filp;
3049         struct inode *inode = file->f_mapping->host;
3050         struct ext4_inode_info *ei = EXT4_I(inode);
3051         handle_t *handle;
3052         ssize_t ret;
3053         int orphan = 0;
3054         size_t count = iov_length(iov, nr_segs);
3055
3056         if (rw == WRITE) {
3057                 loff_t final_size = offset + count;
3058
3059                 if (final_size > inode->i_size) {
3060                         /* Credits for sb + inode write */
3061                         handle = ext4_journal_start(inode, 2);
3062                         if (IS_ERR(handle)) {
3063                                 ret = PTR_ERR(handle);
3064                                 goto out;
3065                         }
3066                         ret = ext4_orphan_add(handle, inode);
3067                         if (ret) {
3068                                 ext4_journal_stop(handle);
3069                                 goto out;
3070                         }
3071                         orphan = 1;
3072                         ei->i_disksize = inode->i_size;
3073                         ext4_journal_stop(handle);
3074                 }
3075         }
3076
3077         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3078                                  offset, nr_segs,
3079                                  ext4_get_block, NULL);
3080
3081         if (orphan) {
3082                 int err;
3083
3084                 /* Credits for sb + inode write */
3085                 handle = ext4_journal_start(inode, 2);
3086                 if (IS_ERR(handle)) {
3087                         /* This is really bad luck. We've written the data
3088                          * but cannot extend i_size. Bail out and pretend
3089                          * the write failed... */
3090                         ret = PTR_ERR(handle);
3091                         goto out;
3092                 }
3093                 if (inode->i_nlink)
3094                         ext4_orphan_del(handle, inode);
3095                 if (ret > 0) {
3096                         loff_t end = offset + ret;
3097                         if (end > inode->i_size) {
3098                                 ei->i_disksize = end;
3099                                 i_size_write(inode, end);
3100                                 /*
3101                                  * We're going to return a positive `ret'
3102                                  * here due to non-zero-length I/O, so there's
3103                                  * no way of reporting error returns from
3104                                  * ext4_mark_inode_dirty() to userspace.  So
3105                                  * ignore it.
3106                                  */
3107                                 ext4_mark_inode_dirty(handle, inode);
3108                         }
3109                 }
3110                 err = ext4_journal_stop(handle);
3111                 if (ret == 0)
3112                         ret = err;
3113         }
3114 out:
3115         return ret;
3116 }
3117
/*
 * ->set_page_dirty for data=journal.
 *
 * Pages can be dirtied completely asynchronously from ext4's journalling
 * activity (filemap_sync_pte(), try_to_unmap_one(), etc.).  There is little
 * we can do here: ->set_page_dirty is called under VFS locks and the page
 * is not necessarily locked.
 *
 * Dirtying the page while leaving its buffers clean is not an option,
 * because the buffers' dirty state is "definitive".  Marking the buffers
 * dirty or jbddirty is not an option either, because all the journalling
 * code will explode.
 *
 * So the page is flagged "pending dirty" (the Checked flag) and the next
 * writepage call propagates that into the buffers appropriately.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
	int newly_dirty;

	SetPageChecked(page);
	newly_dirty = __set_page_dirty_nobuffers(page);

	return newly_dirty;
}
3136
3137 static const struct address_space_operations ext4_ordered_aops = {
3138         .readpage               = ext4_readpage,
3139         .readpages              = ext4_readpages,
3140         .writepage              = ext4_normal_writepage,
3141         .sync_page              = block_sync_page,
3142         .write_begin            = ext4_write_begin,
3143         .write_end              = ext4_ordered_write_end,
3144         .bmap                   = ext4_bmap,
3145         .invalidatepage         = ext4_invalidatepage,
3146         .releasepage            = ext4_releasepage,
3147         .direct_IO              = ext4_direct_IO,
3148         .migratepage            = buffer_migrate_page,
3149         .is_partially_uptodate  = block_is_partially_uptodate,
3150 };
3151
3152 static const struct address_space_operations ext4_writeback_aops = {
3153         .readpage               = ext4_readpage,
3154         .readpages              = ext4_readpages,
3155         .writepage              = ext4_normal_writepage,
3156         .sync_page              = block_sync_page,
3157         .write_begin            = ext4_write_begin,
3158         .write_end              = ext4_writeback_write_end,
3159         .bmap                   = ext4_bmap,
3160         .invalidatepage         = ext4_invalidatepage,
3161         .releasepage            = ext4_releasepage,
3162         .direct_IO              = ext4_direct_IO,
3163         .migratepage            = buffer_migrate_page,
3164         .is_partially_uptodate  = block_is_partially_uptodate,
3165 };
3166
3167 static const struct address_space_operations ext4_journalled_aops = {
3168         .readpage               = ext4_readpage,
3169         .readpages              = ext4_readpages,
3170         .writepage              = ext4_journalled_writepage,
3171         .sync_page              = block_sync_page,
3172         .write_begin            = ext4_write_begin,
3173         .write_end              = ext4_journalled_write_end,
3174         .set_page_dirty         = ext4_journalled_set_page_dirty,
3175         .bmap                   = ext4_bmap,
3176         .invalidatepage         = ext4_invalidatepage,
3177         .releasepage            = ext4_releasepage,
3178         .is_partially_uptodate  = block_is_partially_uptodate,
3179 };
3180
3181 static const struct address_space_operations ext4_da_aops = {
3182         .readpage               = ext4_readpage,
3183         .readpages              = ext4_readpages,
3184         .writepage              = ext4_da_writepage,
3185         .writepages             = ext4_da_writepages,
3186         .sync_page              = block_sync_page,
3187         .write_begin            = ext4_da_write_begin,
3188         .write_end              = ext4_da_write_end,
3189         .bmap                   = ext4_bmap,
3190         .invalidatepage         = ext4_da_invalidatepage,
3191         .releasepage            = ext4_releasepage,
3192         .direct_IO              = ext4_direct_IO,
3193         .migratepage            = buffer_migrate_page,
3194         .is_partially_uptodate  = block_is_partially_uptodate,
3195 };
3196
3197 void ext4_set_aops(struct inode *inode)
3198 {
3199         if (ext4_should_order_data(inode) &&
3200                 test_opt(inode->i_sb, DELALLOC))
3201                 inode->i_mapping->a_ops = &ext4_da_aops;
3202         else if (ext4_should_order_data(inode))
3203                 inode->i_mapping->a_ops = &ext4_ordered_aops;
3204         else if (ext4_should_writeback_data(inode) &&
3205                  test_opt(inode->i_sb, DELALLOC))
3206                 inode->i_mapping->a_ops = &ext4_da_aops;
3207         else if (ext4_should_writeback_data(inode))
3208                 inode->i_mapping->a_ops = &ext4_writeback_aops;
3209         else
3210                 inode->i_mapping->a_ops = &ext4_journalled_aops;
3211 }
3212
3213 /*
3214  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3215  * up to the end of the block which corresponds to `from'.
3216  * This required during truncate. We need to physically zero the tail end
3217  * of that block so it doesn't yield old data if the file is later grown.
3218  */
3219 int ext4_block_truncate_page(handle_t *handle,
3220                 struct address_space *mapping, loff_t from)
3221 {
3222         ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3223         unsigned offset = from & (PAGE_CACHE_SIZE-1);
3224         unsigned blocksize, length, pos;
3225         ext4_lblk_t iblock;
3226         struct inode *inode = mapping->host;
3227         struct buffer_head *bh;
3228         struct page *page;
3229         int err = 0;
3230
3231         page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
3232         if (!page)
3233                 return -EINVAL;
3234
3235         blocksize = inode->i_sb->s_blocksize;
3236         length = blocksize - (offset & (blocksize - 1));
3237         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3238
3239         /*
3240          * For "nobh" option,  we can only work if we don't need to
3241          * read-in the page - otherwise we create buffers to do the IO.
3242          */
3243         if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
3244              ext4_should_writeback_data(inode) && PageUptodate(page)) {
3245                 zero_user(page, offset, length);
3246                 set_page_dirty(page);
3247                 goto unlock;
3248         }
3249
3250         if (!page_has_buffers(page))
3251                 create_empty_buffers(page, blocksize, 0);
3252
3253         /* Find the buffer that contains "offset" */
3254         bh = page_buffers(page);
3255         pos = blocksize;
3256         while (offset >= pos) {
3257                 bh = bh->b_this_page;
3258                 iblock++;
3259                 pos += blocksize;
3260         }
3261
3262         err = 0;
3263         if (buffer_freed(bh)) {
3264                 BUFFER_TRACE(bh, "freed: skip");
3265                 goto unlock;
3266         }
3267
3268         if (!buffer_mapped(bh)) {
3269                 BUFFER_TRACE(bh, "unmapped");
3270                 ext4_get_block(inode, iblock, bh, 0);
3271                 /* unmapped? It's a hole - nothing to do */
3272                 if (!buffer_mapped(bh)) {
3273                         BUFFER_TRACE(bh, "still unmapped");
3274                         goto unlock;
3275                 }
3276         }
3277
3278         /* Ok, it's mapped. Make sure it's up-to-date */
3279         if (PageUptodate(page))
3280                 set_buffer_uptodate(bh);
3281
3282         if (!buffer_uptodate(bh)) {
3283                 err = -EIO;
3284                 ll_rw_block(READ, 1, &bh);
3285                 wait_on_buffer(bh);
3286                 /* Uhhuh. Read error. Complain and punt. */
3287                 if (!buffer_uptodate(bh))
3288                         goto unlock;
3289         }
3290
3291         if (ext4_should_journal_data(inode)) {
3292                 BUFFER_TRACE(bh, "get write access");
3293                 err = ext4_journal_get_write_access(handle, bh);
3294                 if (err)
3295                         goto unlock;
3296         }
3297
3298         zero_user(page, offset, length);
3299
3300         BUFFER_TRACE(bh, "zeroed end of block");
3301
3302         err = 0;
3303         if (ext4_should_journal_data(inode)) {
3304                 err = ext4_journal_dirty_metadata(handle, bh);
3305         } else {
3306                 if (ext4_should_order_data(inode))
3307                         err = ext4_jbd2_file_inode(handle, inode);
3308                 mark_buffer_dirty(bh);
3309         }
3310
3311 unlock:
3312         unlock_page(page);
3313         page_cache_release(page);
3314         return err;
3315 }
3316
/*
 * Return 1 if every 32-bit word in [p, q) is zero (an empty range counts as
 * all zero), 0 as soon as a non-zero word is found.
 *
 * Probably it should be a library function... search for first non-zero word
 * or memcmp with zero_page, whatever is better for particular architecture.
 * Linus?
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
	__le32 *cur;

	for (cur = p; cur < q; cur++) {
		if (*cur)
			return 0;
	}

	return 1;
}
3329
3330 /**
3331  *      ext4_find_shared - find the indirect blocks for partial truncation.
3332  *      @inode:   inode in question
3333  *      @depth:   depth of the affected branch
3334  *      @offsets: offsets of pointers in that branch (see ext4_block_to_path)
3335  *      @chain:   place to store the pointers to partial indirect blocks
3336  *      @top:     place to the (detached) top of branch
3337  *
3338  *      This is a helper function used by ext4_truncate().
3339  *
3340  *      When we do truncate() we may have to clean the ends of several
3341  *      indirect blocks but leave the blocks themselves alive. Block is
3342  *      partially truncated if some data below the new i_size is refered
3343  *      from it (and it is on the path to the first completely truncated
3344  *      data block, indeed).  We have to free the top of that path along
3345  *      with everything to the right of the path. Since no allocation
3346  *      past the truncation point is possible until ext4_truncate()
3347  *      finishes, we may safely do the latter, but top of branch may
3348  *      require special attention - pageout below the truncation point
3349  *      might try to populate it.
3350  *
3351  *      We atomically detach the top of branch from the tree, store the
3352  *      block number of its root in *@top, pointers to buffer_heads of
3353  *      partially truncated blocks - in @chain[].bh and pointers to
3354  *      their last elements that should not be removed - in
3355  *      @chain[].p. Return value is the pointer to last filled element
3356  *      of @chain.
3357  *
3358  *      The work left to caller to do the actual freeing of subtrees:
3359  *              a) free the subtree starting from *@top
3360  *              b) free the subtrees whose roots are stored in
3361  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
3362  *              c) free the subtrees growing from the inode past the @chain[0].
3363  *                      (no partially truncated stuff there).  */
3364
3365 static Indirect *ext4_find_shared(struct inode *inode, int depth,
3366                         ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
3367 {
3368         Indirect *partial, *p;
3369         int k, err;
3370
3371         *top = 0;
3372         /* Make k index the deepest non-null offest + 1 */
3373         for (k = depth; k > 1 && !offsets[k-1]; k--)
3374                 ;
3375         partial = ext4_get_branch(inode, k, offsets, chain, &err);
3376         /* Writer: pointers */
3377         if (!partial)
3378                 partial = chain + k-1;
3379         /*
3380          * If the branch acquired continuation since we've looked at it -
3381          * fine, it should all survive and (new) top doesn't belong to us.
3382          */
3383         if (!partial->key && *partial->p)
3384                 /* Writer: end */
3385                 goto no_top;
3386         for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
3387                 ;
3388         /*
3389          * OK, we've found the last block that must survive. The rest of our
3390          * branch should be detached before unlocking. However, if that rest
3391          * of branch is all ours and does not grow immediately from the inode
3392          * it's easier to cheat and just decrement partial->p.
3393          */
3394         if (p == chain + k - 1 && p > chain) {
3395                 p->p--;
3396         } else {
3397                 *top = *p->p;
3398                 /* Nope, don't do this in ext4.  Must leave the tree intact */
3399 #if 0
3400                 *p->p = 0;
3401 #endif
3402         }
3403         /* Writer: end */
3404
3405         while(partial > p) {
3406                 brelse(partial->bh);
3407                 partial--;
3408         }
3409 no_top:
3410         return partial;
3411 }
3412
3413 /*
3414  * Zero a number of block pointers in either an inode or an indirect block.
3415  * If we restart the transaction we must again get write access to the
3416  * indirect block for further modification.
3417  *
3418  * We release `count' blocks on disk, but (last - first) may be greater
3419  * than `count' because there can be holes in there.
3420  */
3421 static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3422                 struct buffer_head *bh, ext4_fsblk_t block_to_free,
3423                 unsigned long count, __le32 *first, __le32 *last)
3424 {
3425         __le32 *p;
3426         if (try_to_extend_transaction(handle, inode)) {
3427                 if (bh) {
3428                         BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
3429                         ext4_journal_dirty_metadata(handle, bh);
3430                 }
3431                 ext4_mark_inode_dirty(handle, inode);
3432                 ext4_journal_test_restart(handle, inode);
3433                 if (bh) {
3434                         BUFFER_TRACE(bh, "retaking write access");
3435                         ext4_journal_get_write_access(handle, bh);
3436                 }
3437         }
3438
3439         /*
3440          * Any buffers which are on the journal will be in memory. We find
3441          * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
3442          * on them.  We've already detached each block from the file, so
3443          * bforget() in jbd2_journal_forget() should be safe.
3444          *
3445          * AKPM: turn on bforget in jbd2_journal_forget()!!!
3446          */
3447         for (p = first; p < last; p++) {
3448                 u32 nr = le32_to_cpu(*p);
3449                 if (nr) {
3450                         struct buffer_head *tbh;
3451
3452                         *p = 0;
3453                         tbh = sb_find_get_block(inode->i_sb, nr);
3454                         ext4_forget(handle, 0, inode, tbh, nr);
3455                 }
3456         }
3457
3458         ext4_free_blocks(handle, inode, block_to_free, count, 0);
3459 }
3460
3461 /**
3462  * ext4_free_data - free a list of data blocks
3463  * @handle:     handle for this transaction
3464  * @inode:      inode we are dealing with
3465  * @this_bh:    indirect buffer_head which contains *@first and *@last
3466  * @first:      array of block numbers
3467  * @last:       points immediately past the end of array
3468  *
3469  * We are freeing all blocks refered from that array (numbers are stored as
3470  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
3471  *
3472  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
3473  * blocks are contiguous then releasing them at one time will only affect one
3474  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
3475  * actually use a lot of journal space.
3476  *
3477  * @this_bh will be %NULL if @first and @last point into the inode's direct
3478  * block pointers.
3479  */
3480 static void ext4_free_data(handle_t *handle, struct inode *inode,
3481                            struct buffer_head *this_bh,
3482                            __le32 *first, __le32 *last)
3483 {
3484         ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
3485         unsigned long count = 0;            /* Number of blocks in the run */
3486         __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
3487                                                corresponding to
3488                                                block_to_free */
3489         ext4_fsblk_t nr;                    /* Current block # */
3490         __le32 *p;                          /* Pointer into inode/ind
3491                                                for current block */
3492         int err;
3493
3494         if (this_bh) {                          /* For indirect block */
3495                 BUFFER_TRACE(this_bh, "get_write_access");
3496                 err = ext4_journal_get_write_access(handle, this_bh);
3497                 /* Important: if we can't update the indirect pointers
3498                  * to the blocks, we can't free them. */
3499                 if (err)
3500                         return;
3501         }
3502
3503         for (p = first; p < last; p++) {
3504                 nr = le32_to_cpu(*p);
3505                 if (nr) {
3506                         /* accumulate blocks to free if they're contiguous */
3507                         if (count == 0) {
3508                                 block_to_free = nr;
3509                                 block_to_free_p = p;
3510                                 count = 1;
3511                         } else if (nr == block_to_free + count) {
3512                                 count++;
3513                         } else {
3514                                 ext4_clear_blocks(handle, inode, this_bh,
3515                                                   block_to_free,
3516                                                   count, block_to_free_p, p);
3517                                 block_to_free = nr;
3518                                 block_to_free_p = p;
3519                                 count = 1;
3520                         }
3521                 }
3522         }
3523
3524         if (count > 0)
3525                 ext4_clear_blocks(handle, inode, this_bh, block_to_free,
3526                                   count, block_to_free_p, p);
3527
3528         if (this_bh) {
3529                 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
3530
3531                 /*
3532                  * The buffer head should have an attached journal head at this
3533                  * point. However, if the data is corrupted and an indirect
3534                  * block pointed to itself, it would have been detached when
3535                  * the block was cleared. Check for this instead of OOPSing.
3536                  */
3537                 if (bh2jh(this_bh))
3538                         ext4_journal_dirty_metadata(handle, this_bh);
3539                 else
3540                         ext4_error(inode->i_sb, __func__,
3541                                    "circular indirect block detected, "
3542                                    "inode=%lu, block=%llu",
3543                                    inode->i_ino,
3544                                    (unsigned long long) this_bh->b_blocknr);
3545         }
3546 }
3547
3548 /**
3549  *      ext4_free_branches - free an array of branches
3550  *      @handle: JBD handle for this transaction
3551  *      @inode: inode we are dealing with
3552  *      @parent_bh: the buffer_head which contains *@first and *@last
3553  *      @first: array of block numbers
3554  *      @last:  pointer immediately past the end of array
3555  *      @depth: depth of the branches to free
3556  *
3557  *      We are freeing all blocks refered from these branches (numbers are
3558  *      stored as little-endian 32-bit) and updating @inode->i_blocks
3559  *      appropriately.
3560  */
3561 static void ext4_free_branches(handle_t *handle, struct inode *inode,
3562                                struct buffer_head *parent_bh,
3563                                __le32 *first, __le32 *last, int depth)
3564 {
3565         ext4_fsblk_t nr;
3566         __le32 *p;
3567
3568         if (is_handle_aborted(handle))
3569                 return;
3570
3571         if (depth--) {
3572                 struct buffer_head *bh;
3573                 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
3574                 p = last;
3575                 while (--p >= first) {
3576                         nr = le32_to_cpu(*p);
3577                         if (!nr)
3578                                 continue;               /* A hole */
3579
3580                         /* Go read the buffer for the next level down */
3581                         bh = sb_bread(inode->i_sb, nr);
3582
3583                         /*
3584                          * A read failure? Report error and clear slot
3585                          * (should be rare).
3586                          */
3587                         if (!bh) {
3588                                 ext4_error(inode->i_sb, "ext4_free_branches",
3589                                            "Read failure, inode=%lu, block=%llu",
3590                                            inode->i_ino, nr);
3591                                 continue;
3592                         }
3593
3594                         /* This zaps the entire block.  Bottom up. */
3595                         BUFFER_TRACE(bh, "free child branches");
3596                         ext4_free_branches(handle, inode, bh,
3597                                            (__le32*)bh->b_data,
3598                                            (__le32*)bh->b_data + addr_per_block,
3599                                            depth);
3600
3601                         /*
3602                          * We've probably journalled the indirect block several
3603                          * times during the truncate.  But it's no longer
3604                          * needed and we now drop it from the transaction via
3605                          * jbd2_journal_revoke().
3606                          *
3607                          * That's easy if it's exclusively part of this
3608                          * transaction.  But if it's part of the committing
3609                          * transaction then jbd2_journal_forget() will simply
3610                          * brelse() it.  That means that if the underlying
3611                          * block is reallocated in ext4_get_block(),
3612                          * unmap_underlying_metadata() will find this block
3613                          * and will try to get rid of it.  damn, damn.
3614                          *
3615                          * If this block has already been committed to the
3616                          * journal, a revoke record will be written.  And
3617                          * revoke records must be emitted *before* clearing
3618                          * this block's bit in the bitmaps.
3619                          */
3620                         ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
3621
3622                         /*
3623                          * Everything below this this pointer has been
3624                          * released.  Now let this top-of-subtree go.
3625                          *
3626                          * We want the freeing of this indirect block to be
3627                          * atomic in the journal with the updating of the
3628                          * bitmap block which owns it.  So make some room in
3629                          * the journal.
3630                          *
3631                          * We zero the parent pointer *after* freeing its
3632                          * pointee in the bitmaps, so if extend_transaction()
3633                          * for some reason fails to put the bitmap changes and
3634                          * the release into the same transaction, recovery
3635                          * will merely complain about releasing a free block,
3636                          * rather than leaking blocks.
3637                          */
3638                         if (is_handle_aborted(handle))
3639                                 return;
3640                         if (try_to_extend_transaction(handle, inode)) {
3641                                 ext4_mark_inode_dirty(handle, inode);
3642                                 ext4_journal_test_restart(handle, inode);
3643                         }
3644
3645                         ext4_free_blocks(handle, inode, nr, 1, 1);
3646
3647                         if (parent_bh) {
3648                                 /*
3649                                  * The block which we have just freed is
3650                                  * pointed to by an indirect block: journal it
3651                                  */
3652                                 BUFFER_TRACE(parent_bh, "get_write_access");
3653                                 if (!ext4_journal_get_write_access(handle,
3654                                                                    parent_bh)){
3655                                         *p = 0;
3656                                         BUFFER_TRACE(parent_bh,
3657                                         "call ext4_journal_dirty_metadata");
3658                                         ext4_journal_dirty_metadata(handle,
3659                                                                     parent_bh);
3660                                 }
3661                         }
3662                 }
3663         } else {
3664                 /* We have reached the bottom of the tree. */
3665                 BUFFER_TRACE(parent_bh, "free data blocks");
3666                 ext4_free_data(handle, inode, parent_bh, first, last);
3667         }
3668 }
3669
3670 int ext4_can_truncate(struct inode *inode)
3671 {
3672         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3673                 return 0;
3674         if (S_ISREG(inode->i_mode))
3675                 return 1;
3676         if (S_ISDIR(inode->i_mode))
3677                 return 1;
3678         if (S_ISLNK(inode->i_mode))
3679                 return !ext4_inode_is_fast_symlink(inode);
3680         return 0;
3681 }
3682
3683 /*
3684  * ext4_truncate()
3685  *
3686  * We block out ext4_get_block() block instantiations across the entire
3687  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
3688  * simultaneously on behalf of the same inode.
3689  *
3690  * As we work through the truncate and commit bits of it to the journal there
3691  * is one core, guiding principle: the file's tree must always be consistent on
3692  * disk.  We must be able to restart the truncate after a crash.
3693  *
3694  * The file's tree may be transiently inconsistent in memory (although it
3695  * probably isn't), but whenever we close off and commit a journal transaction,
3696  * the contents of (the filesystem + the journal) must be consistent and
3697  * restartable.  It's pretty simple, really: bottom up, right to left (although
3698  * left-to-right works OK too).
3699  *
3700  * Note that at recovery time, journal replay occurs *before* the restart of
3701  * truncate against the orphan inode list.
3702  *
3703  * The committed inode has the new, desired i_size (which is the same as
3704  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
3705  * that this inode's truncate did not complete and it will again call
3706  * ext4_truncate() to have another go.  So there will be instantiated blocks
3707  * to the right of the truncation point in a crashed ext4 filesystem.  But
3708  * that's fine - as long as they are linked from the inode, the post-crash
3709  * ext4_truncate() run will find them and release them.
3710  */
3711 void ext4_truncate(struct inode *inode)
3712 {
3713         handle_t *handle;
3714         struct ext4_inode_info *ei = EXT4_I(inode);
3715         __le32 *i_data = ei->i_data;
3716         int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
3717         struct address_space *mapping = inode->i_mapping;
3718         ext4_lblk_t offsets[4];
3719         Indirect chain[4];
3720         Indirect *partial;
3721         __le32 nr = 0;
3722         int n;
3723         ext4_lblk_t last_block;
3724         unsigned blocksize = inode->i_sb->s_blocksize;
3725
3726         if (!ext4_can_truncate(inode))
3727                 return;
3728
3729         if (inode->i_size == 0)
3730                 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3731
3732         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
3733                 ext4_ext_truncate(inode);
3734                 return;
3735         }
3736
3737         handle = start_transaction(inode);
3738         if (IS_ERR(handle))
3739                 return;         /* AKPM: return what? */
3740
3741         last_block = (inode->i_size + blocksize-1)
3742                                         >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
3743
3744         if (inode->i_size & (blocksize - 1))
3745                 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
3746                         goto out_stop;
3747
3748         n = ext4_block_to_path(inode, last_block, offsets, NULL);
3749         if (n == 0)
3750                 goto out_stop;  /* error */
3751
3752         /*
3753          * OK.  This truncate is going to happen.  We add the inode to the
3754          * orphan list, so that if this truncate spans multiple transactions,
3755          * and we crash, we will resume the truncate when the filesystem
3756          * recovers.  It also marks the inode dirty, to catch the new size.
3757          *
3758          * Implication: the file must always be in a sane, consistent
3759          * truncatable state while each transaction commits.
3760          */
3761         if (ext4_orphan_add(handle, inode))
3762                 goto out_stop;
3763
3764         /*
3765          * From here we block out all ext4_get_block() callers who want to
3766          * modify the block allocation tree.
3767          */
3768         down_write(&ei->i_data_sem);
3769
3770         ext4_discard_reservation(inode);
3771
3772         /*
3773          * The orphan list entry will now protect us from any crash which
3774          * occurs before the truncate completes, so it is now safe to propagate
3775          * the new, shorter inode size (held for now in i_size) into the
3776          * on-disk inode. We do this via i_disksize, which is the value which
3777          * ext4 *really* writes onto the disk inode.
3778          */
3779         ei->i_disksize = inode->i_size;
3780
3781         if (n == 1) {           /* direct blocks */
3782                 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
3783                                i_data + EXT4_NDIR_BLOCKS);
3784                 goto do_indirects;
3785         }
3786
3787         partial = ext4_find_shared(inode, n, offsets, chain, &nr);
3788         /* Kill the top of shared branch (not detached) */
3789         if (nr) {
3790                 if (partial == chain) {
3791                         /* Shared branch grows from the inode */
3792                         ext4_free_branches(handle, inode, NULL,
3793                                            &nr, &nr+1, (chain+n-1) - partial);
3794                         *partial->p = 0;
3795                         /*
3796                          * We mark the inode dirty prior to restart,
3797                          * and prior to stop.  No need for it here.
3798                          */
3799                 } else {
3800                         /* Shared branch grows from an indirect block */
3801                         BUFFER_TRACE(partial->bh, "get_write_access");
3802                         ext4_free_branches(handle, inode, partial->bh,
3803                                         partial->p,
3804                                         partial->p+1, (chain+n-1) - partial);
3805                 }
3806         }
3807         /* Clear the ends of indirect blocks on the shared branch */
3808         while (partial > chain) {
3809                 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
3810                                    (__le32*)partial->bh->b_data+addr_per_block,
3811                                    (chain+n-1) - partial);
3812                 BUFFER_TRACE(partial->bh, "call brelse");
3813                 brelse (partial->bh);
3814                 partial--;
3815         }
3816 do_indirects:
3817         /* Kill the remaining (whole) subtrees */
3818         switch (offsets[0]) {
3819         default:
3820                 nr = i_data[EXT4_IND_BLOCK];
3821                 if (nr) {
3822                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
3823                         i_data[EXT4_IND_BLOCK] = 0;
3824                 }
3825         case EXT4_IND_BLOCK:
3826                 nr = i_data[EXT4_DIND_BLOCK];
3827                 if (nr) {
3828                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
3829                         i_data[EXT4_DIND_BLOCK] = 0;
3830                 }
3831         case EXT4_DIND_BLOCK:
3832                 nr = i_data[EXT4_TIND_BLOCK];
3833                 if (nr) {
3834                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
3835                         i_data[EXT4_TIND_BLOCK] = 0;
3836                 }
3837         case EXT4_TIND_BLOCK:
3838                 ;
3839         }
3840
3841         up_write(&ei->i_data_sem);
3842         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3843         ext4_mark_inode_dirty(handle, inode);
3844
3845         /*
3846          * In a multi-transaction truncate, we only make the final transaction
3847          * synchronous
3848          */
3849         if (IS_SYNC(inode))
3850                 handle->h_sync = 1;
3851 out_stop:
3852         /*
3853          * If this was a simple ftruncate(), and the file will remain alive
3854          * then we need to clear up the orphan record which we created above.
3855          * However, if this was a real unlink then we were called by
3856          * ext4_delete_inode(), and we allow that function to clean up the
3857          * orphan info for us.
3858          */
3859         if (inode->i_nlink)
3860                 ext4_orphan_del(handle, inode);
3861
3862         ext4_journal_stop(handle);
3863 }
3864
3865 static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3866                 unsigned long ino, struct ext4_iloc *iloc)
3867 {
3868         ext4_group_t block_group;
3869         unsigned long offset;
3870         ext4_fsblk_t block;
3871         struct ext4_group_desc *gdp;
3872
3873         if (!ext4_valid_inum(sb, ino)) {
3874                 /*
3875                  * This error is already checked for in namei.c unless we are
3876                  * looking at an NFS filehandle, in which case no error
3877                  * report is needed
3878                  */
3879                 return 0;
3880         }
3881
3882         block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
3883         gdp = ext4_get_group_desc(sb, block_group, NULL);
3884         if (!gdp)
3885                 return 0;
3886
3887         /*
3888          * Figure out the offset within the block group inode table
3889          */
3890         offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
3891                 EXT4_INODE_SIZE(sb);
3892         block = ext4_inode_table(sb, gdp) +
3893                 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
3894
3895         iloc->block_group = block_group;
3896         iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
3897         return block;
3898 }
3899
3900 /*
3901  * ext4_get_inode_loc returns with an extra refcount against the inode's
3902  * underlying buffer_head on success. If 'in_mem' is true, we have all
3903  * data in memory that is needed to recreate the on-disk version of this
3904  * inode.
3905  */
3906 static int __ext4_get_inode_loc(struct inode *inode,
3907                                 struct ext4_iloc *iloc, int in_mem)
3908 {
3909         ext4_fsblk_t block;
3910         struct buffer_head *bh;
3911
3912         block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
3913         if (!block)
3914                 return -EIO;
3915
3916         bh = sb_getblk(inode->i_sb, block);
3917         if (!bh) {
3918                 ext4_error (inode->i_sb, "ext4_get_inode_loc",
3919                                 "unable to read inode block - "
3920                                 "inode=%lu, block=%llu",
3921                                  inode->i_ino, block);
3922                 return -EIO;
3923         }
3924         if (!buffer_uptodate(bh)) {
3925                 lock_buffer(bh);
3926
3927                 /*
3928                  * If the buffer has the write error flag, we have failed
3929                  * to write out another inode in the same block.  In this
3930                  * case, we don't have to read the block because we may
3931                  * read the old inode data successfully.
3932                  */
3933                 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
3934                         set_buffer_uptodate(bh);
3935
3936                 if (buffer_uptodate(bh)) {
3937                         /* someone brought it uptodate while we waited */
3938                         unlock_buffer(bh);
3939                         goto has_buffer;
3940                 }
3941
3942                 /*
3943                  * If we have all information of the inode in memory and this
3944                  * is the only valid inode in the block, we need not read the
3945                  * block.
3946                  */
3947                 if (in_mem) {
3948                         struct buffer_head *bitmap_bh;
3949                         struct ext4_group_desc *desc;
3950                         int inodes_per_buffer;
3951                         int inode_offset, i;
3952                         ext4_group_t block_group;
3953                         int start;
3954
3955                         block_group = (inode->i_ino - 1) /
3956                                         EXT4_INODES_PER_GROUP(inode->i_sb);
3957                         inodes_per_buffer = bh->b_size /
3958                                 EXT4_INODE_SIZE(inode->i_sb);
3959                         inode_offset = ((inode->i_ino - 1) %
3960                                         EXT4_INODES_PER_GROUP(inode->i_sb));
3961                         start = inode_offset & ~(inodes_per_buffer - 1);
3962
3963                         /* Is the inode bitmap in cache? */
3964                         desc = ext4_get_group_desc(inode->i_sb,
3965                                                 block_group, NULL);
3966                         if (!desc)
3967                                 goto make_io;
3968
3969                         bitmap_bh = sb_getblk(inode->i_sb,
3970                                 ext4_inode_bitmap(inode->i_sb, desc));
3971                         if (!bitmap_bh)
3972                                 goto make_io;
3973
3974                         /*
3975                          * If the inode bitmap isn't in cache then the
3976                          * optimisation may end up performing two reads instead
3977                          * of one, so skip it.
3978                          */
3979                         if (!buffer_uptodate(bitmap_bh)) {
3980                                 brelse(bitmap_bh);
3981                                 goto make_io;
3982                         }
3983                         for (i = start; i < start + inodes_per_buffer; i++) {
3984                                 if (i == inode_offset)
3985                                         continue;
3986                                 if (ext4_test_bit(i, bitmap_bh->b_data))
3987                                         break;
3988                         }
3989                         brelse(bitmap_bh);
3990                         if (i == start + inodes_per_buffer) {
3991                                 /* all other inodes are free, so skip I/O */
3992                                 memset(bh->b_data, 0, bh->b_size);
3993                                 set_buffer_uptodate(bh);
3994                                 unlock_buffer(bh);
3995                                 goto has_buffer;
3996                         }
3997                 }
3998
3999 make_io:
4000                 /*
4001                  * There are other valid inodes in the buffer, this inode
4002                  * has in-inode xattrs, or we don't have this inode in memory.
4003                  * Read the block from disk.
4004                  */
4005                 get_bh(bh);
4006                 bh->b_end_io = end_buffer_read_sync;
4007                 submit_bh(READ_META, bh);
4008                 wait_on_buffer(bh);
4009                 if (!buffer_uptodate(bh)) {
4010                         ext4_error(inode->i_sb, "ext4_get_inode_loc",
4011                                         "unable to read inode block - "
4012                                         "inode=%lu, block=%llu",
4013                                         inode->i_ino, block);
4014                         brelse(bh);
4015                         return -EIO;
4016                 }
4017         }
4018 has_buffer:
4019         iloc->bh = bh;
4020         return 0;
4021 }
4022
4023 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4024 {
4025         /* We have all inode data except xattrs in memory here. */
4026         return __ext4_get_inode_loc(inode, iloc,
4027                 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
4028 }
4029
4030 void ext4_set_inode_flags(struct inode *inode)
4031 {
4032         unsigned int flags = EXT4_I(inode)->i_flags;
4033
4034         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
4035         if (flags & EXT4_SYNC_FL)
4036                 inode->i_flags |= S_SYNC;
4037         if (flags & EXT4_APPEND_FL)
4038                 inode->i_flags |= S_APPEND;
4039         if (flags & EXT4_IMMUTABLE_FL)
4040                 inode->i_flags |= S_IMMUTABLE;
4041         if (flags & EXT4_NOATIME_FL)
4042                 inode->i_flags |= S_NOATIME;
4043         if (flags & EXT4_DIRSYNC_FL)
4044                 inode->i_flags |= S_DIRSYNC;
4045 }
4046
4047 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4048 void ext4_get_inode_flags(struct ext4_inode_info *ei)
4049 {
4050         unsigned int flags = ei->vfs_inode.i_flags;
4051
4052         ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4053                         EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
4054         if (flags & S_SYNC)
4055                 ei->i_flags |= EXT4_SYNC_FL;
4056         if (flags & S_APPEND)
4057                 ei->i_flags |= EXT4_APPEND_FL;
4058         if (flags & S_IMMUTABLE)
4059                 ei->i_flags |= EXT4_IMMUTABLE_FL;
4060         if (flags & S_NOATIME)
4061                 ei->i_flags |= EXT4_NOATIME_FL;
4062         if (flags & S_DIRSYNC)
4063                 ei->i_flags |= EXT4_DIRSYNC_FL;
4064 }
4065 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4066                                         struct ext4_inode_info *ei)
4067 {
4068         blkcnt_t i_blocks ;
4069         struct inode *inode = &(ei->vfs_inode);
4070         struct super_block *sb = inode->i_sb;
4071
4072         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4073                                 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
4074                 /* we are using combined 48 bit field */
4075                 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4076                                         le32_to_cpu(raw_inode->i_blocks_lo);
4077                 if (ei->i_flags & EXT4_HUGE_FILE_FL) {
4078                         /* i_blocks represent file system block size */
4079                         return i_blocks  << (inode->i_blkbits - 9);
4080                 } else {
4081                         return i_blocks;
4082                 }
4083         } else {
4084                 return le32_to_cpu(raw_inode->i_blocks_lo);
4085         }
4086 }
4087
4088 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4089 {
4090         struct ext4_iloc iloc;
4091         struct ext4_inode *raw_inode;
4092         struct ext4_inode_info *ei;
4093         struct buffer_head *bh;
4094         struct inode *inode;
4095         long ret;
4096         int block;
4097
4098         inode = iget_locked(sb, ino);
4099         if (!inode)
4100                 return ERR_PTR(-ENOMEM);
4101         if (!(inode->i_state & I_NEW))
4102                 return inode;
4103
4104         ei = EXT4_I(inode);
4105 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
4106         ei->i_acl = EXT4_ACL_NOT_CACHED;
4107         ei->i_default_acl = EXT4_ACL_NOT_CACHED;
4108 #endif
4109         ei->i_block_alloc_info = NULL;
4110
4111         ret = __ext4_get_inode_loc(inode, &iloc, 0);
4112         if (ret < 0)
4113                 goto bad_inode;
4114         bh = iloc.bh;
4115         raw_inode = ext4_raw_inode(&iloc);
4116         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4117         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
4118         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
4119         if(!(test_opt (inode->i_sb, NO_UID32))) {
4120                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
4121                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
4122         }
4123         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4124
4125         ei->i_state = 0;
4126         ei->i_dir_start_lookup = 0;
4127         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4128         /* We now have enough fields to check if the inode was active or not.
4129          * This is needed because nfsd might try to access dead inodes
4130          * the test is that same one that e2fsck uses
4131          * NeilBrown 1999oct15
4132          */
4133         if (inode->i_nlink == 0) {
4134                 if (inode->i_mode == 0 ||
4135                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4136                         /* this inode is deleted */
4137                         brelse (bh);
4138                         ret = -ESTALE;
4139                         goto bad_inode;
4140                 }
4141                 /* The only unlinked inodes we let through here have
4142                  * valid i_mode and are being read by the orphan
4143                  * recovery code: that's fine, we're about to complete
4144                  * the process of deleting those. */
4145         }
4146         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
4147         inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
4148         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
4149         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
4150                 ei->i_file_acl |=
4151                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
4152         inode->i_size = ext4_isize(raw_inode);
4153         ei->i_disksize = inode->i_size;
4154         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4155         ei->i_block_group = iloc.block_group;
4156         /*
4157          * NOTE! The in-memory inode i_data array is in little-endian order
4158          * even on big-endian machines: we do NOT byteswap the block numbers!
4159          */
4160         for (block = 0; block < EXT4_N_BLOCKS; block++)
4161                 ei->i_data[block] = raw_inode->i_block[block];
4162         INIT_LIST_HEAD(&ei->i_orphan);
4163
4164         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4165                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4166                 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4167                     EXT4_INODE_SIZE(inode->i_sb)) {
4168                         brelse (bh);
4169                         ret = -EIO;
4170                         goto bad_inode;
4171                 }
4172                 if (ei->i_extra_isize == 0) {
4173                         /* The extra space is currently unused. Use it. */
4174                         ei->i_extra_isize = sizeof(struct ext4_inode) -
4175                                             EXT4_GOOD_OLD_INODE_SIZE;
4176                 } else {
4177                         __le32 *magic = (void *)raw_inode +
4178                                         EXT4_GOOD_OLD_INODE_SIZE +
4179                                         ei->i_extra_isize;
4180                         if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4181                                  ei->i_state |= EXT4_STATE_XATTR;
4182                 }
4183         } else
4184                 ei->i_extra_isize = 0;
4185
4186         EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
4187         EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
4188         EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
4189         EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
4190
4191         inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
4192         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4193                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4194                         inode->i_version |=
4195                         (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
4196         }
4197
4198         if (ei->i_file_acl &&
4199             ((ei->i_file_acl <
4200               (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
4201                EXT4_SB(sb)->s_gdb_count)) ||
4202              (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
4203                 ext4_error(sb, __func__,
4204                            "bad extended attribute block %llu in inode #%lu",
4205                            ei->i_file_acl, inode->i_ino);
4206                 ret = -EIO;
4207                 goto bad_inode;
4208         }
4209
4210         if (S_ISREG(inode->i_mode)) {
4211                 inode->i_op = &ext4_file_inode_operations;
4212                 inode->i_fop = &ext4_file_operations;
4213                 ext4_set_aops(inode);
4214         } else if (S_ISDIR(inode->i_mode)) {
4215                 inode->i_op = &ext4_dir_inode_operations;
4216                 inode->i_fop = &ext4_dir_operations;
4217         } else if (S_ISLNK(inode->i_mode)) {
4218                 if (ext4_inode_is_fast_symlink(inode))
4219                         inode->i_op = &ext4_fast_symlink_inode_operations;
4220                 else {
4221                         inode->i_op = &ext4_symlink_inode_operations;
4222                         ext4_set_aops(inode);
4223                 }
4224         } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
4225               S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
4226                 inode->i_op = &ext4_special_inode_operations;
4227                 if (raw_inode->i_block[0])
4228                         init_special_inode(inode, inode->i_mode,
4229                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
4230                 else
4231                         init_special_inode(inode, inode->i_mode,
4232                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4233         } else {
4234                 brelse(bh);
4235                 ret = -EIO;
4236                 ext4_error(inode->i_sb, __func__,
4237                            "bogus i_mode (%o) for inode=%lu",
4238                            inode->i_mode, inode->i_ino);
4239                 goto bad_inode;
4240         }
4241         brelse (iloc.bh);
4242         ext4_set_inode_flags(inode);
4243         unlock_new_inode(inode);
4244         return inode;
4245
4246 bad_inode:
4247         iget_failed(inode);
4248         return ERR_PTR(ret);
4249 }
4250
4251 static int ext4_inode_blocks_set(handle_t *handle,
4252                                 struct ext4_inode *raw_inode,
4253                                 struct ext4_inode_info *ei)
4254 {
4255         struct inode *inode = &(ei->vfs_inode);
4256         u64 i_blocks = inode->i_blocks;
4257         struct super_block *sb = inode->i_sb;
4258         int err = 0;
4259
4260         if (i_blocks <= ~0U) {
4261                 /*
4262                  * i_blocks can be represnted in a 32 bit variable
4263                  * as multiple of 512 bytes
4264                  */
4265                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4266                 raw_inode->i_blocks_high = 0;
4267                 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
4268         } else if (i_blocks <= 0xffffffffffffULL) {
4269                 /*
4270                  * i_blocks can be represented in a 48 bit variable
4271                  * as multiple of 512 bytes
4272                  */
4273                 err = ext4_update_rocompat_feature(handle, sb,
4274                                             EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
4275                 if (err)
4276                         goto  err_out;
4277                 /* i_block is stored in the split  48 bit fields */
4278                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4279                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4280                 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
4281         } else {
4282                 /*
4283                  * i_blocks should be represented in a 48 bit variable
4284                  * as multiple of  file system block size
4285                  */
4286                 err = ext4_update_rocompat_feature(handle, sb,
4287                                             EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
4288                 if (err)
4289                         goto  err_out;
4290                 ei->i_flags |= EXT4_HUGE_FILE_FL;
4291                 /* i_block is stored in file system block size */
4292                 i_blocks = i_blocks >> (inode->i_blkbits - 9);
4293                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
4294                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
4295         }
4296 err_out:
4297         return err;
4298 }
4299
4300 /*
4301  * Post the struct inode info into an on-disk inode location in the
4302  * buffer-cache.  This gobbles the caller's reference to the
4303  * buffer_head in the inode location struct.
4304  *
4305  * The caller must have write access to iloc->bh.
4306  */
4307 static int ext4_do_update_inode(handle_t *handle,
4308                                 struct inode *inode,
4309                                 struct ext4_iloc *iloc)
4310 {
4311         struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4312         struct ext4_inode_info *ei = EXT4_I(inode);
4313         struct buffer_head *bh = iloc->bh;
4314         int err = 0, rc, block;
4315
4316         /* For fields not not tracking in the in-memory inode,
4317          * initialise them to zero for new inodes. */
4318         if (ei->i_state & EXT4_STATE_NEW)
4319                 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
4320
4321         ext4_get_inode_flags(ei);
4322         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4323         if(!(test_opt(inode->i_sb, NO_UID32))) {
4324                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
4325                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
4326 /*
4327  * Fix up interoperability with old kernels. Otherwise, old inodes get
4328  * re-used with the upper 16 bits of the uid/gid intact
4329  */
4330                 if(!ei->i_dtime) {
4331                         raw_inode->i_uid_high =
4332                                 cpu_to_le16(high_16_bits(inode->i_uid));
4333                         raw_inode->i_gid_high =
4334                                 cpu_to_le16(high_16_bits(inode->i_gid));
4335                 } else {
4336                         raw_inode->i_uid_high = 0;
4337                         raw_inode->i_gid_high = 0;
4338                 }
4339         } else {
4340                 raw_inode->i_uid_low =
4341                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
4342                 raw_inode->i_gid_low =
4343                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
4344                 raw_inode->i_uid_high = 0;
4345                 raw_inode->i_gid_high = 0;
4346         }
4347         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
4348
4349         EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
4350         EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
4351         EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4352         EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
4353
4354         if (ext4_inode_blocks_set(handle, raw_inode, ei))
4355                 goto out_brelse;
4356         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4357         /* clear the migrate flag in the raw_inode */
4358         raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
4359         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
4360             cpu_to_le32(EXT4_OS_HURD))
4361                 raw_inode->i_file_acl_high =
4362                         cpu_to_le16(ei->i_file_acl >> 32);
4363         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
4364         ext4_isize_set(raw_inode, ei->i_disksize);
4365         if (ei->i_disksize > 0x7fffffffULL) {
4366                 struct super_block *sb = inode->i_sb;
4367                 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
4368                                 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
4369                                 EXT4_SB(sb)->s_es->s_rev_level ==
4370                                 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
4371                         /* If this is the first large file
4372                          * created, add a flag to the superblock.
4373                          */
4374                         err = ext4_journal_get_write_access(handle,
4375                                         EXT4_SB(sb)->s_sbh);
4376                         if (err)
4377                                 goto out_brelse;
4378                         ext4_update_dynamic_rev(sb);
4379                         EXT4_SET_RO_COMPAT_FEATURE(sb,
4380                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
4381                         sb->s_dirt = 1;
4382                         handle->h_sync = 1;
4383                         err = ext4_journal_dirty_metadata(handle,
4384                                         EXT4_SB(sb)->s_sbh);
4385                 }
4386         }
4387         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
4388         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
4389                 if (old_valid_dev(inode->i_rdev)) {
4390                         raw_inode->i_block[0] =
4391                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
4392                         raw_inode->i_block[1] = 0;
4393                 } else {
4394                         raw_inode->i_block[0] = 0;
4395                         raw_inode->i_block[1] =
4396                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
4397                         raw_inode->i_block[2] = 0;
4398                 }
4399         } else for (block = 0; block < EXT4_N_BLOCKS; block++)
4400                 raw_inode->i_block[block] = ei->i_data[block];
4401
4402         raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4403         if (ei->i_extra_isize) {
4404                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
4405                         raw_inode->i_version_hi =
4406                         cpu_to_le32(inode->i_version >> 32);
4407                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4408         }
4409
4410
4411         BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
4412         rc = ext4_journal_dirty_metadata(handle, bh);
4413         if (!err)
4414                 err = rc;
4415         ei->i_state &= ~EXT4_STATE_NEW;
4416
4417 out_brelse:
4418         brelse (bh);
4419         ext4_std_error(inode->i_sb, err);
4420         return err;
4421 }
4422
4423 /*
4424  * ext4_write_inode()
4425  *
4426  * We are called from a few places:
4427  *
4428  * - Within generic_file_write() for O_SYNC files.
4429  *   Here, there will be no transaction running. We wait for any running
4430  *   trasnaction to commit.
4431  *
4432  * - Within sys_sync(), kupdate and such.
4433  *   We wait on commit, if tol to.
4434  *
4435  * - Within prune_icache() (PF_MEMALLOC == true)
4436  *   Here we simply return.  We can't afford to block kswapd on the
4437  *   journal commit.
4438  *
4439  * In all cases it is actually safe for us to return without doing anything,
4440  * because the inode has been copied into a raw inode buffer in
4441  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
4442  * knfsd.
4443  *
4444  * Note that we are absolutely dependent upon all inode dirtiers doing the
4445  * right thing: they *must* call mark_inode_dirty() after dirtying info in
4446  * which we are interested.
4447  *
4448  * It would be a bug for them to not do this.  The code:
4449  *
4450  *      mark_inode_dirty(inode)
4451  *      stuff();
4452  *      inode->i_size = expr;
4453  *
4454  * is in error because a kswapd-driven write_inode() could occur while
4455  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
4456  * will no longer be on the superblock's dirty inode list.
4457  */
4458 int ext4_write_inode(struct inode *inode, int wait)
4459 {
4460         if (current->flags & PF_MEMALLOC)
4461                 return 0;
4462
4463         if (ext4_journal_current_handle()) {
4464                 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4465                 dump_stack();
4466                 return -EIO;
4467         }
4468
4469         if (!wait)
4470                 return 0;
4471
4472         return ext4_force_commit(inode->i_sb);
4473 }
4474
4475 /*
4476  * ext4_setattr()
4477  *
4478  * Called from notify_change.
4479  *
4480  * We want to trap VFS attempts to truncate the file as soon as
4481  * possible.  In particular, we want to make sure that when the VFS
4482  * shrinks i_size, we put the inode on the orphan list and modify
4483  * i_disksize immediately, so that during the subsequent flushing of
4484  * dirty pages and freeing of disk blocks, we can guarantee that any
4485  * commit will leave the blocks being flushed in an unused state on
4486  * disk.  (On recovery, the inode will get truncated and the blocks will
4487  * be freed, so we have a strong guarantee that no future commit will
4488  * leave these blocks visible to the user.)
4489  *
4490  * Another thing we have to assure is that if we are in ordered mode
4491  * and inode is still attached to the committing transaction, we must
4492  * we start writeout of all the dirty pages which are being truncated.
4493  * This way we are sure that all the data written in the previous
4494  * transaction are already on disk (truncate waits for pages under
4495  * writeback).
4496  *
4497  * Called with inode->i_mutex down.
4498  */
4499 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4500 {
4501         struct inode *inode = dentry->d_inode;
4502         int error, rc = 0;
4503         const unsigned int ia_valid = attr->ia_valid;
4504
4505         error = inode_change_ok(inode, attr);
4506         if (error)
4507                 return error;
4508
4509         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
4510                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
4511                 handle_t *handle;
4512
4513                 /* (user+group)*(old+new) structure, inode write (sb,
4514                  * inode block, ? - but truncate inode update has it) */
4515                 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
4516                                         EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
4517                 if (IS_ERR(handle)) {
4518                         error = PTR_ERR(handle);
4519                         goto err_out;
4520                 }
4521                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
4522                 if (error) {
4523                         ext4_journal_stop(handle);
4524                         return error;
4525                 }
4526                 /* Update corresponding info in inode so that everything is in
4527                  * one transaction */
4528                 if (attr->ia_valid & ATTR_UID)
4529                         inode->i_uid = attr->ia_uid;
4530                 if (attr->ia_valid & ATTR_GID)
4531                         inode->i_gid = attr->ia_gid;
4532                 error = ext4_mark_inode_dirty(handle, inode);
4533                 ext4_journal_stop(handle);
4534         }
4535
4536         if (attr->ia_valid & ATTR_SIZE) {
4537                 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
4538                         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4539
4540                         if (attr->ia_size > sbi->s_bitmap_maxbytes) {
4541                                 error = -EFBIG;
4542                                 goto err_out;
4543                         }
4544                 }
4545         }
4546
4547         if (S_ISREG(inode->i_mode) &&
4548             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
4549                 handle_t *handle;
4550
4551                 handle = ext4_journal_start(inode, 3);
4552                 if (IS_ERR(handle)) {
4553                         error = PTR_ERR(handle);
4554                         goto err_out;
4555                 }
4556
4557                 error = ext4_orphan_add(handle, inode);
4558                 EXT4_I(inode)->i_disksize = attr->ia_size;
4559                 rc = ext4_mark_inode_dirty(handle, inode);
4560                 if (!error)
4561                         error = rc;
4562                 ext4_journal_stop(handle);
4563
4564                 if (ext4_should_order_data(inode)) {
4565                         error = ext4_begin_ordered_truncate(inode,
4566                                                             attr->ia_size);
4567                         if (error) {
4568                                 /* Do as much error cleanup as possible */
4569                                 handle = ext4_journal_start(inode, 3);
4570                                 if (IS_ERR(handle)) {
4571                                         ext4_orphan_del(NULL, inode);
4572                                         goto err_out;
4573                                 }
4574                                 ext4_orphan_del(handle, inode);
4575                                 ext4_journal_stop(handle);
4576                                 goto err_out;
4577                         }
4578                 }
4579         }
4580
4581         rc = inode_setattr(inode, attr);
4582
4583         /* If inode_setattr's call to ext4_truncate failed to get a
4584          * transaction handle at all, we need to clean up the in-core
4585          * orphan list manually. */
4586         if (inode->i_nlink)
4587                 ext4_orphan_del(NULL, inode);
4588
4589         if (!rc && (ia_valid & ATTR_MODE))
4590                 rc = ext4_acl_chmod(inode);
4591
4592 err_out:
4593         ext4_std_error(inode->i_sb, error);
4594         if (!error)
4595                 error = rc;
4596         return error;
4597 }
4598
4599 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4600                  struct kstat *stat)
4601 {
4602         struct inode *inode;
4603         unsigned long delalloc_blocks;
4604
4605         inode = dentry->d_inode;
4606         generic_fillattr(inode, stat);
4607
4608         /*
4609          * We can't update i_blocks if the block allocation is delayed
4610          * otherwise in the case of system crash before the real block
4611          * allocation is done, we will have i_blocks inconsistent with
4612          * on-disk file blocks.
4613          * We always keep i_blocks updated together with real
4614          * allocation. But to not confuse with user, stat
4615          * will return the blocks that include the delayed allocation
4616          * blocks for this file.
4617          */
4618         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4619         delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4620         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4621
4622         stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4623         return 0;
4624 }
4625
4626 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
4627                                       int chunk)
4628 {
4629         int indirects;
4630
4631         /* if nrblocks are contiguous */
4632         if (chunk) {
4633                 /*
4634                  * With N contiguous data blocks, it need at most
4635                  * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
4636                  * 2 dindirect blocks
4637                  * 1 tindirect block
4638                  */
4639                 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
4640                 return indirects + 3;
4641         }
4642         /*
4643          * if nrblocks are not contiguous, worse case, each block touch
4644          * a indirect block, and each indirect block touch a double indirect
4645          * block, plus a triple indirect block
4646          */
4647         indirects = nrblocks * 2 + 1;
4648         return indirects;
4649 }
4650
4651 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4652 {
4653         if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4654                 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
4655         return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
4656 }
4657
4658 /*
4659  * Account for index blocks, block groups bitmaps and block group
4660  * descriptor blocks if modify datablocks and index blocks
4661  * worse case, the indexs blocks spread over different block groups
4662  *
4663  * If datablocks are discontiguous, they are possible to spread over
4664  * different block groups too. If they are contiugous, with flexbg,
4665  * they could still across block group boundary.
4666  *
4667  * Also account for superblock, inode, quota and xattr blocks
4668  */
4669 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4670 {
4671         int groups, gdpblocks;
4672         int idxblocks;
4673         int ret = 0;
4674
4675         /*
4676          * How many index blocks need to touch to modify nrblocks?
4677          * The "Chunk" flag indicating whether the nrblocks is
4678          * physically contiguous on disk
4679          *
4680          * For Direct IO and fallocate, they calls get_block to allocate
4681          * one single extent at a time, so they could set the "Chunk" flag
4682          */
4683         idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4684
4685         ret = idxblocks;
4686
4687         /*
4688          * Now let's see how many group bitmaps and group descriptors need
4689          * to account
4690          */
4691         groups = idxblocks;
4692         if (chunk)
4693                 groups += 1;
4694         else
4695                 groups += nrblocks;
4696
4697         gdpblocks = groups;
4698         if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
4699                 groups = EXT4_SB(inode->i_sb)->s_groups_count;
4700         if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4701                 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4702
4703         /* bitmaps and block group descriptor blocks */
4704         ret += groups + gdpblocks;
4705
4706         /* Blocks for super block, inode, quota and xattr blocks */
4707         ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4708
4709         return ret;
4710 }
4711
/*
 * Calculate the total number of credits to reserve to fit
 * the modification of a single page into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * We need to consider the worst case, when there is
 * one new block per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        int blocks_per_page = ext4_journal_blocks_per_page(inode);
        int credits = ext4_meta_trans_blocks(inode, blocks_per_page, 0);

        /* In journalled mode the data blocks themselves need credits too. */
        if (ext4_should_journal_data(inode))
                credits += blocks_per_page;

        return credits;
}
4734
/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever is calling
 * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        /* The blocks form one contiguous chunk. */
        const int contiguous = 1;

        return ext4_meta_trans_blocks(inode, nrblocks, contiguous);
}
4748
4749 /*
4750  * The caller must have previously called ext4_reserve_inode_write().
4751  * Give this, we know that the caller already has write access to iloc->bh.
4752  */
4753 int ext4_mark_iloc_dirty(handle_t *handle,
4754                 struct inode *inode, struct ext4_iloc *iloc)
4755 {
4756         int err = 0;
4757
4758         if (test_opt(inode->i_sb, I_VERSION))
4759                 inode_inc_iversion(inode);
4760
4761         /* the do_update_inode consumes one bh->b_count */
4762         get_bh(iloc->bh);
4763
4764         /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
4765         err = ext4_do_update_inode(handle, inode, iloc);
4766         put_bh(iloc->bh);
4767         return err;
4768 }
4769
4770 /*
4771  * On success, We end up with an outstanding reference count against
4772  * iloc->bh.  This _must_ be cleaned up later.
4773  */
4774
4775 int
4776 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4777                          struct ext4_iloc *iloc)
4778 {
4779         int err = 0;
4780         if (handle) {
4781                 err = ext4_get_inode_loc(inode, iloc);
4782                 if (!err) {
4783                         BUFFER_TRACE(iloc->bh, "get_write_access");
4784                         err = ext4_journal_get_write_access(handle, iloc->bh);
4785                         if (err) {
4786                                 brelse(iloc->bh);
4787                                 iloc->bh = NULL;
4788                         }
4789                 }
4790         }
4791         ext4_std_error(inode->i_sb, err);
4792         return err;
4793 }
4794
4795 /*
4796  * Expand an inode by new_extra_isize bytes.
4797  * Returns 0 on success or negative error number on failure.
4798  */
4799 static int ext4_expand_extra_isize(struct inode *inode,
4800                                    unsigned int new_extra_isize,
4801                                    struct ext4_iloc iloc,
4802                                    handle_t *handle)
4803 {
4804         struct ext4_inode *raw_inode;
4805         struct ext4_xattr_ibody_header *header;
4806         struct ext4_xattr_entry *entry;
4807
4808         if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
4809                 return 0;
4810
4811         raw_inode = ext4_raw_inode(&iloc);
4812
4813         header = IHDR(inode, raw_inode);
4814         entry = IFIRST(header);
4815
4816         /* No extended attributes present */
4817         if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
4818                 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
4819                 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
4820                         new_extra_isize);
4821                 EXT4_I(inode)->i_extra_isize = new_extra_isize;
4822                 return 0;
4823         }
4824
4825         /* try to expand with EAs present */
4826         return ext4_expand_extra_isize_ea(inode, new_extra_isize,
4827                                           raw_inode, handle);
4828 }
4829
4830 /*
4831  * What we do here is to mark the in-core inode as clean with respect to inode
4832  * dirtiness (it may still be data-dirty).
4833  * This means that the in-core inode may be reaped by prune_icache
4834  * without having to perform any I/O.  This is a very good thing,
4835  * because *any* task may call prune_icache - even ones which
4836  * have a transaction open against a different journal.
4837  *
4838  * Is this cheating?  Not really.  Sure, we haven't written the
4839  * inode out, but prune_icache isn't a user-visible syncing function.
4840  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
4841  * we start and wait on commits.
4842  *
4843  * Is this efficient/effective?  Well, we're being nice to the system
4844  * by cleaning up our inodes proactively so they can be reaped
4845  * without I/O.  But we are potentially leaving up to five seconds'
4846  * worth of inodes floating about which prune_icache wants us to
4847  * write out.  One way to fix that would be to get prune_icache()
4848  * to do a write_super() to free up some memory.  It has the desired
4849  * effect.
4850  */
4851 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4852 {
4853         struct ext4_iloc iloc;
4854         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4855         static unsigned int mnt_count;
4856         int err, ret;
4857
4858         might_sleep();
4859         err = ext4_reserve_inode_write(handle, inode, &iloc);
4860         if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4861             !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
4862                 /*
4863                  * We need extra buffer credits since we may write into EA block
4864                  * with this same handle. If journal_extend fails, then it will
4865                  * only result in a minor loss of functionality for that inode.
4866                  * If this is felt to be critical, then e2fsck should be run to
4867                  * force a large enough s_min_extra_isize.
4868                  */
4869                 if ((jbd2_journal_extend(handle,
4870                              EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
4871                         ret = ext4_expand_extra_isize(inode,
4872                                                       sbi->s_want_extra_isize,
4873                                                       iloc, handle);
4874                         if (ret) {
4875                                 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
4876                                 if (mnt_count !=
4877                                         le16_to_cpu(sbi->s_es->s_mnt_count)) {
4878                                         ext4_warning(inode->i_sb, __func__,
4879                                         "Unable to expand inode %lu. Delete"
4880                                         " some EAs or run e2fsck.",
4881                                         inode->i_ino);
4882                                         mnt_count =
4883                                           le16_to_cpu(sbi->s_es->s_mnt_count);
4884                                 }
4885                         }
4886                 }
4887         }
4888         if (!err)
4889                 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
4890         return err;
4891 }
4892
4893 /*
4894  * ext4_dirty_inode() is called from __mark_inode_dirty()
4895  *
4896  * We're really interested in the case where a file is being extended.
4897  * i_size has been changed by generic_commit_write() and we thus need
4898  * to include the updated inode in the current transaction.
4899  *
4900  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
4901  * are allocated to the file.
4902  *
4903  * If the inode is marked synchronous, we don't honour that here - doing
4904  * so would cause a commit on atime updates, which we don't bother doing.
4905  * We handle synchronous inodes at the highest possible level.
4906  */
4907 void ext4_dirty_inode(struct inode *inode)
4908 {
4909         handle_t *current_handle = ext4_journal_current_handle();
4910         handle_t *handle;
4911
4912         handle = ext4_journal_start(inode, 2);
4913         if (IS_ERR(handle))
4914                 goto out;
4915         if (current_handle &&
4916                 current_handle->h_transaction != handle->h_transaction) {
4917                 /* This task has a transaction open against a different fs */
4918                 printk(KERN_EMERG "%s: transactions do not match!\n",
4919                        __func__);
4920         } else {
4921                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
4922                                 current_handle);
4923                 ext4_mark_inode_dirty(handle, inode);
4924         }
4925         ext4_journal_stop(handle);
4926 out:
4927         return;
4928 }
4929
#if 0
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext4_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * Currently compiled out.
 */
static int ext4_pin_inode(handle_t *handle, struct inode *inode)
{
        struct ext4_iloc iloc;
        int err = 0;

        if (!handle)
                goto out;

        err = ext4_get_inode_loc(inode, &iloc);
        if (err)
                goto out;

        BUFFER_TRACE(iloc.bh, "get_write_access");
        err = jbd2_journal_get_write_access(handle, iloc.bh);
        if (!err)
                err = ext4_journal_dirty_metadata(handle, iloc.bh);
        /* Drop the reference in both the success and the failure case. */
        brelse(iloc.bh);
out:
        ext4_std_error(inode->i_sb, err);
        return err;
}
#endif
4958
4959 int ext4_change_inode_journal_flag(struct inode *inode, int val)
4960 {
4961         journal_t *journal;
4962         handle_t *handle;
4963         int err;
4964
4965         /*
4966          * We have to be very careful here: changing a data block's
4967          * journaling status dynamically is dangerous.  If we write a
4968          * data block to the journal, change the status and then delete
4969          * that block, we risk forgetting to revoke the old log record
4970          * from the journal and so a subsequent replay can corrupt data.
4971          * So, first we make sure that the journal is empty and that
4972          * nobody is changing anything.
4973          */
4974
4975         journal = EXT4_JOURNAL(inode);
4976         if (is_journal_aborted(journal))
4977                 return -EROFS;
4978
4979         jbd2_journal_lock_updates(journal);
4980         jbd2_journal_flush(journal);
4981
4982         /*
4983          * OK, there are no updates running now, and all cached data is
4984          * synced to disk.  We are now in a completely consistent state
4985          * which doesn't have anything in the journal, and we know that
4986          * no filesystem updates are running, so it is safe to modify
4987          * the inode's in-core data-journaling state flag now.
4988          */
4989
4990         if (val)
4991                 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
4992         else
4993                 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
4994         ext4_set_aops(inode);
4995
4996         jbd2_journal_unlock_updates(journal);
4997
4998         /* Finally we can mark the inode as dirty. */
4999
5000         handle = ext4_journal_start(inode, 1);
5001         if (IS_ERR(handle))
5002                 return PTR_ERR(handle);
5003
5004         err = ext4_mark_inode_dirty(handle, inode);
5005         handle->h_sync = 1;
5006         ext4_journal_stop(handle);
5007         ext4_std_error(inode->i_sb, err);
5008
5009         return err;
5010 }
5011
5012 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
5013 {
5014         return !buffer_mapped(bh);
5015 }
5016
5017 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5018 {
5019         struct page *page = vmf->page;
5020         loff_t size;
5021         unsigned long len;
5022         int ret = -EINVAL;
5023         struct file *file = vma->vm_file;
5024         struct inode *inode = file->f_path.dentry->d_inode;
5025         struct address_space *mapping = inode->i_mapping;
5026
5027         /*
5028          * Get i_alloc_sem to stop truncates messing with the inode. We cannot
5029          * get i_mutex because we are already holding mmap_sem.
5030          */
5031         down_read(&inode->i_alloc_sem);
5032         size = i_size_read(inode);
5033         if (page->mapping != mapping || size <= page_offset(page)
5034             || !PageUptodate(page)) {
5035                 /* page got truncated from under us? */
5036                 goto out_unlock;
5037         }
5038         ret = 0;
5039         if (PageMappedToDisk(page))
5040                 goto out_unlock;
5041
5042         if (page->index == size >> PAGE_CACHE_SHIFT)
5043                 len = size & ~PAGE_CACHE_MASK;
5044         else
5045                 len = PAGE_CACHE_SIZE;
5046
5047         if (page_has_buffers(page)) {
5048                 /* return if we have all the buffers mapped */
5049                 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5050                                        ext4_bh_unmapped))
5051                         goto out_unlock;
5052         }
5053         /*
5054          * OK, we need to fill the hole... Do write_begin write_end
5055          * to do block allocation/reservation.We are not holding
5056          * inode.i__mutex here. That allow * parallel write_begin,
5057          * write_end call. lock_page prevent this from happening
5058          * on the same page though
5059          */
5060         ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
5061                         len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
5062         if (ret < 0)
5063                 goto out_unlock;
5064         ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
5065                         len, len, page, NULL);
5066         if (ret < 0)
5067                 goto out_unlock;
5068         ret = 0;
5069 out_unlock:
5070         if (ret)
5071                 ret = VM_FAULT_SIGBUS;
5072         up_read(&inode->i_alloc_sem);
5073         return ret;
5074 }