]> git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - fs/ext4/inode.c
ext4: better estimate credits needed for ext4_da_writepages()
[karo-tx-linux.git] / fs / ext4 / inode.c
index d6382b89ecbde3077720ebc6a9bb56254883e2d7..2b777e51b6774e24bf21651c0f58f4fe3f6da407 100644 (file)
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                                   new_size);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+                               unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-               struct inode *inode, struct page *page, loff_t from,
-               loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+                                 int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -423,66 +423,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
 #define check_block_validity(inode, map)       \
        __check_block_validity((inode), __func__, __LINE__, (map))
 
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
-                                   unsigned int max_pages)
-{
-       struct address_space *mapping = inode->i_mapping;
-       pgoff_t index;
-       struct pagevec pvec;
-       pgoff_t num = 0;
-       int i, nr_pages, done = 0;
-
-       if (max_pages == 0)
-               return 0;
-       pagevec_init(&pvec, 0);
-       while (!done) {
-               index = idx;
-               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                                             PAGECACHE_TAG_DIRTY,
-                                             (pgoff_t)PAGEVEC_SIZE);
-               if (nr_pages == 0)
-                       break;
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
-                       struct buffer_head *bh, *head;
-
-                       lock_page(page);
-                       if (unlikely(page->mapping != mapping) ||
-                           !PageDirty(page) ||
-                           PageWriteback(page) ||
-                           page->index != idx) {
-                               done = 1;
-                               unlock_page(page);
-                               break;
-                       }
-                       if (page_has_buffers(page)) {
-                               bh = head = page_buffers(page);
-                               do {
-                                       if (!buffer_delay(bh) &&
-                                           !buffer_unwritten(bh))
-                                               done = 1;
-                                       bh = bh->b_this_page;
-                               } while (!done && (bh != head));
-                       }
-                       unlock_page(page);
-                       if (done)
-                               break;
-                       idx++;
-                       num++;
-                       if (num >= max_pages) {
-                               done = 1;
-                               break;
-                       }
-               }
-               pagevec_release(&pvec);
-       }
-       return num;
-}
-
 #ifdef ES_AGGRESSIVE_TEST
 static void ext4_map_blocks_es_recheck(handle_t *handle,
                                       struct inode *inode,
@@ -1415,21 +1355,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-                                            unsigned long offset)
+                                            unsigned int offset,
+                                            unsigned int length)
 {
        int to_release = 0;
        struct buffer_head *head, *bh;
        unsigned int curr_off = 0;
        struct inode *inode = page->mapping->host;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       unsigned int stop = offset + length;
        int num_clusters;
        ext4_fsblk_t lblk;
 
+       BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
        head = page_buffers(page);
        bh = head;
        do {
                unsigned int next_off = curr_off + bh->b_size;
 
+               if (next_off > stop)
+                       break;
+
                if ((offset <= curr_off) && (buffer_delay(bh))) {
                        to_release++;
                        clear_buffer_delay(bh);
@@ -1460,6 +1407,8 @@ static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd);
+
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
  * them with writepage() call back
@@ -1488,7 +1437,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
        struct ext4_io_submit io_submit;
 
        BUG_ON(mpd->next_page <= mpd->first_page);
-       memset(&io_submit, 0, sizeof(io_submit));
+       ext4_io_submit_init(&io_submit, mpd->wbc);
+       io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+       if (!io_submit.io_end) {
+               ext4_da_block_invalidatepages(mpd);
+               return -ENOMEM;
+       }
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
@@ -1576,6 +1530,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
                pagevec_release(&pvec);
        }
        ext4_io_submit(&io_submit);
+       /* Drop io_end reference we got from init */
+       ext4_put_io_end_defer(io_submit.io_end);
        return ret;
 }
 
@@ -1606,7 +1562,7 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
                                break;
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
-                       block_invalidatepage(page, 0);
+                       block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
                        ClearPageUptodate(page);
                        unlock_page(page);
                }
@@ -2234,35 +2190,40 @@ static int ext4_writepage(struct page *page,
                 */
                return __ext4_journalled_writepage(page, len);
 
-       memset(&io_submit, 0, sizeof(io_submit));
+       ext4_io_submit_init(&io_submit, wbc);
+       io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+       if (!io_submit.io_end) {
+               redirty_page_for_writepage(wbc, page);
+               unlock_page(page);
+               return -ENOMEM;
+       }
        ret = ext4_bio_write_page(&io_submit, page, len, wbc);
        ext4_io_submit(&io_submit);
+       /* Drop io_end reference we got from init */
+       ext4_put_io_end_defer(io_submit.io_end);
        return ret;
 }
 
 /*
- * This is called via ext4_da_writepages() to
- * calculate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks up to full group size.
  */
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
 
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_da_writepages(). We map an extent of
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
 static int ext4_da_writepages_trans_blocks(struct inode *inode)
 {
-       int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-
-       /*
-        * With non-extent format the journal credit needed to
-        * insert nrblocks contiguous block is dependent on
-        * number of contiguous block. So we will limit
-        * number of contiguous block to a sane value
-        */
-       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
-           (max_blocks > EXT4_MAX_TRANS_DATA))
-               max_blocks = EXT4_MAX_TRANS_DATA;
+       int bpp = ext4_journal_blocks_per_page(inode);
 
-       return ext4_chunk_trans_blocks(inode, max_blocks);
+       return ext4_meta_trans_blocks(inode,
+                               MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
 }
 
 /*
@@ -2440,10 +2401,8 @@ static int ext4_da_writepages(struct address_space *mapping,
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
        int pages_written = 0;
-       unsigned int max_pages;
        int range_cyclic, cycled = 1, io_done = 0;
        int needed_blocks, ret = 0;
-       long desired_nr_to_write, nr_to_writebump = 0;
        loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
        pgoff_t done_index = 0;
@@ -2490,39 +2449,6 @@ static int ext4_da_writepages(struct address_space *mapping,
                end = wbc->range_end >> PAGE_CACHE_SHIFT;
        }
 
-       /*
-        * This works around two forms of stupidity.  The first is in
-        * the writeback code, which caps the maximum number of pages
-        * written to be 1024 pages.  This is wrong on multiple
-        * levels; different architectues have a different page size,
-        * which changes the maximum amount of data which gets
-        * written.  Secondly, 4 megabytes is way too small.  XFS
-        * forces this value to be 16 megabytes by multiplying
-        * nr_to_write parameter by four, and then relies on its
-        * allocator to allocate larger extents to make them
-        * contiguous.  Unfortunately this brings us to the second
-        * stupidity, which is that ext4's mballoc code only allocates
-        * at most 2048 blocks.  So we force contiguous writes up to
-        * the number of dirty blocks in the inode, or
-        * sbi->max_writeback_mb_bump whichever is smaller.
-        */
-       max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-       if (!range_cyclic && range_whole) {
-               if (wbc->nr_to_write == LONG_MAX)
-                       desired_nr_to_write = wbc->nr_to_write;
-               else
-                       desired_nr_to_write = wbc->nr_to_write * 8;
-       } else
-               desired_nr_to_write = ext4_num_dirty_pages(inode, index,
-                                                          max_pages);
-       if (desired_nr_to_write > max_pages)
-               desired_nr_to_write = max_pages;
-
-       if (wbc->nr_to_write < desired_nr_to_write) {
-               nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
-               wbc->nr_to_write = desired_nr_to_write;
-       }
-
 retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, index, end);
@@ -2615,7 +2541,6 @@ retry:
                mapping->writeback_index = done_index;
 
 out_writepages:
-       wbc->nr_to_write -= nr_to_writebump;
        wbc->range_start = range_start;
        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
        return ret;
@@ -2829,7 +2754,8 @@ static int ext4_da_write_end(struct file *file,
        return ret ? ret : copied;
 }
 
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+                                  unsigned int length)
 {
        /*
         * Drop reserved blocks
@@ -2838,10 +2764,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
        if (!page_has_buffers(page))
                goto out;
 
-       ext4_da_page_release_reservation(page, offset);
+       ext4_da_page_release_reservation(page, offset, length);
 
 out:
-       ext4_invalidatepage(page, offset);
+       ext4_invalidatepage(page, offset, length);
 
        return;
 }
@@ -2989,37 +2915,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+                               unsigned int length)
 {
-       trace_ext4_invalidatepage(page, offset);
+       trace_ext4_invalidatepage(page, offset, length);
 
        /* No journalling happens on data buffers when this function is used */
        WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
 
-       block_invalidatepage(page, offset);
+       block_invalidatepage(page, offset, length);
 }
 
 static int __ext4_journalled_invalidatepage(struct page *page,
-                                           unsigned long offset)
+                                           unsigned int offset,
+                                           unsigned int length)
 {
        journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
-       trace_ext4_journalled_invalidatepage(page, offset);
+       trace_ext4_journalled_invalidatepage(page, offset, length);
 
        /*
         * If it's a full truncate we just forget about the pending dirtying
         */
-       if (offset == 0)
+       if (offset == 0 && length == PAGE_CACHE_SIZE)
                ClearPageChecked(page);
 
-       return jbd2_journal_invalidatepage(journal, page, offset);
+       return jbd2_journal_invalidatepage(journal, page, offset, length);
 }
 
 /* Wrapper for aops... */
 static void ext4_journalled_invalidatepage(struct page *page,
-                                          unsigned long offset)
+                                          unsigned int offset,
+                                          unsigned int length)
 {
-       WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
+       WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
 }
 
 static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +2996,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
        struct inode *inode = file_inode(iocb->ki_filp);
         ext4_io_end_t *io_end = iocb->private;
 
-       /* if not async direct IO or dio with 0 bytes write, just return */
-       if (!io_end || !size)
-               goto out;
+       /* if not async direct IO just return */
+       if (!io_end) {
+               inode_dio_done(inode);
+               if (is_async)
+                       aio_complete(iocb, ret, 0);
+               return;
+       }
 
        ext_debug("ext4_end_io_dio(): io_end 0x%p "
                  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3010,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                  size);
 
        iocb->private = NULL;
-
-       /* if not aio dio with unwritten extents, just free io and return */
-       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-               ext4_free_io_end(io_end);
-out:
-               inode_dio_done(inode);
-               if (is_async)
-                       aio_complete(iocb, ret, 0);
-               return;
-       }
-
        io_end->offset = offset;
        io_end->size = size;
        if (is_async) {
                io_end->iocb = iocb;
                io_end->result = ret;
        }
-
-       ext4_add_complete_io(io_end);
+       ext4_put_io_end_defer(io_end);
 }
 
 /*
@@ -3129,6 +3050,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
        get_block_t *get_block_func = NULL;
        int dio_flags = 0;
        loff_t final_size = offset + count;
+       ext4_io_end_t *io_end = NULL;
 
        /* Use the old path for reads and writes beyond i_size. */
        if (rw != WRITE || final_size > inode->i_size)
@@ -3167,13 +3089,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
        iocb->private = NULL;
        ext4_inode_aio_set(inode, NULL);
        if (!is_sync_kiocb(iocb)) {
-               ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+               io_end = ext4_init_io_end(inode, GFP_NOFS);
                if (!io_end) {
                        ret = -ENOMEM;
                        goto retake_lock;
                }
                io_end->flag |= EXT4_IO_END_DIRECT;
-               iocb->private = io_end;
+               /*
+                * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+                */
+               iocb->private = ext4_get_io_end(io_end);
                /*
                 * we save the io structure for current async direct
                 * IO, so that later ext4_map_blocks() could flag the
@@ -3197,26 +3122,35 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                                   NULL,
                                   dio_flags);
 
-       if (iocb->private)
-               ext4_inode_aio_set(inode, NULL);
        /*
-        * The io_end structure takes a reference to the inode, that
-        * structure needs to be destroyed and the reference to the
-        * inode need to be dropped, when IO is complete, even with 0
-        * byte write, or failed.
-        *
-        * In the successful AIO DIO case, the io_end structure will
-        * be destroyed and the reference to the inode will be dropped
-        * after the end_io call back function is called.
-        *
-        * In the case there is 0 byte write, or error case, since VFS
-        * direct IO won't invoke the end_io call back function, we
-        * need to free the end_io structure here.
+        * Put our reference to io_end. This can free the io_end structure e.g.
+        * in sync IO case or in case of error. It can even perform extent
+        * conversion if all bios we submitted finished before we got here.
+        * Note that in that case iocb->private can be already set to NULL
+        * here.
         */
-       if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
-               ext4_free_io_end(iocb->private);
-               iocb->private = NULL;
-       } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+       if (io_end) {
+               ext4_inode_aio_set(inode, NULL);
+               ext4_put_io_end(io_end);
+               /*
+                * When no IO was submitted ext4_end_io_dio() was not
+                * called so we have to put iocb's reference.
+                */
+               if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+                       WARN_ON(iocb->private != io_end);
+                       WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+                       WARN_ON(io_end->iocb);
+                       /*
+                        * Generic code already did inode_dio_done() so we
+                        * have to clear EXT4_IO_END_DIRECT to not do it for
+                        * the second time.
+                        */
+                       io_end->flag = 0;
+                       ext4_put_io_end(io_end);
+                       iocb->private = NULL;
+               }
+       }
+       if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
                                                EXT4_STATE_DIO_UNWRITTEN)) {
                int err;
                /*
@@ -3355,89 +3289,56 @@ void ext4_set_aops(struct inode *inode)
                inode->i_mapping->a_ops = &ext4_aops;
 }
 
-
 /*
- * ext4_discard_partial_page_buffers()
- * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
- * This function finds and locks the page containing the offset
- * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
- * Calling functions that already have the page locked should call
- * ext4_discard_partial_page_buffers_no_lock directly.
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
  */
-int ext4_discard_partial_page_buffers(handle_t *handle,
-               struct address_space *mapping, loff_t from,
-               loff_t length, int flags)
+int ext4_block_truncate_page(handle_t *handle,
+               struct address_space *mapping, loff_t from)
 {
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned length;
+       unsigned blocksize;
        struct inode *inode = mapping->host;
-       struct page *page;
-       int err = 0;
 
-       page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
-                                  mapping_gfp_mask(mapping) & ~__GFP_FS);
-       if (!page)
-               return -ENOMEM;
-
-       err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
-               from, length, flags);
+       blocksize = inode->i_sb->s_blocksize;
+       length = blocksize - (offset & (blocksize - 1));
 
-       unlock_page(page);
-       page_cache_release(page);
-       return err;
+       return ext4_block_zero_page_range(handle, mapping, from, length);
 }
 
 /*
- * ext4_discard_partial_page_buffers_no_lock()
- * Zeros a page range of length 'length' starting from offset 'from'.
- * Buffer heads that correspond to the block aligned regions of the
- * zeroed range will be unmapped.  Unblock aligned regions
- * will have the corresponding buffer head mapped if needed so that
- * that region of the page can be updated with the partial zero out.
- *
- * This function assumes that the page has already been  locked.  The
- * The range to be discarded must be contained with in the given page.
- * If the specified range exceeds the end of the page it will be shortened
- * to the end of the page that corresponds to 'from'.  This function is
- * appropriate for updating a page and it buffer heads to be unmapped and
- * zeroed for blocks that have been either released, or are going to be
- * released.
- *
- * handle: The journal handle
- * inode:  The files inode
- * page:   A locked page that contains the offset "from"
- * from:   The starting byte offset (from the beginning of the file)
- *         to begin discarding
- * len:    The length of bytes to discard
- * flags:  Optional flags that may be used:
- *
- *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- *         Only zero the regions of the page whose buffer heads
- *         have already been unmapped.  This flag is appropriate
- *         for updating the contents of a page whose blocks may
- *         have already been released, and we only want to zero
- *         out the regions that correspond to those released blocks.
- *
- * Returns zero on success or negative on failure.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zero'd must
+ * be contained with in one block.  If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that corresponds to 'from'
  */
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-               struct inode *inode, struct page *page, loff_t from,
-               loff_t length, int flags)
+int ext4_block_zero_page_range(handle_t *handle,
+               struct address_space *mapping, loff_t from, loff_t length)
 {
        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-       unsigned int offset = from & (PAGE_CACHE_SIZE-1);
-       unsigned int blocksize, max, pos;
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned blocksize, max, pos;
        ext4_lblk_t iblock;
+       struct inode *inode = mapping->host;
        struct buffer_head *bh;
+       struct page *page;
        int err = 0;
 
-       blocksize = inode->i_sb->s_blocksize;
-       max = PAGE_CACHE_SIZE - offset;
+       page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+                                  mapping_gfp_mask(mapping) & ~__GFP_FS);
+       if (!page)
+               return -ENOMEM;
 
-       if (index != page->index)
-               return -EINVAL;
+       blocksize = inode->i_sb->s_blocksize;
+       max = blocksize - (offset & (blocksize - 1));
 
        /*
         * correct length if it does not fall between
-        * 'from' and the end of the page
+        * 'from' and the end of the block
         */
        if (length > max || length < 0)
                length = max;
@@ -3456,105 +3357,93 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
                pos += blocksize;
        }
 
-       pos = offset;
-       while (pos < offset + length) {
-               unsigned int end_of_block, range_to_discard;
-
-               err = 0;
-
-               /* The length of space left to zero and unmap */
-               range_to_discard = offset + length - pos;
+       err = 0;
+       if (buffer_freed(bh)) {
+               BUFFER_TRACE(bh, "freed: skip");
+               goto unlock;
+       }
 
-               /* The length of space until the end of the block */
-               end_of_block = blocksize - (pos & (blocksize-1));
+       if (!buffer_mapped(bh)) {
+               BUFFER_TRACE(bh, "unmapped");
+               ext4_get_block(inode, iblock, bh, 0);
+               /* unmapped? It's a hole - nothing to do */
+               if (!buffer_mapped(bh)) {
+                       BUFFER_TRACE(bh, "still unmapped");
+                       goto unlock;
+               }
+       }
 
-               /*
-                * Do not unmap or zero past end of block
-                * for this buffer head
-                */
-               if (range_to_discard > end_of_block)
-                       range_to_discard = end_of_block;
+       /* Ok, it's mapped. Make sure it's up-to-date */
+       if (PageUptodate(page))
+               set_buffer_uptodate(bh);
 
+       if (!buffer_uptodate(bh)) {
+               err = -EIO;
+               ll_rw_block(READ, 1, &bh);
+               wait_on_buffer(bh);
+               /* Uhhuh. Read error. Complain and punt. */
+               if (!buffer_uptodate(bh))
+                       goto unlock;
+       }
 
-               /*
-                * Skip this buffer head if we are only zeroing unampped
-                * regions of the page
-                */
-               if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
-                       buffer_mapped(bh))
-                               goto next;
-
-               /* If the range is block aligned, unmap */
-               if (range_to_discard == blocksize) {
-                       clear_buffer_dirty(bh);
-                       bh->b_bdev = NULL;
-                       clear_buffer_mapped(bh);
-                       clear_buffer_req(bh);
-                       clear_buffer_new(bh);
-                       clear_buffer_delay(bh);
-                       clear_buffer_unwritten(bh);
-                       clear_buffer_uptodate(bh);
-                       zero_user(page, pos, range_to_discard);
-                       BUFFER_TRACE(bh, "Buffer discarded");
-                       goto next;
-               }
+       if (ext4_should_journal_data(inode)) {
+               BUFFER_TRACE(bh, "get write access");
+               err = ext4_journal_get_write_access(handle, bh);
+               if (err)
+                       goto unlock;
+       }
 
-               /*
-                * If this block is not completely contained in the range
-                * to be discarded, then it is not going to be released. Because
-                * we need to keep this block, we need to make sure this part
-                * of the page is uptodate before we modify it by writeing
-                * partial zeros on it.
-                */
-               if (!buffer_mapped(bh)) {
-                       /*
-                        * Buffer head must be mapped before we can read
-                        * from the block
-                        */
-                       BUFFER_TRACE(bh, "unmapped");
-                       ext4_get_block(inode, iblock, bh, 0);
-                       /* unmapped? It's a hole - nothing to do */
-                       if (!buffer_mapped(bh)) {
-                               BUFFER_TRACE(bh, "still unmapped");
-                               goto next;
-                       }
-               }
+       zero_user(page, offset, length);
 
-               /* Ok, it's mapped. Make sure it's up-to-date */
-               if (PageUptodate(page))
-                       set_buffer_uptodate(bh);
+       BUFFER_TRACE(bh, "zeroed end of block");
 
-               if (!buffer_uptodate(bh)) {
-                       err = -EIO;
-                       ll_rw_block(READ, 1, &bh);
-                       wait_on_buffer(bh);
-                       /* Uhhuh. Read error. Complain and punt.*/
-                       if (!buffer_uptodate(bh))
-                               goto next;
-               }
+       err = 0;
+       if (ext4_should_journal_data(inode)) {
+               err = ext4_handle_dirty_metadata(handle, inode, bh);
+       } else {
+               mark_buffer_dirty(bh);
+               if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+                       err = ext4_jbd2_file_inode(handle, inode);
+       }
 
-               if (ext4_should_journal_data(inode)) {
-                       BUFFER_TRACE(bh, "get write access");
-                       err = ext4_journal_get_write_access(handle, bh);
-                       if (err)
-                               goto next;
-               }
+unlock:
+       unlock_page(page);
+       page_cache_release(page);
+       return err;
+}
 
-               zero_user(page, pos, range_to_discard);
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+                            loff_t lstart, loff_t length)
+{
+       struct super_block *sb = inode->i_sb;
+       struct address_space *mapping = inode->i_mapping;
+       unsigned partial = lstart & (sb->s_blocksize - 1);
+       ext4_fsblk_t start, end;
+       loff_t byte_end = (lstart + length - 1);
+       int err = 0;
 
-               err = 0;
-               if (ext4_should_journal_data(inode)) {
-                       err = ext4_handle_dirty_metadata(handle, inode, bh);
-               } else
-                       mark_buffer_dirty(bh);
+       start = lstart >> sb->s_blocksize_bits;
+       end = byte_end >> sb->s_blocksize_bits;
 
-               BUFFER_TRACE(bh, "Partial buffer zeroed");
-next:
-               bh = bh->b_this_page;
-               iblock++;
-               pos += range_to_discard;
+       /* Handle partial zero within the single block */
+       if (start == end) {
+               err = ext4_block_zero_page_range(handle, mapping,
+                                                lstart, length);
+               return err;
        }
-
+       /* Handle partial zero out on the start of the range */
+       if (partial) {
+               err = ext4_block_zero_page_range(handle, mapping,
+                                                lstart, sb->s_blocksize);
+               if (err)
+                       return err;
+       }
+       /* Handle partial zero out on the end of the range */
+       partial = byte_end & (sb->s_blocksize - 1);
+       if (partial != sb->s_blocksize - 1)
+               err = ext4_block_zero_page_range(handle, mapping,
+                                                byte_end - partial,
+                                                partial + 1);
        return err;
 }
 
@@ -3586,8 +3475,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t first_block, stop_block;
        struct address_space *mapping = inode->i_mapping;
-       loff_t first_page, last_page, page_len;
-       loff_t first_page_offset, last_page_offset;
+       loff_t first_block_offset, last_block_offset;
        handle_t *handle;
        unsigned int credits;
        int ret = 0;
@@ -3638,17 +3526,13 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
                   offset;
        }
 
-       first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-       last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
-       first_page_offset = first_page << PAGE_CACHE_SHIFT;
-       last_page_offset = last_page << PAGE_CACHE_SHIFT;
+       first_block_offset = round_up(offset, sb->s_blocksize);
+       last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
-       /* Now release the pages */
-       if (last_page_offset > first_page_offset) {
-               truncate_pagecache_range(inode, first_page_offset,
-                                        last_page_offset - 1);
-       }
+       /* Now release the pages and zero block aligned part of pages */
+       if (last_block_offset > first_block_offset)
+               truncate_pagecache_range(inode, first_block_offset,
+                                        last_block_offset);
 
        /* Wait all existing dio workers, newcomers will block on i_mutex */
        ext4_inode_block_unlocked_dio(inode);
@@ -3668,66 +3552,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
                goto out_dio;
        }
 
-       /*
-        * Now we need to zero out the non-page-aligned data in the
-        * pages at the start and tail of the hole, and unmap the
-        * buffer heads for the block aligned regions of the page that
-        * were completely zeroed.
-        */
-       if (first_page > last_page) {
-               /*
-                * If the file space being truncated is contained
-                * within a page just zero out and unmap the middle of
-                * that page
-                */
-               ret = ext4_discard_partial_page_buffers(handle,
-                       mapping, offset, length, 0);
-
-               if (ret)
-                       goto out_stop;
-       } else {
-               /*
-                * zero out and unmap the partial page that contains
-                * the start of the hole
-                */
-               page_len = first_page_offset - offset;
-               if (page_len > 0) {
-                       ret = ext4_discard_partial_page_buffers(handle, mapping,
-                                               offset, page_len, 0);
-                       if (ret)
-                               goto out_stop;
-               }
-
-               /*
-                * zero out and unmap the partial page that contains
-                * the end of the hole
-                */
-               page_len = offset + length - last_page_offset;
-               if (page_len > 0) {
-                       ret = ext4_discard_partial_page_buffers(handle, mapping,
-                                       last_page_offset, page_len, 0);
-                       if (ret)
-                               goto out_stop;
-               }
-       }
-
-       /*
-        * If i_size is contained in the last page, we need to
-        * unmap and zero the partial page after i_size
-        */
-       if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
-          inode->i_size % PAGE_CACHE_SIZE != 0) {
-               page_len = PAGE_CACHE_SIZE -
-                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-               if (page_len > 0) {
-                       ret = ext4_discard_partial_page_buffers(handle,
-                                       mapping, inode->i_size, page_len, 0);
-
-                       if (ret)
-                               goto out_stop;
-               }
-       }
+       ret = ext4_zero_partial_blocks(handle, inode, offset,
+                                      length);
+       if (ret)
+               goto out_stop;
 
        first_block = (offset + sb->s_blocksize - 1) >>
                EXT4_BLOCK_SIZE_BITS(sb);
@@ -3803,7 +3631,6 @@ void ext4_truncate(struct inode *inode)
        unsigned int credits;
        handle_t *handle;
        struct address_space *mapping = inode->i_mapping;
-       loff_t page_len;
 
        /*
         * There is a possibility that we're either freeing the inode
@@ -3847,14 +3674,8 @@ void ext4_truncate(struct inode *inode)
                return;
        }
 
-       if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-               page_len = PAGE_CACHE_SIZE -
-                       (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-               if (ext4_discard_partial_page_buffers(handle,
-                               mapping, inode->i_size, page_len, 0))
-                       goto out_stop;
-       }
+       if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+               ext4_block_truncate_page(handle, mapping, inode->i_size);
 
        /*
         * We add the inode to the orphan list, so that if this
@@ -4623,7 +4444,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
                                      inode->i_size >> PAGE_CACHE_SHIFT);
                if (!page)
                        return;
-               ret = __ext4_journalled_invalidatepage(page, offset);
+               ret = __ext4_journalled_invalidatepage(page, offset,
+                                               PAGE_CACHE_SIZE - offset);
                unlock_page(page);
                page_cache_release(page);
                if (ret != -EBUSY)
@@ -4805,7 +4627,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
                 struct kstat *stat)
 {
        struct inode *inode;
-       unsigned long delalloc_blocks;
+       unsigned long long delalloc_blocks;
 
        inode = dentry->d_inode;
        generic_fillattr(inode, stat);
@@ -4823,15 +4645,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
        delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
                                EXT4_I(inode)->i_reserved_data_blocks);
 
-       stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+       stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
        return 0;
 }
 
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+                                  int pextents)
 {
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-               return ext4_ind_trans_blocks(inode, nrblocks, chunk);
-       return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+               return ext4_ind_trans_blocks(inode, lblocks);
+       return ext4_ext_index_trans_blocks(inode, pextents);
 }
 
 /*
@@ -4845,7 +4668,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+                                 int pextents)
 {
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
@@ -4853,14 +4677,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
        int ret = 0;
 
        /*
-        * How many index blocks need to touch to modify nrblocks?
-        * The "Chunk" flag indicating whether the nrblocks is
-        * physically contiguous on disk
-        *
-        * For Direct IO and fallocate, they calls get_block to allocate
-        * one single extent at a time, so they could set the "Chunk" flag
+        * How many index blocks do we need to touch to map @lblocks logical blocks
+        * to @pextents physical extents?
         */
-       idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+       idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
 
        ret = idxblocks;
 
@@ -4868,12 +4688,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
         * Now let's see how many group bitmaps and group descriptors need
         * to account
         */
-       groups = idxblocks;
-       if (chunk)
-               groups += 1;
-       else
-               groups += nrblocks;
-
+       groups = idxblocks + pextents;
        gdpblocks = groups;
        if (groups > ngroups)
                groups = ngroups;
@@ -4904,7 +4719,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
        int bpp = ext4_journal_blocks_per_page(inode);
        int ret;
 
-       ret = ext4_meta_trans_blocks(inode, bpp, 0);
+       ret = ext4_meta_trans_blocks(inode, bpp, bpp);
 
        /* Account for data blocks for journalled mode */
        if (ext4_should_journal_data(inode))