git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge tag 'writeback-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 28 Feb 2013 21:21:44 +0000 (13:21 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 28 Feb 2013 21:21:44 +0000 (13:21 -0800)
Pull writeback fixes from Wu Fengguang:
 "Two writeback fixes

   - fix negative (setpoint - dirty) in 32bit archs

   - use down_read_trylock() in writeback_inodes_sb(_nr)_if_idle()"

* tag 'writeback-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
  Negative (setpoint-dirty) in bdi_position_ratio()
  vfs: re-implement writeback_inodes_sb(_nr)_if_idle() and rename them
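
For context on the first fix: on 32-bit architectures, setpoint and dirty
are 32-bit unsigned longs, so setpoint - dirty wraps to a huge positive
value whenever dirty exceeds setpoint, and bdi_position_ratio() then
computes a wildly wrong ratio.  A minimal standalone sketch of the bug and
the fix (uint32_t stands in for a 32-bit unsigned long; this is a demo, not
the kernel code itself):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Model a 32-bit unsigned long, as on i386/ARM. */
            uint32_t setpoint = 100, dirty = 150;

            /* Unsigned subtraction wraps within 32 bits and only then
             * widens to s64: 4294967246 instead of -50. */
            int64_t wrong = (int64_t)(setpoint - dirty);

            /* The fix widens each operand to a signed 64-bit type
             * before subtracting, as the div_s64() call in
             * bdi_position_ratio() now does. */
            int64_t right = (int64_t)setpoint - (int64_t)dirty;

            printf("wrong=%lld right=%lld\n",
                   (long long)wrong, (long long)right);
            return 0;
    }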

fs/btrfs/extent-tree.c
fs/ext4/inode.c
fs/fs-writeback.c
mm/page-writeback.c

diff --combined fs/btrfs/extent-tree.c
index 1e59ed575cc991819a980b1ea6facd9d8a683995,f31abb14e06f171054f182864acb1dabf6fd1335..cf54bdfee334287383e7b63badc5a3683e2edf4f
@@@ -3689,20 -3689,6 +3689,6 @@@ static int can_overcommit(struct btrfs_
        return 0;
  }
  
- static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
-                                              unsigned long nr_pages,
-                                              enum wb_reason reason)
- {
-       if (!writeback_in_progress(sb->s_bdi) &&
-           down_read_trylock(&sb->s_umount)) {
-               writeback_inodes_sb_nr(sb, nr_pages, reason);
-               up_read(&sb->s_umount);
-               return 1;
-       }
-       return 0;
- }
  /*
   * shrink metadata reservation for delalloc
   */
@@@ -3735,9 -3721,9 +3721,9 @@@ static void shrink_delalloc(struct btrf
        while (delalloc_bytes && loops < 3) {
                max_reclaim = min(delalloc_bytes, to_reclaim);
                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-               writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
-                                                   nr_pages,
-                                                   WB_REASON_FS_FREE_SPACE);
+               try_to_writeback_inodes_sb_nr(root->fs_info->sb,
+                                             nr_pages,
+                                             WB_REASON_FS_FREE_SPACE);
  
                /*
                 * We need to wait for the async pages to actually start before
@@@ -3997,7 -3983,7 +3983,7 @@@ again
         * We make the other tasks wait for the flush only when we can flush
         * all things.
         */
 -      if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
 +      if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                flushing = true;
                space_info->flush = 1;
        }
@@@ -4534,7 -4520,7 +4520,7 @@@ int btrfs_delalloc_reserve_metadata(str
        unsigned nr_extents = 0;
        int extra_reserve = 0;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 -      int ret;
 +      int ret = 0;
        bool delalloc_lock = true;
  
        /* If we are a free space inode we need to not flush since we will be in
        csum_bytes = BTRFS_I(inode)->csum_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
  
 -      if (root->fs_info->quota_enabled) {
 +      if (root->fs_info->quota_enabled)
                ret = btrfs_qgroup_reserve(root, num_bytes +
                                           nr_extents * root->leafsize);
 -              if (ret) {
 -                      spin_lock(&BTRFS_I(inode)->lock);
 -                      calc_csum_metadata_size(inode, num_bytes, 0);
 -                      spin_unlock(&BTRFS_I(inode)->lock);
 -                      if (delalloc_lock)
 -                              mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 -                      return ret;
 -              }
 -      }
  
 -      ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 +      /*
 +       * ret != 0 here means the qgroup reservation failed, we go straight to
 +       * the shared error handling then.
 +       */
 +      if (ret == 0)
 +              ret = reserve_metadata_bytes(root, block_rsv,
 +                                           to_reserve, flush);
 +
        if (ret) {
                u64 to_free = 0;
                unsigned dropped;
@@@ -5558,7 -5546,7 +5544,7 @@@ static noinline int find_free_extent(st
        int empty_cluster = 2 * 1024 * 1024;
        struct btrfs_space_info *space_info;
        int loop = 0;
 -      int index = 0;
 +      int index = __get_raid_index(data);
        int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
        bool found_uncached_bg = false;
@@@ -6522,7 -6510,7 +6508,7 @@@ reada
  }
  
  /*
 - * hepler to process tree block while walking down the tree.
 + * helper to process tree block while walking down the tree.
   *
   * when wc->stage == UPDATE_BACKREF, this function updates
   * back refs for pointers in the block.
@@@ -6597,7 -6585,7 +6583,7 @@@ static noinline int walk_down_proc(stru
  }
  
  /*
 - * hepler to process tree block pointer.
 + * helper to process tree block pointer.
   *
   * when wc->stage == DROP_REFERENCE, this function checks
   * reference count of the block pointed to. if the block
@@@ -6735,7 -6723,7 +6721,7 @@@ skip
  }
  
  /*
 - * hepler to process tree block while walking up the tree.
 + * helper to process tree block while walking up the tree.
   *
   * when wc->stage == DROP_REFERENCE, this function drops
   * reference count on the block.
@@@ -6786,13 -6774,11 +6772,13 @@@ static noinline int walk_up_proc(struc
                                                       &wc->flags[level]);
                        if (ret < 0) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
 +                              path->locks[level] = 0;
                                return ret;
                        }
                        BUG_ON(wc->refs[level] == 0);
                        if (wc->refs[level] == 1) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
 +                              path->locks[level] = 0;
                                return 1;
                        }
                }
diff --combined fs/ext4/inode.c
index 9c4f4b1c97f84aad654dd8f70cf5231c78a6fc52,5f6eef71ff214af0d29a40c2a276eaab41c08533..9ea0cde3fa9e0ffe7aebc28940293c422ae75a63
@@@ -132,6 -132,10 +132,6 @@@ static inline int ext4_begin_ordered_tr
  }
  
  static void ext4_invalidatepage(struct page *page, unsigned long offset);
 -static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 -                                 struct buffer_head *bh_result, int create);
 -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
 -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
  static int __ext4_journalled_writepage(struct page *page, unsigned int len);
  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
  static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
@@@ -234,8 -238,7 +234,8 @@@ void ext4_evict_inode(struct inode *ino
         * protection against it
         */
        sb_start_intwrite(inode->i_sb);
 -      handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
 +      handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
 +                                  ext4_blocks_for_truncate(inode)+3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
@@@ -343,7 -346,7 +343,7 @@@ void ext4_da_update_reserve_space(struc
        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_update_reserve_space(inode, used, quota_claim);
        if (unlikely(used > ei->i_reserved_data_blocks)) {
 -              ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
 +              ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks",
                         __func__, inode->i_ino, used,
                         ei->i_reserved_data_blocks);
        }
  
        if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
 -              ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
 -                       "with only %d reserved metadata blocks\n", __func__,
 -                       inode->i_ino, ei->i_allocated_meta_blocks,
 -                       ei->i_reserved_meta_blocks);
 +              ext4_warning(inode->i_sb, "ino %lu, allocated %d "
 +                      "with only %d reserved metadata blocks "
 +                      "(releasing %d blocks with reserved %d data blocks)",
 +                      inode->i_ino, ei->i_allocated_meta_blocks,
 +                           ei->i_reserved_meta_blocks, used,
 +                           ei->i_reserved_data_blocks);
                WARN_ON(1);
                ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
        }
@@@ -507,33 -508,12 +507,33 @@@ static pgoff_t ext4_num_dirty_pages(str
  int ext4_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags)
  {
 +      struct extent_status es;
        int retval;
  
        map->m_flags = 0;
        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
                  (unsigned long) map->m_lblk);
 +
 +      /* Lookup extent status tree firstly */
 +      if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
 +              if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
 +                      map->m_pblk = ext4_es_pblock(&es) +
 +                                      map->m_lblk - es.es_lblk;
 +                      map->m_flags |= ext4_es_is_written(&es) ?
 +                                      EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
 +                      retval = es.es_len - (map->m_lblk - es.es_lblk);
 +                      if (retval > map->m_len)
 +                              retval = map->m_len;
 +                      map->m_len = retval;
 +              } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
 +                      retval = 0;
 +              } else {
 +                      BUG_ON(1);
 +              }
 +              goto found;
 +      }
 +
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
                retval = ext4_ind_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
        }
 +      if (retval > 0) {
 +              int ret;
 +              unsigned long long status;
 +
 +              status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 +                              EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 +              if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
 +                  ext4_find_delalloc_range(inode, map->m_lblk,
 +                                           map->m_lblk + map->m_len - 1))
 +                      status |= EXTENT_STATUS_DELAYED;
 +              ret = ext4_es_insert_extent(inode, map->m_lblk,
 +                                          map->m_len, map->m_pblk, status);
 +              if (ret < 0)
 +                      retval = ret;
 +      }
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
                up_read((&EXT4_I(inode)->i_data_sem));
  
 +found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 -              int ret;
 -              if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 -                      /* delayed alloc may be allocated by fallocate and
 -                       * coverted to initialized by directIO.
 -                       * we need to handle delayed extent here.
 -                       */
 -                      down_write((&EXT4_I(inode)->i_data_sem));
 -                      goto delayed_mapped;
 -              }
 -              ret = check_block_validity(inode, map);
 +              int ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
                return retval;
  
        /*
 -       * When we call get_blocks without the create flag, the
 -       * BH_Unwritten flag could have gotten set if the blocks
 -       * requested were part of a uninitialized extent.  We need to
 -       * clear this flag now that we are committed to convert all or
 -       * part of the uninitialized extent to be an initialized
 -       * extent.  This is because we need to avoid the combination
 -       * of BH_Unwritten and BH_Mapped flags being simultaneously
 -       * set on the buffer_head.
 +       * Here we clear m_flags because after allocating an new extent,
 +       * it will be set again.
         */
 -      map->m_flags &= ~EXT4_MAP_UNWRITTEN;
 +      map->m_flags &= ~EXT4_MAP_FLAGS;
  
        /*
         * New blocks allocate and/or writing to uninitialized extent
                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
                        ext4_da_update_reserve_space(inode, retval, 1);
        }
 -      if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 +      if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
  
 -              if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 -                      int ret;
 -delayed_mapped:
 -                      /* delayed allocation blocks has been allocated */
 -                      ret = ext4_es_remove_extent(inode, map->m_lblk,
 -                                                  map->m_len);
 -                      if (ret < 0)
 -                              retval = ret;
 -              }
 +      if (retval > 0) {
 +              int ret;
 +              unsigned long long status;
 +
 +              status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 +                              EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 +              if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
 +                  ext4_find_delalloc_range(inode, map->m_lblk,
 +                                           map->m_lblk + map->m_len - 1))
 +                      status |= EXTENT_STATUS_DELAYED;
 +              ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 +                                          map->m_pblk, status);
 +              if (ret < 0)
 +                      retval = ret;
        }
  
        up_write((&EXT4_I(inode)->i_data_sem));
@@@ -686,8 -660,7 +686,8 @@@ static int _ext4_get_block(struct inod
                if (map.m_len > DIO_MAX_BLOCKS)
                        map.m_len = DIO_MAX_BLOCKS;
                dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
 -              handle = ext4_journal_start(inode, dio_credits);
 +              handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
 +                                          dio_credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        return ret;
@@@ -734,16 -707,14 +734,16 @@@ struct buffer_head *ext4_getblk(handle_
        /* ensure we send some value back into *errp */
        *errp = 0;
  
 +      if (create && err == 0)
 +              err = -ENOSPC;  /* should never happen */
        if (err < 0)
                *errp = err;
        if (err <= 0)
                return NULL;
  
        bh = sb_getblk(inode->i_sb, map.m_pblk);
 -      if (!bh) {
 -              *errp = -EIO;
 +      if (unlikely(!bh)) {
 +              *errp = -ENOMEM;
                return NULL;
        }
        if (map.m_flags & EXT4_MAP_NEW) {
@@@ -837,10 -808,11 +837,10 @@@ int ext4_walk_page_buffers(handle_t *ha
   * and the commit_write().  So doing the jbd2_journal_start at the start of
   * prepare_write() is the right place.
   *
 - * Also, this function can nest inside ext4_writepage() ->
 - * block_write_full_page(). In that case, we *know* that ext4_writepage()
 - * has generated enough buffer credits to do the whole page.  So we won't
 - * block on the journal in that case, which is good, because the caller may
 - * be PF_MEMALLOC.
 + * Also, this function can nest inside ext4_writepage().  In that case, we
 + * *know* that ext4_writepage() has generated enough buffer credits to do the
 + * whole page.  So we won't block on the journal in that case, which is good,
 + * because the caller may be PF_MEMALLOC.
   *
   * By accident, ext4 can be reentered when a transaction is open via
   * quota file writes.  If we were to commit the transaction while thus
@@@ -906,40 -878,32 +906,40 @@@ static int ext4_write_begin(struct fil
                ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
                                                    flags, pagep);
                if (ret < 0)
 -                      goto out;
 -              if (ret == 1) {
 -                      ret = 0;
 -                      goto out;
 -              }
 +                      return ret;
 +              if (ret == 1)
 +                      return 0;
        }
  
 -retry:
 -      handle = ext4_journal_start(inode, needed_blocks);
 +      /*
 +       * grab_cache_page_write_begin() can take a long time if the
 +       * system is thrashing due to memory pressure, or if the page
 +       * is being written back.  So grab it first before we start
 +       * the transaction handle.  This also allows us to allocate
 +       * the page (if needed) without using GFP_NOFS.
 +       */
 +retry_grab:
 +      page = grab_cache_page_write_begin(mapping, index, flags);
 +      if (!page)
 +              return -ENOMEM;
 +      unlock_page(page);
 +
 +retry_journal:
 +      handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
 -              ret = PTR_ERR(handle);
 -              goto out;
 +              page_cache_release(page);
 +              return PTR_ERR(handle);
        }
  
 -      /* We cannot recurse into the filesystem as the transaction is already
 -       * started */
 -      flags |= AOP_FLAG_NOFS;
 -
 -      page = grab_cache_page_write_begin(mapping, index, flags);
 -      if (!page) {
 +      lock_page(page);
 +      if (page->mapping != mapping) {
 +              /* The page got truncated from under us */
 +              unlock_page(page);
 +              page_cache_release(page);
                ext4_journal_stop(handle);
 -              ret = -ENOMEM;
 -              goto out;
 +              goto retry_grab;
        }
 -
 -      *pagep = page;
 +      wait_on_page_writeback(page);
  
        if (ext4_should_dioread_nolock(inode))
                ret = __block_write_begin(page, pos, len, ext4_get_block_write);
  
        if (ret) {
                unlock_page(page);
 -              page_cache_release(page);
                /*
                 * __block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                        if (inode->i_nlink)
                                ext4_orphan_del(NULL, inode);
                }
 -      }
  
 -      if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 -              goto retry;
 -out:
 +              if (ret == -ENOSPC &&
 +                  ext4_should_retry_alloc(inode->i_sb, &retries))
 +                      goto retry_journal;
 +              page_cache_release(page);
 +              return ret;
 +      }
 +      *pagep = page;
        return ret;
  }
  
@@@ -1294,7 -1256,7 +1294,7 @@@ static void ext4_da_release_space(struc
                 * function is called from invalidate page, it's
                 * harmless to return without any action.
                 */
 -              ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
 +              ext4_warning(inode->i_sb, "ext4_da_release_space: "
                         "ino %lu, to_free %d with only %d reserved "
                         "data blocks", inode->i_ino, to_free,
                         ei->i_reserved_data_blocks);
@@@ -1395,6 -1357,7 +1395,6 @@@ static int mpage_da_submit_io(struct mp
        loff_t size = i_size_read(inode);
        unsigned int len, block_start;
        struct buffer_head *bh, *page_bufs = NULL;
 -      int journal_data = ext4_should_journal_data(inode);
        sector_t pblock = 0, cur_logical = 0;
        struct ext4_io_submit io_submit;
  
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
 -                      int commit_write = 0, skip_page = 0;
 +                      int skip_page = 0;
                        struct page *page = pvec.pages[i];
  
                        index = page->index;
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
  
 -                      /*
 -                       * If the page does not have buffers (for
 -                       * whatever reason), try to create them using
 -                       * __block_write_begin.  If this fails,
 -                       * skip the page and move on.
 -                       */
 -                      if (!page_has_buffers(page)) {
 -                              if (__block_write_begin(page, 0, len,
 -                                              noalloc_get_block_write)) {
 -                              skip_page:
 -                                      unlock_page(page);
 -                                      continue;
 -                              }
 -                              commit_write = 1;
 -                      }
 -
                        bh = page_bufs = page_buffers(page);
                        block_start = 0;
                        do {
 -                              if (!bh)
 -                                      goto skip_page;
                                if (map && (cur_logical >= map->m_lblk) &&
                                    (cur_logical <= (map->m_lblk +
                                                     (map->m_len - 1)))) {
                                pblock++;
                        } while (bh != page_bufs);
  
 -                      if (skip_page)
 -                              goto skip_page;
 -
 -                      if (commit_write)
 -                              /* mark the buffer_heads as dirty & uptodate */
 -                              block_commit_write(page, 0, len);
 +                      if (skip_page) {
 +                              unlock_page(page);
 +                              continue;
 +                      }
  
                        clear_page_dirty_for_io(page);
 -                      /*
 -                       * Delalloc doesn't support data journalling,
 -                       * but eventually maybe we'll lift this
 -                       * restriction.
 -                       */
 -                      if (unlikely(journal_data && PageChecked(page)))
 -                              err = __ext4_journalled_writepage(page, len);
 -                      else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
 -                              err = ext4_bio_write_page(&io_submit, page,
 -                                                        len, mpd->wbc);
 -                      else if (buffer_uninit(page_bufs)) {
 -                              ext4_set_bh_endio(page_bufs, inode);
 -                              err = block_write_full_page_endio(page,
 -                                      noalloc_get_block_write,
 -                                      mpd->wbc, ext4_end_io_buffer_write);
 -                      } else
 -                              err = block_write_full_page(page,
 -                                      noalloc_get_block_write, mpd->wbc);
 -
 +                      err = ext4_bio_write_page(&io_submit, page, len,
 +                                                mpd->wbc);
                        if (!err)
                                mpd->pages_written++;
                        /*
@@@ -1640,7 -1640,7 +1640,7 @@@ static void mpage_da_map_and_submit(str
                                 (unsigned long long) next,
                                 mpd->b_size >> mpd->inode->i_blkbits, err);
                        ext4_msg(sb, KERN_CRIT,
 -                              "This should not happen!! Data will be lost\n");
 +                              "This should not happen!! Data will be lost");
                        if (err == -ENOSPC)
                                ext4_print_free_blocks(mpd->inode);
                }
@@@ -1690,16 -1690,16 +1690,16 @@@ submit_io
   *
   * @mpd->lbh - extent of blocks
   * @logical - logical number of the block in the file
 - * @bh - bh of the block (used to access block's state)
 + * @b_state - b_state of the buffer head added
   *
   * the function is used to collect contig. blocks in same state
   */
 -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 -                                 sector_t logical, size_t b_size,
 +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
                                   unsigned long b_state)
  {
        sector_t next;
 -      int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 +      int blkbits = mpd->inode->i_blkbits;
 +      int nrblocks = mpd->b_size >> blkbits;
  
        /*
         * XXX Don't go larger than mballoc is willing to allocate
         * mpage_da_submit_io() into this function and then call
         * ext4_map_blocks() multiple times in a loop
         */
 -      if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
 +      if (nrblocks >= (8*1024*1024 >> blkbits))
                goto flush_it;
  
 -      /* check if thereserved journal credits might overflow */
 -      if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
 +      /* check if the reserved journal credits might overflow */
 +      if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
                if (nrblocks >= EXT4_MAX_TRANS_DATA) {
                        /*
                         * With non-extent format we are limited by the journal
                         * nrblocks.  So limit nrblocks.
                         */
                        goto flush_it;
 -              } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
 -                              EXT4_MAX_TRANS_DATA) {
 -                      /*
 -                       * Adding the new buffer_head would make it cross the
 -                       * allowed limit for which we have journal credit
 -                       * reserved. So limit the new bh->b_size
 -                       */
 -                      b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
 -                                              mpd->inode->i_blkbits;
 -                      /* we will do mpage_da_submit_io in the next loop */
                }
        }
        /*
         */
        if (mpd->b_size == 0) {
                mpd->b_blocknr = logical;
 -              mpd->b_size = b_size;
 +              mpd->b_size = 1 << blkbits;
                mpd->b_state = b_state & BH_FLAGS;
                return;
        }
         * Can we merge the block to our big extent?
         */
        if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
 -              mpd->b_size += b_size;
 +              mpd->b_size += 1 << blkbits;
                return;
        }
  
@@@ -1765,7 -1775,6 +1765,7 @@@ static int ext4_da_map_blocks(struct in
                              struct ext4_map_blocks *map,
                              struct buffer_head *bh)
  {
 +      struct extent_status es;
        int retval;
        sector_t invalid_block = ~((sector_t) 0xffff);
  
        ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
                  "logical block %lu\n", inode->i_ino, map->m_len,
                  (unsigned long) map->m_lblk);
 +
 +      /* Lookup extent status tree firstly */
 +      if (ext4_es_lookup_extent(inode, iblock, &es)) {
 +
 +              if (ext4_es_is_hole(&es)) {
 +                      retval = 0;
 +                      down_read((&EXT4_I(inode)->i_data_sem));
 +                      goto add_delayed;
 +              }
 +
 +              /*
 +               * Delayed extent could be allocated by fallocate.
 +               * So we need to check it.
 +               */
 +              if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
 +                      map_bh(bh, inode->i_sb, invalid_block);
 +                      set_buffer_new(bh);
 +                      set_buffer_delay(bh);
 +                      return 0;
 +              }
 +
 +              map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
 +              retval = es.es_len - (iblock - es.es_lblk);
 +              if (retval > map->m_len)
 +                      retval = map->m_len;
 +              map->m_len = retval;
 +              if (ext4_es_is_written(&es))
 +                      map->m_flags |= EXT4_MAP_MAPPED;
 +              else if (ext4_es_is_unwritten(&es))
 +                      map->m_flags |= EXT4_MAP_UNWRITTEN;
 +              else
 +                      BUG_ON(1);
 +
 +              return retval;
 +      }
 +
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
                        map->m_flags |= EXT4_MAP_FROM_CLUSTER;
                retval = 0;
        } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 -              retval = ext4_ext_map_blocks(NULL, inode, map, 0);
 +              retval = ext4_ext_map_blocks(NULL, inode, map,
 +                                           EXT4_GET_BLOCKS_NO_PUT_HOLE);
        else
 -              retval = ext4_ind_map_blocks(NULL, inode, map, 0);
 +              retval = ext4_ind_map_blocks(NULL, inode, map,
 +                                           EXT4_GET_BLOCKS_NO_PUT_HOLE);
  
 +add_delayed:
        if (retval == 0) {
 +              int ret;
                /*
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
                /* If the block was allocated from previously allocated cluster,
                 * then we dont need to reserve it again. */
                if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
 -                      retval = ext4_da_reserve_space(inode, iblock);
 -                      if (retval)
 +                      ret = ext4_da_reserve_space(inode, iblock);
 +                      if (ret) {
                                /* not enough space to reserve */
 +                              retval = ret;
                                goto out_unlock;
 +                      }
                }
  
 -              retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
 -              if (retval)
 +              ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 +                                          ~0, EXTENT_STATUS_DELAYED);
 +              if (ret) {
 +                      retval = ret;
                        goto out_unlock;
 +              }
  
                /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
                 * and it should not appear on the bh->b_state.
                map_bh(bh, inode->i_sb, invalid_block);
                set_buffer_new(bh);
                set_buffer_delay(bh);
 +      } else if (retval > 0) {
 +              int ret;
 +              unsigned long long status;
 +
 +              status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 +                              EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 +              ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 +                                          map->m_pblk, status);
 +              if (ret != 0)
 +                      retval = ret;
        }
  
  out_unlock:
@@@ -1936,6 -1890,27 +1936,6 @@@ int ext4_da_get_block_prep(struct inod
        return 0;
  }
  
 -/*
 - * This function is used as a standard get_block_t calback function
 - * when there is no desire to allocate any blocks.  It is used as a
 - * callback function for block_write_begin() and block_write_full_page().
 - * These functions should only try to map a single block at a time.
 - *
 - * Since this function doesn't do block allocations even if the caller
 - * requests it by passing in create=1, it is critically important that
 - * any caller checks to make sure that any buffer heads are returned
 - * by this function are either all already mapped or marked for
 - * delayed allocation before calling  block_write_full_page().  Otherwise,
 - * b_blocknr could be left unitialized, and the page write functions will
 - * be taken by surprise.
 - */
 -static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 -                                 struct buffer_head *bh_result, int create)
 -{
 -      BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
 -      return _ext4_get_block(inode, iblock, bh_result, 0);
 -}
 -
  static int bget_one(handle_t *handle, struct buffer_head *bh)
  {
        get_bh(bh);
@@@ -1980,8 -1955,7 +1980,8 @@@ static int __ext4_journalled_writepage(
         * references to buffers so we are safe */
        unlock_page(page);
  
 -      handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 +      handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 +                                  ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
  static int ext4_writepage(struct page *page,
                          struct writeback_control *wbc)
  {
 -      int ret = 0, commit_write = 0;
 +      int ret = 0;
        loff_t size;
        unsigned int len;
        struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
 +      struct ext4_io_submit io_submit;
  
        trace_ext4_writepage(page);
        size = i_size_read(inode);
        else
                len = PAGE_CACHE_SIZE;
  
 +      page_bufs = page_buffers(page);
        /*
 -       * If the page does not have buffers (for whatever reason),
 -       * try to create them using __block_write_begin.  If this
 -       * fails, redirty the page and move on.
 +       * We cannot do block allocation or other extent handling in this
 +       * function. If there are buffers needing that, we have to redirty
 +       * the page. But we may reach here when we do a journal commit via
 +       * journal_submit_inode_data_buffers() and in that case we must write
 +       * allocated buffers to achieve data=ordered mode guarantees.
         */
 -      if (!page_has_buffers(page)) {
 -              if (__block_write_begin(page, 0, len,
 -                                      noalloc_get_block_write)) {
 -              redirty_page:
 -                      redirty_page_for_writepage(wbc, page);
 +      if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 +                                 ext4_bh_delay_or_unwritten)) {
 +              redirty_page_for_writepage(wbc, page);
 +              if (current->flags & PF_MEMALLOC) {
 +                      /*
 +                       * For memory cleaning there's no point in writing only
 +                       * some buffers. So just bail out. Warn if we came here
 +                       * from direct reclaim.
 +                       */
 +                      WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
 +                                                      == PF_MEMALLOC);
                        unlock_page(page);
                        return 0;
                }
 -              commit_write = 1;
 -      }
 -      page_bufs = page_buffers(page);
 -      if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 -                                 ext4_bh_delay_or_unwritten)) {
 -              /*
 -               * We don't want to do block allocation, so redirty
 -               * the page and return.  We may reach here when we do
 -               * a journal commit via journal_submit_inode_data_buffers.
 -               * We can also reach here via shrink_page_list but it
 -               * should never be for direct reclaim so warn if that
 -               * happens
 -               */
 -              WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
 -                                                              PF_MEMALLOC);
 -              goto redirty_page;
        }
 -      if (commit_write)
 -              /* now mark the buffer_heads as dirty and uptodate */
 -              block_commit_write(page, 0, len);
  
        if (PageChecked(page) && ext4_should_journal_data(inode))
                /*
                 */
                return __ext4_journalled_writepage(page, len);
  
 -      if (buffer_uninit(page_bufs)) {
 -              ext4_set_bh_endio(page_bufs, inode);
 -              ret = block_write_full_page_endio(page, noalloc_get_block_write,
 -                                          wbc, ext4_end_io_buffer_write);
 -      } else
 -              ret = block_write_full_page(page, noalloc_get_block_write,
 -                                          wbc);
 -
 +      memset(&io_submit, 0, sizeof(io_submit));
 +      ret = ext4_bio_write_page(&io_submit, page, len, wbc);
 +      ext4_io_submit(&io_submit);
        return ret;
  }
  
@@@ -2240,38 -2228,51 +2240,38 @@@ static int write_cache_pages_da(handle_
                        logical = (sector_t) page->index <<
                                (PAGE_CACHE_SHIFT - inode->i_blkbits);
  
 -                      if (!page_has_buffers(page)) {
 -                              mpage_add_bh_to_extent(mpd, logical,
 -                                                     PAGE_CACHE_SIZE,
 -                                                     (1 << BH_Dirty) | (1 << BH_Uptodate));
 -                              if (mpd->io_done)
 -                                      goto ret_extent_tail;
 -                      } else {
 +                      /* Add all dirty buffers to mpd */
 +                      head = page_buffers(page);
 +                      bh = head;
 +                      do {
 +                              BUG_ON(buffer_locked(bh));
                                /*
 -                               * Page with regular buffer heads,
 -                               * just add all dirty ones
 +                               * We need to try to allocate unmapped blocks
 +                               * in the same page.  Otherwise we won't make
 +                               * progress with the page in ext4_writepage
                                 */
 -                              head = page_buffers(page);
 -                              bh = head;
 -                              do {
 -                                      BUG_ON(buffer_locked(bh));
 +                              if (ext4_bh_delay_or_unwritten(NULL, bh)) {
 +                                      mpage_add_bh_to_extent(mpd, logical,
 +                                                             bh->b_state);
 +                                      if (mpd->io_done)
 +                                              goto ret_extent_tail;
 +                              } else if (buffer_dirty(bh) &&
 +                                         buffer_mapped(bh)) {
                                        /*
 -                                       * We need to try to allocate
 -                                       * unmapped blocks in the same page.
 -                                       * Otherwise we won't make progress
 -                                       * with the page in ext4_writepage
 +                                       * mapped dirty buffer. We need to
 +                                       * update the b_state because we look
 +                                       * at b_state in mpage_da_map_blocks.
 +                                       * We don't update b_size because if we
 +                                       * find an unmapped buffer_head later
 +                                       * we need to use the b_state flag of
 +                                       * that buffer_head.
                                         */
 -                                      if (ext4_bh_delay_or_unwritten(NULL, bh)) {
 -                                              mpage_add_bh_to_extent(mpd, logical,
 -                                                                     bh->b_size,
 -                                                                     bh->b_state);
 -                                              if (mpd->io_done)
 -                                                      goto ret_extent_tail;
 -                                      } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
 -                                              /*
 -                                               * mapped dirty buffer. We need
 -                                               * to update the b_state
 -                                               * because we look at b_state
 -                                               * in mpage_da_map_blocks.  We
 -                                               * don't update b_size because
 -                                               * if we find an unmapped
 -                                               * buffer_head later we need to
 -                                               * use the b_state flag of that
 -                                               * buffer_head.
 -                                               */
 -                                              if (mpd->b_size == 0)
 -                                                      mpd->b_state = bh->b_state & BH_FLAGS;
 -                                      }
 -                                      logical++;
 -                              } while ((bh = bh->b_this_page) != head);
 -                      }
 +                                      if (mpd->b_size == 0)
 +                                              mpd->b_state =
 +                                                      bh->b_state & BH_FLAGS;
 +                              }
 +                              logical++;
 +                      } while ((bh = bh->b_this_page) != head);
  
                        if (nr_to_write > 0) {
                                nr_to_write--;
@@@ -2412,8 -2413,7 +2412,8 @@@ retry
                needed_blocks = ext4_da_writepages_trans_blocks(inode);
  
                /* start a new transaction*/
 -              handle = ext4_journal_start(inode, needed_blocks);
 +              handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 +                                          needed_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@@ -2512,12 -2512,8 +2512,8 @@@ static int ext4_nonda_switch(struct sup
        /*
         * Start pushing delalloc when 1/2 of free blocks are dirty.
         */
-       if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
-           !writeback_in_progress(sb->s_bdi) &&
-           down_read_trylock(&sb->s_umount)) {
-               writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
-               up_read(&sb->s_umount);
-       }
+       if (dirty_blocks && (free_blocks < 2 * dirty_blocks))
+               try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
  
        if (2 * free_blocks < 3 * dirty_blocks ||
                free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
@@@ -2555,52 -2551,42 +2551,52 @@@ static int ext4_da_write_begin(struct f
                                                      pos, len, flags,
                                                      pagep, fsdata);
                if (ret < 0)
 -                      goto out;
 -              if (ret == 1) {
 -                      ret = 0;
 -                      goto out;
 -              }
 +                      return ret;
 +              if (ret == 1)
 +                      return 0;
        }
  
 -retry:
 +      /*
 +       * grab_cache_page_write_begin() can take a long time if the
 +       * system is thrashing due to memory pressure, or if the page
 +       * is being written back.  So grab it first before we start
 +       * the transaction handle.  This also allows us to allocate
 +       * the page (if needed) without using GFP_NOFS.
 +       */
 +retry_grab:
 +      page = grab_cache_page_write_begin(mapping, index, flags);
 +      if (!page)
 +              return -ENOMEM;
 +      unlock_page(page);
 +
        /*
         * With delayed allocation, we don't log the i_disksize update
         * if there is delayed block allocation. But we still need
         * to journalling the i_disksize update if writes to the end
         * of file which has an already mapped buffer.
         */
 -      handle = ext4_journal_start(inode, 1);
 +retry_journal:
 +      handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
        if (IS_ERR(handle)) {
 -              ret = PTR_ERR(handle);
 -              goto out;
 +              page_cache_release(page);
 +              return PTR_ERR(handle);
        }
 -      /* We cannot recurse into the filesystem as the transaction is already
 -       * started */
 -      flags |= AOP_FLAG_NOFS;
  
 -      page = grab_cache_page_write_begin(mapping, index, flags);
 -      if (!page) {
 +      lock_page(page);
 +      if (page->mapping != mapping) {
 +              /* The page got truncated from under us */
 +              unlock_page(page);
 +              page_cache_release(page);
                ext4_journal_stop(handle);
 -              ret = -ENOMEM;
 -              goto out;
 +              goto retry_grab;
        }
 -      *pagep = page;
 +      /* In case writeback began while the page was unlocked */
 +      wait_on_page_writeback(page);
  
        ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
        if (ret < 0) {
                unlock_page(page);
                ext4_journal_stop(handle);
 -              page_cache_release(page);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 */
                if (pos + len > inode->i_size)
                        ext4_truncate_failed_write(inode);
 +
 +              if (ret == -ENOSPC &&
 +                  ext4_should_retry_alloc(inode->i_sb, &retries))
 +                      goto retry_journal;
 +
 +              page_cache_release(page);
 +              return ret;
        }
  
 -      if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 -              goto retry;
 -out:
 +      *pagep = page;
        return ret;
  }
  
@@@ -2873,10 -2854,36 +2869,10 @@@ ext4_readpages(struct file *file, struc
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
  }
  
 -static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 -{
 -      struct buffer_head *head, *bh;
 -      unsigned int curr_off = 0;
 -
 -      if (!page_has_buffers(page))
 -              return;
 -      head = bh = page_buffers(page);
 -      do {
 -              if (offset <= curr_off && test_clear_buffer_uninit(bh)
 -                                      && bh->b_private) {
 -                      ext4_free_io_end(bh->b_private);
 -                      bh->b_private = NULL;
 -                      bh->b_end_io = NULL;
 -              }
 -              curr_off = curr_off + bh->b_size;
 -              bh = bh->b_this_page;
 -      } while (bh != head);
 -}
 -
  static void ext4_invalidatepage(struct page *page, unsigned long offset)
  {
        trace_ext4_invalidatepage(page, offset);
  
 -      /*
 -       * free any io_end structure allocated for buffers to be discarded
 -       */
 -      if (ext4_should_dioread_nolock(page->mapping->host))
 -              ext4_invalidatepage_free_endio(page, offset);
 -
        /* No journalling happens on data buffers when this function is used */
        WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
  
@@@ -2948,7 -2955,7 +2944,7 @@@ static void ext4_end_io_dio(struct kioc
                            ssize_t size, void *private, int ret,
                            bool is_async)
  {
 -      struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 +      struct inode *inode = file_inode(iocb->ki_filp);
          ext4_io_end_t *io_end = iocb->private;
  
        /* if not async direct IO or dio with 0 bytes write, just return */
        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                ext4_free_io_end(io_end);
  out:
 +              inode_dio_done(inode);
                if (is_async)
                        aio_complete(iocb, ret, 0);
 -              inode_dio_done(inode);
                return;
        }
  
        ext4_add_complete_io(io_end);
  }
  
 -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 -{
 -      ext4_io_end_t *io_end = bh->b_private;
 -      struct inode *inode;
 -
 -      if (!test_clear_buffer_uninit(bh) || !io_end)
 -              goto out;
 -
 -      if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
 -              ext4_msg(io_end->inode->i_sb, KERN_INFO,
 -                       "sb umounted, discard end_io request for inode %lu",
 -                       io_end->inode->i_ino);
 -              ext4_free_io_end(io_end);
 -              goto out;
 -      }
 -
 -      /*
 -       * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
 -       * but being more careful is always safe for the future change.
 -       */
 -      inode = io_end->inode;
 -      ext4_set_io_unwritten_flag(inode, io_end);
 -      ext4_add_complete_io(io_end);
 -out:
 -      bh->b_private = NULL;
 -      bh->b_end_io = NULL;
 -      clear_buffer_uninit(bh);
 -      end_buffer_async_write(bh, uptodate);
 -}
 -
 -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
 -{
 -      ext4_io_end_t *io_end;
 -      struct page *page = bh->b_page;
 -      loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
 -      size_t size = bh->b_size;
 -
 -retry:
 -      io_end = ext4_init_io_end(inode, GFP_ATOMIC);
 -      if (!io_end) {
 -              pr_warn_ratelimited("%s: allocation fail\n", __func__);
 -              schedule();
 -              goto retry;
 -      }
 -      io_end->offset = offset;
 -      io_end->size = size;
 -      /*
 -       * We need to hold a reference to the page to make sure it
 -       * doesn't get evicted before ext4_end_io_work() has a chance
 -       * to convert the extent from written to unwritten.
 -       */
 -      io_end->page = page;
 -      get_page(io_end->page);
 -
 -      bh->b_private = io_end;
 -      bh->b_end_io = ext4_end_io_buffer_write;
 -      return 0;
 -}
 -
  /*
   * For ext4 extent files, ext4 will do direct-io write to holes,
   * preallocated extents, and those write extend the file, no need to
@@@ -3483,20 -3549,20 +3479,20 @@@ int ext4_can_truncate(struct inode *ino
  
  int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
  {
 -      struct inode *inode = file->f_path.dentry->d_inode;
 +      struct inode *inode = file_inode(file);
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
  
 -      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 -              /* TODO: Add support for non extent hole punching */
 -              return -EOPNOTSUPP;
 -      }
 +      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 +              return ext4_ind_punch_hole(file, offset, length);
  
        if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
                /* TODO: Add support for bigalloc file systems */
                return -EOPNOTSUPP;
        }
  
 +      trace_ext4_punch_hole(inode, offset, length);
 +
        return ext4_ext_punch_hole(file, offset, length);
  }
  
@@@ -3590,8 -3656,11 +3586,8 @@@ static int __ext4_get_inode_loc(struct 
        iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
  
        bh = sb_getblk(sb, block);
 -      if (!bh) {
 -              EXT4_ERROR_INODE_BLOCK(inode, block,
 -                                     "unable to read itable block");
 -              return -EIO;
 -      }
 +      if (unlikely(!bh))
 +              return -ENOMEM;
        if (!buffer_uptodate(bh)) {
                lock_buffer(bh);
  
  
                        /* Is the inode bitmap in cache? */
                        bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
 -                      if (!bitmap_bh)
 +                      if (unlikely(!bitmap_bh))
                                goto make_io;
  
                        /*
@@@ -4331,9 -4400,8 +4327,9 @@@ int ext4_setattr(struct dentry *dentry
  
                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
 -              handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
 -                                      EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
 +              handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
 +                      (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
 +                       EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
            (attr->ia_size < inode->i_size)) {
                handle_t *handle;
  
 -              handle = ext4_journal_start(inode, 3);
 +              handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                                                            attr->ia_size);
                        if (error) {
                                /* Do as much error cleanup as possible */
 -                              handle = ext4_journal_start(inode, 3);
 +                              handle = ext4_journal_start(inode,
 +                                                          EXT4_HT_INODE, 3);
                                if (IS_ERR(handle)) {
                                        ext4_orphan_del(NULL, inode);
                                        goto err_out;
@@@ -4730,7 -4797,7 +4726,7 @@@ void ext4_dirty_inode(struct inode *ino
  {
        handle_t *handle;
  
 -      handle = ext4_journal_start(inode, 2);
 +      handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle))
                goto out;
  
@@@ -4831,7 -4898,7 +4827,7 @@@ int ext4_change_inode_journal_flag(stru
  
        /* Finally we can mark the inode as dirty. */
  
 -      handle = ext4_journal_start(inode, 1);
 +      handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);
  
@@@ -4855,7 -4922,7 +4851,7 @@@ int ext4_page_mkwrite(struct vm_area_st
        unsigned long len;
        int ret;
        struct file *file = vma->vm_file;
 -      struct inode *inode = file->f_path.dentry->d_inode;
 +      struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        handle_t *handle;
        get_block_t *get_block;
                                            0, len, NULL,
                                            ext4_bh_unmapped)) {
                        /* Wait so that we don't change page under IO */
 -                      wait_on_page_writeback(page);
 +                      wait_for_stable_page(page);
                        ret = VM_FAULT_LOCKED;
                        goto out;
                }
        else
                get_block = ext4_get_block;
  retry_alloc:
 -      handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 +      handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 +                                  ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
diff --combined fs/fs-writeback.c
index 359494ea1bde2a4df306004155cd31cdd6e89252,ad3cc46a743aa8c1b347af78a2615f7868659feb..21f46fb3a10193a966c18879c5bea794fd9a494e
@@@ -318,14 -318,8 +318,14 @@@ static void queue_io(struct bdi_writeba
  
  static int write_inode(struct inode *inode, struct writeback_control *wbc)
  {
 -      if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
 -              return inode->i_sb->s_op->write_inode(inode, wbc);
 +      int ret;
 +
 +      if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
 +              trace_writeback_write_inode_start(inode, wbc);
 +              ret = inode->i_sb->s_op->write_inode(inode, wbc);
 +              trace_writeback_write_inode(inode, wbc);
 +              return ret;
 +      }
        return 0;
  }
  
@@@ -456,8 -450,6 +456,8 @@@ __writeback_single_inode(struct inode *
  
        WARN_ON(!(inode->i_state & I_SYNC));
  
 +      trace_writeback_single_inode_start(inode, wbc, nr_to_write);
 +
        ret = do_writepages(mapping, wbc);
  
        /*
@@@ -1158,12 -1150,8 +1158,12 @@@ void __mark_inode_dirty(struct inode *i
         * dirty the inode itself
         */
        if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 +              trace_writeback_dirty_inode_start(inode, flags);
 +
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode, flags);
 +
 +              trace_writeback_dirty_inode(inode, flags);
        }
  
        /*
@@@ -1344,47 -1332,43 +1344,43 @@@ void writeback_inodes_sb(struct super_b
  EXPORT_SYMBOL(writeback_inodes_sb);
  
  /**
-  * writeback_inodes_sb_if_idle        -       start writeback if none underway
+  * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
   * @sb: the superblock
-  * @reason: reason why some writeback work was initiated
+  * @nr: the number of pages to write
+  * @reason: the reason of writeback
   *
-  * Invoke writeback_inodes_sb if no writeback is currently underway.
+  * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
   * Returns 1 if writeback was started, 0 if not.
   */
- int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
+ int try_to_writeback_inodes_sb_nr(struct super_block *sb,
+                                 unsigned long nr,
+                                 enum wb_reason reason)
  {
-       if (!writeback_in_progress(sb->s_bdi)) {
-               down_read(&sb->s_umount);
-               writeback_inodes_sb(sb, reason);
-               up_read(&sb->s_umount);
+       if (writeback_in_progress(sb->s_bdi))
                return 1;
-       } else
+       if (!down_read_trylock(&sb->s_umount))
                return 0;
+       writeback_inodes_sb_nr(sb, nr, reason);
+       up_read(&sb->s_umount);
+       return 1;
  }
- EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
  
  /**
-  * writeback_inodes_sb_nr_if_idle     -       start writeback if none underway
+  * try_to_writeback_inodes_sb - try to start writeback if none underway
   * @sb: the superblock
-  * @nr: the number of pages to write
   * @reason: reason why some writeback work was initiated
   *
-  * Invoke writeback_inodes_sb if no writeback is currently underway.
+  * Implement by try_to_writeback_inodes_sb_nr()
   * Returns 1 if writeback was started, 0 if not.
   */
- int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
-                                  unsigned long nr,
-                                  enum wb_reason reason)
+ int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
  {
-       if (!writeback_in_progress(sb->s_bdi)) {
-               down_read(&sb->s_umount);
-               writeback_inodes_sb_nr(sb, nr, reason);
-               up_read(&sb->s_umount);
-               return 1;
-       } else
-               return 0;
+       return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
  }
- EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  
  /**
   * sync_inodes_sb     -       sync sb inode pages
diff --combined mm/page-writeback.c
index 742c40583159cba7736d87c4ec3bc5a7250613d5,1534ebd6e70f5d6ff5ca59a83f7ac0498b889976..efe68148f621959beb28987dd9430fff289f583f
@@@ -35,7 -35,6 +35,7 @@@
  #include <linux/buffer_head.h> /* __set_page_dirty_buffers */
  #include <linux/pagevec.h>
  #include <linux/timer.h>
 +#include <linux/sched/rt.h>
  #include <trace/events/writeback.h>
  
  /*
@@@ -241,9 -240,6 +241,9 @@@ static unsigned long global_dirtyable_m
        if (!vm_highmem_is_dirtyable)
                x -= highmem_dirtyable_memory(x);
  
 +      /* Subtract min_free_kbytes */
 +      x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
 +
        return x + 1;   /* Ensure that we never return 0 */
  }
  
@@@ -696,7 -692,7 +696,7 @@@ static unsigned long bdi_position_ratio
         *     => fast response on large errors; small oscillation near setpoint
         */
        setpoint = (freerun + limit) / 2;
-       x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
+       x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
                    limit - setpoint + 1);
        pos_ratio = x;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@@ -1986,8 -1982,6 +1986,8 @@@ int __set_page_dirty_no_writeback(struc
   */
  void account_page_dirtied(struct page *page, struct address_space *mapping)
  {
 +      trace_writeback_dirty_page(page, mapping);
 +
        if (mapping_cap_account_dirty(mapping)) {
                __inc_zone_page_state(page, NR_FILE_DIRTY);
                __inc_zone_page_state(page, NR_DIRTIED);
@@@ -2295,27 -2289,3 +2295,27 @@@ int mapping_tagged(struct address_spac
        return radix_tree_tagged(&mapping->page_tree, tag);
  }
  EXPORT_SYMBOL(mapping_tagged);
 +
 +/**
 + * wait_for_stable_page() - wait for writeback to finish, if necessary.
 + * @page:     The page to wait on.
 + *
 + * This function determines if the given page is related to a backing device
 + * that requires page contents to be held stable during writeback.  If so, then
 + * it will wait for any pending writeback to complete.
 + */
 +void wait_for_stable_page(struct page *page)
 +{
 +      struct address_space *mapping = page_mapping(page);
 +      struct backing_dev_info *bdi = mapping->backing_dev_info;
 +
 +      if (!bdi_cap_stable_pages_required(bdi))
 +              return;
 +#ifdef CONFIG_NEED_BOUNCE_POOL
 +      if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
 +              return;
 +#endif /* CONFIG_NEED_BOUNCE_POOL */
 +
 +      wait_on_page_writeback(page);
 +}
 +EXPORT_SYMBOL_GPL(wait_for_stable_page);
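
For context on the second fix, a sketch of the caller side, mirroring the
ext4_nonda_switch() hunk above (maybe_flush_delalloc and its parameters are
invented for illustration):

    #include <linux/fs.h>
    #include <linux/writeback.h>

    /* Unlike the old writeback_inodes_sb_if_idle(), the renamed helper
     * only trylocks s_umount: if the lock is contended it returns 0 and
     * the caller simply moves on, instead of sleeping in down_read()
     * and risking a deadlock against umount. */
    static void maybe_flush_delalloc(struct super_block *sb,
                                     u64 free_blocks, u64 dirty_blocks)
    {
            if (dirty_blocks && free_blocks < 2 * dirty_blocks)
                    try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
    }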