git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge tag 'writeback-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 28 Feb 2013 21:21:44 +0000 (13:21 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 28 Feb 2013 21:21:44 +0000 (13:21 -0800)
Pull writeback fixes from Wu Fengguang:
 "Two writeback fixes

   - fix negative (setpoint - dirty) in 32bit archs

   - use down_read_trylock() in writeback_inodes_sb(_nr)_if_idle()"

* tag 'writeback-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/wfg/linux:
  Negative (setpoint-dirty) in bdi_position_ratio()
  vfs: re-implement writeback_inodes_sb(_nr)_if_idle() and rename them
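
For context on the first fix: on 32-bit architectures, setpoint and dirty
are 32-bit unsigned longs, so setpoint - dirty wraps to a huge positive
value whenever dirty exceeds setpoint, and bdi_position_ratio() then
computes a wildly wrong ratio.  A minimal standalone sketch of the bug and
the fix (uint32_t stands in for a 32-bit unsigned long; this is a demo, not
the kernel code itself):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Model a 32-bit unsigned long, as on i386/ARM. */
            uint32_t setpoint = 100, dirty = 150;

            /* Unsigned subtraction wraps within 32 bits and only then
             * widens to s64: 4294967246 instead of -50. */
            int64_t wrong = (int64_t)(setpoint - dirty);

            /* The fix widens each operand to a signed 64-bit type
             * before subtracting, as the div_s64() call in
             * bdi_position_ratio() now does. */
            int64_t right = (int64_t)setpoint - (int64_t)dirty;

            printf("wrong=%lld right=%lld\n",
                   (long long)wrong, (long long)right);
            return 0;
    }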

fs/btrfs/extent-tree.c
fs/ext4/inode.c
fs/fs-writeback.c
mm/page-writeback.c

diff --combined fs/btrfs/extent-tree.c
index 1e59ed575cc991819a980b1ea6facd9d8a683995,f31abb14e06f171054f182864acb1dabf6fd1335..cf54bdfee334287383e7b63badc5a3683e2edf4f
@@@ -3689,20 -3689,6 +3689,6 @@@ static int can_overcommit(struct btrfs_
        return 0;
  }
  
- static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
-                                              unsigned long nr_pages,
-                                              enum wb_reason reason)
- {
-       if (!writeback_in_progress(sb->s_bdi) &&
-           down_read_trylock(&sb->s_umount)) {
-               writeback_inodes_sb_nr(sb, nr_pages, reason);
-               up_read(&sb->s_umount);
-               return 1;
-       }
-       return 0;
- }
  /*
   * shrink metadata reservation for delalloc
   */
@@@ -3735,9 -3721,9 +3721,9 @@@ static void shrink_delalloc(struct btrf
        while (delalloc_bytes && loops < 3) {
                max_reclaim = min(delalloc_bytes, to_reclaim);
                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-               writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
-                                                   nr_pages,
-                                                   WB_REASON_FS_FREE_SPACE);
+               try_to_writeback_inodes_sb_nr(root->fs_info->sb,
+                                             nr_pages,
+                                             WB_REASON_FS_FREE_SPACE);
  
                /*
                 * We need to wait for the async pages to actually start before
@@@ -3997,7 -3983,7 +3983,7 @@@ again
         * We make the other tasks wait for the flush only when we can flush
         * all things.
         */
 -      if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
 +      if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                flushing = true;
                space_info->flush = 1;
        }
@@@ -4534,7 -4520,7 +4520,7 @@@ int btrfs_delalloc_reserve_metadata(str
        unsigned nr_extents = 0;
        int extra_reserve = 0;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 -      int ret;
 +      int ret = 0;
        bool delalloc_lock = true;
  
        /* If we are a free space inode we need to not flush since we will be in
        csum_bytes = BTRFS_I(inode)->csum_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
  
 -      if (root->fs_info->quota_enabled) {
 +      if (root->fs_info->quota_enabled)
                ret = btrfs_qgroup_reserve(root, num_bytes +
                                           nr_extents * root->leafsize);
 -              if (ret) {
 -                      spin_lock(&BTRFS_I(inode)->lock);
 -                      calc_csum_metadata_size(inode, num_bytes, 0);
 -                      spin_unlock(&BTRFS_I(inode)->lock);
 -                      if (delalloc_lock)
 -                              mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 -                      return ret;
 -              }
 -      }
  
 -      ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 +      /*
 +       * ret != 0 here means the qgroup reservation failed, we go straight to
 +       * the shared error handling then.
 +       */
 +      if (ret == 0)
 +              ret = reserve_metadata_bytes(root, block_rsv,
 +                                           to_reserve, flush);
 +
        if (ret) {
                u64 to_free = 0;
                unsigned dropped;
@@@ -5558,7 -5546,7 +5544,7 @@@ static noinline int find_free_extent(st
        int empty_cluster = 2 * 1024 * 1024;
        struct btrfs_space_info *space_info;
        int loop = 0;
 -      int index = 0;
 +      int index = __get_raid_index(data);
        int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
        bool found_uncached_bg = false;
@@@ -6522,7 -6510,7 +6508,7 @@@ reada
  }
  
  /*
 - * hepler to process tree block while walking down the tree.
 + * helper to process tree block while walking down the tree.
   *
   * when wc->stage == UPDATE_BACKREF, this function updates
   * back refs for pointers in the block.
@@@ -6597,7 -6585,7 +6583,7 @@@ static noinline int walk_down_proc(stru
  }
  
  /*
 - * hepler to process tree block pointer.
 + * helper to process tree block pointer.
   *
   * when wc->stage == DROP_REFERENCE, this function checks
   * reference count of the block pointed to. if the block
@@@ -6735,7 -6723,7 +6721,7 @@@ skip
  }
  
  /*
 - * hepler to process tree block while walking up the tree.
 + * helper to process tree block while walking up the tree.
   *
   * when wc->stage == DROP_REFERENCE, this function drops
   * reference count on the block.
@@@ -6786,13 -6774,11 +6772,13 @@@ static noinline int walk_up_proc(struc
                                                       &wc->flags[level]);
                        if (ret < 0) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
 +                              path->locks[level] = 0;
                                return ret;
                        }
                        BUG_ON(wc->refs[level] == 0);
                        if (wc->refs[level] == 1) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
 +                              path->locks[level] = 0;
                                return 1;
                        }
                }
diff --combined fs/ext4/inode.c
index 9c4f4b1c97f84aad654dd8f70cf5231c78a6fc52,5f6eef71ff214af0d29a40c2a276eaab41c08533..9ea0cde3fa9e0ffe7aebc28940293c422ae75a63
@@@ -132,6 -132,10 +132,6 @@@ static inline int ext4_begin_ordered_tr
  }
  
  static void ext4_invalidatepage(struct page *page, unsigned long offset);
 -static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 -                                 struct buffer_head *bh_result, int create);
 -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
 -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
  static int __ext4_journalled_writepage(struct page *page, unsigned int len);
  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
  static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
@@@ -234,8 -238,7 +234,8 @@@ void ext4_evict_inode(struct inode *ino
         * protection against it
         */
        sb_start_intwrite(inode->i_sb);
 -      handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
 +      handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
 +                                  ext4_blocks_for_truncate(inode)+3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
@@@ -343,7 -346,7 +343,7 @@@ void ext4_da_update_reserve_space(struc
        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_update_reserve_space(inode, used, quota_claim);
        if (unlikely(used > ei->i_reserved_data_blocks)) {
 -              ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
 +              ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks",
                         __func__, inode->i_ino, used,
                         ei->i_reserved_data_blocks);
        }
  
        if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
 -              ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
 -                       "with only %d reserved metadata blocks\n", __func__,
 -                       inode->i_ino, ei->i_allocated_meta_blocks,
 -                       ei->i_reserved_meta_blocks);
 +              ext4_warning(inode->i_sb, "ino %lu, allocated %d "
 +                      "with only %d reserved metadata blocks "
 +                      "(releasing %d blocks with reserved %d data blocks)",
 +                      inode->i_ino, ei->i_allocated_meta_blocks,
 +                           ei->i_reserved_meta_blocks, used,
 +                           ei->i_reserved_data_blocks);
                WARN_ON(1);
                ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
        }
@@@ -507,33 -508,12 +507,33 @@@ static pgoff_t ext4_num_dirty_pages(str
  int ext4_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags)
  {
 +      struct extent_status es;
        int retval;
  
        map->m_flags = 0;
        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
                  (unsigned long) map->m_lblk);
 +
 +      /* Lookup extent status tree firstly */
 +      if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
 +              if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
 +                      map->m_pblk = ext4_es_pblock(&es) +
 +                                      map->m_lblk - es.es_lblk;
 +                      map->m_flags |= ext4_es_is_written(&es) ?
 +                                      EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
 +                      retval = es.es_len - (map->m_lblk - es.es_lblk);
 +                      if (retval > map->m_len)
 +                              retval = map->m_len;
 +                      map->m_len = retval;
 +              } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
 +                      retval = 0;
 +              } else {
 +                      BUG_ON(1);
 +              }
 +              goto found;
 +      }
 +
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
                retval = ext4_ind_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
        }
 +      if (retval > 0) {
 +              int ret;
 +              unsigned long long status;
 +
 +              status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 +                              EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 +              if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
 +                  ext4_find_delalloc_range(inode, map->m_lblk,
 +                                           map->m_lblk + map->m_len - 1))
 +                      status |= EXTENT_STATUS_DELAYED;
 +              ret = ext4_es_insert_extent(inode, map->m_lblk,
 +                                          map->m_len, map->m_pblk, status);
 +              if (ret < 0)
 +                      retval = ret;
 +      }
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
                up_read((&EXT4_I(inode)->i_data_sem));
  
 +found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 -              int ret;
 -              if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 -                      /* delayed alloc may be allocated by fallocate and
 -                       * coverted to initialized by directIO.
 -                       * we need to handle delayed extent here.
 -                       */
 -                      down_write((&EXT4_I(inode)->i_data_sem));
 -                      goto delayed_mapped;
 -              }
 -              ret = check_block_validity(inode, map);
 +              int ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
                return retval;
  
        /*
 -       * When we call get_blocks without the create flag, the
 -       * BH_Unwritten flag could have gotten set if the blocks
 -       * requested were part of a uninitialized extent.  We need to
 -       * clear this flag now that we are committed to convert all or
 -       * part of the uninitialized extent to be an initialized
 -       * extent.  This is because we need to avoid the combination
 -       * of BH_Unwritten and BH_Mapped flags being simultaneously
 -       * set on the buffer_head.
 +       * Here we clear m_flags because after allocating an new extent,
 +       * it will be set again.
         */
 -      map->m_flags &= ~EXT4_MAP_UNWRITTEN;
 +      map->m_flags &= ~EXT4_MAP_FLAGS;
  
        /*
         * New blocks allocate and/or writing to uninitialized extent
                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
                        ext4_da_update_reserve_space(inode, retval, 1);
        }
 -      if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 +      if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
  
 -              if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 -                      int ret;
 -delayed_mapped:
 -                      /* delayed allocation blocks has been allocated */
 -                      ret = ext4_es_remove_extent(inode, map->m_lblk,
 -                                                  map->m_len);
 -                      if (ret < 0)
 -                              retval = ret;
 -              }
 +      if (retval > 0) {
 +              int ret;
 +              unsigned long long status;
 +
 +              status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 +                              EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 +              if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
 +                  ext4_find_delalloc_range(inode, map->m_lblk,
 +                                           map->m_lblk + map->m_len - 1))
 +                      status |= EXTENT_STATUS_DELAYED;
 +              ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 +                                          map->m_pblk, status);
 +              if (ret < 0)
 +                      retval = ret;
        }
  
        up_write((&EXT4_I(inode)->i_data_sem));
@@@ -686,8 -660,7 +686,8 @@@ static int _ext4_get_block(struct inod
                if (map.m_len > DIO_MAX_BLOCKS)
                        map.m_len = DIO_MAX_BLOCKS;
                dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
 -              handle = ext4_journal_start(inode, dio_credits);
 +              handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
 +                                          dio_credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        return ret;
@@@ -734,16 -707,14 +734,16 @@@ struct buffer_head *ext4_getblk(handle_
        /* ensure we send some value back into *errp */
        *errp = 0;
  
 +      if (create && err == 0)
 +              err = -ENOSPC;  /* should never happen */
        if (err < 0)
                *errp = err;
        if (err <= 0)
                return NULL;
  
        bh = sb_getblk(inode->i_sb, map.m_pblk);
 -      if (!bh) {
 -              *errp = -EIO;
 +      if (unlikely(!bh)) {
 +              *errp = -ENOMEM;
                return NULL;
        }
        if (map.m_flags & EXT4_MAP_NEW) {
@@@ -837,10 -808,11 +837,10 @@@ int ext4_walk_page_buffers(handle_t *ha
   * and the commit_write().  So doing the jbd2_journal_start at the start of
   * prepare_write() is the right place.
   *
 - * Also, this function can nest inside ext4_writepage() ->
 - * block_write_full_page(). In that case, we *know* that ext4_writepage()
 - * has generated enough buffer credits to do the whole page.  So we won't
 - * block on the journal in that case, which is good, because the caller may
 - * be PF_MEMALLOC.
 + * Also, this function can nest inside ext4_writepage().  In that case, we
 + * *know* that ext4_writepage() has generated enough buffer credits to do the
 + * whole page.  So we won't block on the journal in that case, which is good,
 + * because the caller may be PF_MEMALLOC.
   *
   * By accident, ext4 can be reentered when a transaction is open via
   * quota file writes.  If we were to commit the transaction while thus
@@@ -906,40 -878,32 +906,40 @@@ static int ext4_write_begin(struct fil
                ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
                                                    flags, pagep);
                if (ret < 0)
 -                      goto out;
 -              if (ret == 1) {
 -                      ret = 0;
 -                      goto out;
 -              }
 +                      return ret;
 +              if (ret == 1)
 +                      return 0;
        }
  
 -retry:
 -      handle = ext4_journal_start(inode, needed_blocks);
 +      /*
 +       * grab_cache_page_write_begin() can take a long time if the
 +       * system is thrashing due to memory pressure, or if the page
 +       * is being written back.  So grab it first before we start
 +       * the transaction handle.  This also allows us to allocate
 +       * the page (if needed) without using GFP_NOFS.
 +       */
 +retry_grab:
 +      page = grab_cache_page_write_begin(mapping, index, flags);
 +      if (!page)
 +              return -ENOMEM;
 +      unlock_page(page);
 +
 +retry_journal:
 +      handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
 -              ret = PTR_ERR(handle);
 -              goto out;
 +              page_cache_release(page);
 +              return PTR_ERR(handle);
        }
  
 -      /* We cannot recurse into the filesystem as the transaction is already
 -       * started */
 -      flags |= AOP_FLAG_NOFS;
 -
 -      page = grab_cache_page_write_begin(mapping, index, flags);
 -      if (!page) {
 +      lock_page(page);
 +      if (page->mapping != mapping) {
 +              /* The page got truncated from under us */
 +              unlock_page(page);
 +              page_cache_release(page);
                ext4_journal_stop(handle);
 -              ret = -ENOMEM;
 -              goto out;
 +              goto retry_grab;
        }
 -
 -      *pagep = page;
 +      wait_on_page_writeback(page);
  
        if (ext4_should_dioread_nolock(inode))
                ret = __block_write_begin(page, pos, len, ext4_get_block_write);
  
        if (ret) {
                unlock_page(page);
 -              page_cache_release(page);
                /*
                 * __block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                        if (inode->i_nlink)
                                ext4_orphan_del(NULL, inode);
                }
 -      }
  
 -      if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 -              goto retry;
 -out:
 +              if (ret == -ENOSPC &&
 +                  ext4_should_retry_alloc(inode->i_sb, &retries))
 +                      goto retry_journal;
 +              page_cache_release(page);
 +              return ret;
 +      }
 +      *pagep = page;
        return ret;
  }
  
@@@ -1294,7 -1256,7 +1294,7 @@@ static void ext4_da_release_space(struc
                 * function is called from invalidate page, it's
                 * harmless to return without any action.
                 */
 -              ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
 +              ext4_warning(inode->i_sb, "ext4_da_release_space: "
                         "ino %lu, to_free %d with only %d reserved "
                         "data blocks", inode->i_ino, to_free,
                         ei->i_reserved_data_blocks);
@@@ -1395,6 -1357,7 +1395,6 @@@ static int mpage_da_submit_io(struct mp
        loff_t size = i_size_read(inode);
        unsigned int len, block_start;
        struct buffer_head *bh, *page_bufs = NULL;
 -      int journal_data = ext4_should_journal_data(inode);
        sector_t pblock = 0, cur_logical = 0;
        struct ext4_io_submit io_submit;
  
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
 -                      int commit_write = 0, skip_page = 0;
 +                      int skip_page = 0;
                        struct page *page = pvec.pages[i];
  
                        index = page->index;
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
  
 -                      /*
 -                       * If the page does not have buffers (for
 -                       * whatever reason), try to create them using
 -                       * __block_write_begin.  If this fails,
 -                       * skip the page and move on.
 -                       */
 -                      if (!page_has_buffers(page)) {
 -                              if (__block_write_begin(page, 0, len,
 -                                              noalloc_get_block_write)) {
 -                              skip_page:
 -                                      unlock_page(page);
 -                                      continue;
 -                              }
 -                              commit_write = 1;
 -                      }
 -
                        bh = page_bufs = page_buffers(page);
                        block_start = 0;
                        do {
 -                              if (!bh)
 -                                      goto skip_page;
                                if (map && (cur_logical >= map->m_lblk) &&
                                    (cur_logical <= (map->m_lblk +
                                                     (map->m_len - 1)))) {
                                pblock++;
                        } while (bh != page_bufs);
  
 -                      if (skip_page)
 -                              goto skip_page;
 -
 -                      if (commit_write)
 -                              /* mark the buffer_heads as dirty & uptodate */
 -                              block_commit_write(page, 0, len);
 +                      if (skip_page) {
 +                              unlock_page(page);
 +                              continue;
 +                      }
  
                        clear_page_dirty_for_io(page);
 -                      /*
 -                       * Delalloc doesn't support data journalling,
 -                       * but eventually maybe we'll lift this
 -                       * restriction.
 -                       */
 -                      if (unlikely(journal_data && PageChecked(page)))
 -                              err = __ext4_journalled_writepage(page, len);
 -                      else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
 -                              err = ext4_bio_write_page(&io_submit, page,
 -                                                        len, mpd->wbc);
 -                      else if (buffer_uninit(page_bufs)) {
 -                              ext4_set_bh_endio(page_bufs, inode);
 -                              err = block_write_full_page_endio(page,
 -                                      noalloc_get_block_write,
 -                                      mpd->wbc, ext4_end_io_buffer_write);
 -                      } else
 -                              err = block_write_full_page(page,
 -                                      noalloc_get_block_write, mpd->wbc);
 -
 +                      err = ext4_bio_write_page(&io_submit, page, len,
 +                                                mpd->wbc);
                        if (!err)
                                mpd->pages_written++;
                        /*
@@@ -1640,7 -1640,7 +1640,7 @@@ static void mpage_da_map_and_submit(str
                                 (unsigned long long) next,
                                 mpd->b_size >> mpd->inode->i_blkbits, err);
                        ext4_msg(sb, KERN_CRIT,
 -                              "This should not happen!! Data will be lost\n");
 +                              "This should not happen!! Data will be lost");
                        if (err == -ENOSPC)
                                ext4_print_free_blocks(mpd->inode);
                }
@@@ -1690,16 -1690,16 +1690,16 @@@ submit_io
   *
   * @mpd->lbh - extent of blocks
   * @logical - logical number of the block in the file
 - * @bh - bh of the block (used to access block's state)
 + * @b_state - b_state of the buffer head added
   *
   * the function is used to collect contig. blocks in same state
   */
 -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 -                                 sector_t logical, size_t b_size,
 +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
                                   unsigned long b_state)
  {
        sector_t next;
 -      int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 +      int blkbits = mpd->inode->i_blkbits;
 +      int nrblocks = mpd->b_size >> blkbits;
  
        /*
         * XXX Don't go larger than mballoc is willing to allocate
         * mpage_da_submit_io() into this function and then call
         * ext4_map_blocks() multiple times in a loop
         */
 -      if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
 +      if (nrblocks >= (8*1024*1024 >> blkbits))
                goto flush_it;
  
 -      /* check if thereserved journal credits might overflow */
 -      if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
 +      /* check if the reserved journal credits might overflow */
 +      if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
                if (nrblocks >= EXT4_MAX_TRANS_DATA) {
                        /*
                         * With non-extent format we are limited by the journal
                         * nrblocks.  So limit nrblocks.
                         */
                        goto flush_it;
 -              } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
 -                              EXT4_MAX_TRANS_DATA) {
 -                      /*
 -                       * Adding the new buffer_head would make it cross the
 -                       * allowed limit for which we have journal credit
 -                       * reserved. So limit the new bh->b_size
 -                       */
 -                      b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
 -                                              mpd->inode->i_blkbits;
 -                      /* we will do mpage_da_submit_io in the next loop */
                }
        }
        /*
         */
        if (mpd->b_size == 0) {
                mpd->b_blocknr = logical;
 -              mpd->b_size = b_size;
 +              mpd->b_size = 1 << blkbits;
                mpd->b_state = b_state & BH_FLAGS;
                return;
        }
         * Can we merge the block to our big extent?
         */
        if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
 -              mpd->b_size += b_size;
 +              mpd->b_size += 1 << blkbits;
                return;
        }
  
@@@ -1765,7 -1775,6 +1765,7 @@@ static int ext4_da_map_blocks(struct in
                              struct ext4_map_blocks *map,
                              struct buffer_head *bh)
  {
 +      struct extent_status es;
        int retval;
        sector_t invalid_block = ~((sector_t) 0xffff);
  
        ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
                  "logical block %lu\n", inode->i_ino, map->m_len,
                  (unsigned long) map->m_lblk);
 +
 +      /* Lookup extent status tree firstly */
 +      if (ext4_es_lookup_extent(inode, iblock, &es)) {
 +
 +              if (ext4_es_is_hole(&es)) {
 +                      retval = 0;
 +                      down_read((&EXT4_I(inode)->i_data_sem));
 +                      goto add_delayed;
 +              }
 +
 +              /*
 +               * Delayed extent could be allocated by fallocate.
 +               * So we need to check it.
 +               */
 +              if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
 +                      map_bh(bh, inode->i_sb, invalid_block);
 +                      set_buffer_new(bh);
 +                      set_buffer_delay(bh);
 +                      return 0;
 +              }
 +
 +              map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
 +              retval = es.es_len - (iblock - es.es_lblk);
 +              if (retval > map->m_len)
 +                      retval = map->m_len;
 +              map->m_len = retval;
 +              if (ext4_es_is_written(&es))
 +                      map->m_flags |= EXT4_MAP_MAPPED;
 +              else if (ext4_es_is_unwritten(&es))
 +                      map->m_flags |= EXT4_MAP_UNWRITTEN;
 +              else
 +                      BUG_ON(1);
 +
 +              return retval;
 +      }
 +
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
                        map->m_flags |= EXT4_MAP_FROM_CLUSTER;
                retval = 0;
        } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 -              retval = ext4_ext_map_blocks(NULL, inode, map, 0);
 +              retval = ext4_ext_map_blocks(NULL, inode, map,
 +                                           EXT4_GET_BLOCKS_NO_PUT_HOLE);
        else
 -              retval = ext4_ind_map_blocks(NULL, inode, map, 0);
 +              retval = ext4_ind_map_blocks(NULL, inode, map,
 +                                           EXT4_GET_BLOCKS_NO_PUT_HOLE);
  
 +add_delayed:
        if (retval == 0) {
 +              int ret;
                /*
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
                /* If the block was allocated from previously allocated cluster,
                 * then we dont need to reserve it again. */
                if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
 -                      retval = ext4_da_reserve_space(inode, iblock);
 -                      if (retval)
 +                      ret = ext4_da_reserve_space(inode, iblock);
 +                      if (ret) {
                                /* not enough space to reserve */
 +                              retval = ret;
                                goto out_unlock;
 +                      }
                }
  
 -              retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
 -              if (retval)
 +              ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 +                                          ~0, EXTENT_STATUS_DELAYED);
 +              if (ret) {
 +                      retval = ret;
                        goto out_unlock;
 +              }
  
                /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
                 * and it should not appear on the bh->b_state.
                map_bh(bh, inode->i_sb, invalid_block);
                set_buffer_new(bh);
                set_buffer_delay(bh);
 +      } else if (retval > 0) {
 +              int ret;
 +              unsigned long long status;
 +
 +              status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 +                              EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
 +              ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 +                                          map->m_pblk, status);
 +              if (ret != 0)
 +                      retval = ret;
        }
  
  out_unlock:
@@@ -1936,6 -1890,27 +1936,6 @@@ int ext4_da_get_block_prep(struct inod
        return 0;
  }
  
 -/*
 - * This function is used as a standard get_block_t calback function
 - * when there is no desire to allocate any blocks.  It is used as a
 - * callback function for block_write_begin() and block_write_full_page().
 - * These functions should only try to map a single block at a time.
 - *
 - * Since this function doesn't do block allocations even if the caller
 - * requests it by passing in create=1, it is critically important that
 - * any caller checks to make sure that any buffer heads are returned
 - * by this function are either all already mapped or marked for
 - * delayed allocation before calling  block_write_full_page().  Otherwise,
 - * b_blocknr could be left unitialized, and the page write functions will
 - * be taken by surprise.
 - */
 -static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 -                                 struct buffer_head *bh_result, int create)
 -{
 -      BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
 -      return _ext4_get_block(inode, iblock, bh_result, 0);
 -}
 -
  static int bget_one(handle_t *handle, struct buffer_head *bh)
  {
        get_bh(bh);
@@@ -1980,8 -1955,7 +1980,8 @@@ static int __ext4_journalled_writepage(
         * references to buffers so we are safe */
        unlock_page(page);
  
 -      handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 +      handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 +                                  ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
  static int ext4_writepage(struct page *page,
                          struct writeback_control *wbc)
  {
 -      int ret = 0, commit_write = 0;
 +      int ret = 0;
        loff_t size;
        unsigned int len;
        struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
 +      struct ext4_io_submit io_submit;
  
        trace_ext4_writepage(page);
        size = i_size_read(inode);
        else
                len = PAGE_CACHE_SIZE;
  
 +      page_bufs = page_buffers(page);
        /*
 -       * If the page does not have buffers (for whatever reason),
 -       * try to create them using __block_write_begin.  If this
 -       * fails, redirty the page and move on.
 +       * We cannot do block allocation or other extent handling in this
 +       * function. If there are buffers needing that, we have to redirty
 +       * the page. But we may reach here when we do a journal commit via
 +       * journal_submit_inode_data_buffers() and in that case we must write
 +       * allocated buffers to achieve data=ordered mode guarantees.
         */
 -      if (!page_has_buffers(page)) {
 -              if (__block_write_begin(page, 0, len,
 -                                      noalloc_get_block_write)) {
 -              redirty_page:
 -                      redirty_page_for_writepage(wbc, page);
 +      if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 +                                 ext4_bh_delay_or_unwritten)) {
 +              redirty_page_for_writepage(wbc, page);
 +              if (current->flags & PF_MEMALLOC) {
 +                      /*
 +                       * For memory cleaning there's no point in writing only
 +                       * some buffers. So just bail out. Warn if we came here
 +                       * from direct reclaim.
 +                       */
 +                      WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
 +                                                      == PF_MEMALLOC);
                        unlock_page(page);
                        return 0;
                }
 -              commit_write = 1;
 -      }
 -      page_bufs = page_buffers(page);
 -      if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 -                                 ext4_bh_delay_or_unwritten)) {
 -              /*
 -               * We don't want to do block allocation, so redirty
 -               * the page and return.  We may reach here when we do
 -               * a journal commit via journal_submit_inode_data_buffers.
 -               * We can also reach here via shrink_page_list but it
 -               * should never be for direct reclaim so warn if that
 -               * happens
 -               */
 -              WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
 -                                                              PF_MEMALLOC);
 -              goto redirty_page;
        }
 -      if (commit_write)
 -              /* now mark the buffer_heads as dirty and uptodate */
 -              block_commit_write(page, 0, len);
  
        if (PageChecked(page) && ext4_should_journal_data(inode))
                /*
                 */
                return __ext4_journalled_writepage(page, len);
  
 -      if (buffer_uninit(page_bufs)) {
 -              ext4_set_bh_endio(page_bufs, inode);
 -              ret = block_write_full_page_endio(page, noalloc_get_block_write,
 -                                          wbc, ext4_end_io_buffer_write);
 -      } else
 -              ret = block_write_full_page(page, noalloc_get_block_write,
 -                                          wbc);
 -
 +      memset(&io_submit, 0, sizeof(io_submit));
 +      ret = ext4_bio_write_page(&io_submit, page, len, wbc);
 +      ext4_io_submit(&io_submit);
        return ret;
  }
  
@@@ -2240,38 -2228,51 +2240,38 @@@ static int write_cache_pages_da(handle_
                        logical = (sector_t) page->index <<
                                (PAGE_CACHE_SHIFT - inode->i_blkbits);
  
 -                      if (!page_has_buffers(page)) {
 -                              mpage_add_bh_to_extent(mpd, logical,
 -                                                     PAGE_CACHE_SIZE,
 -                                                     (1 << BH_Dirty) | (1 << BH_Uptodate));
 -                              if (mpd->io_done)
 -                                      goto ret_extent_tail;
 -                      } else {
 +                      /* Add all dirty buffers to mpd */
 +                      head = page_buffers(page);
 +                      bh = head;
 +                      do {
 +                              BUG_ON(buffer_locked(bh));
                                /*
 -                               * Page with regular buffer heads,
 -                               * just add all dirty ones
 +                               * We need to try to allocate unmapped blocks
 +                               * in the same page.  Otherwise we won't make
 +                               * progress with the page in ext4_writepage
                                 */
 -                              head = page_buffers(page);
 -                              bh = head;
 -                              do {
 -                                      BUG_ON(buffer_locked(bh));
 +                              if (ext4_bh_delay_or_unwritten(NULL, bh)) {
 +                                      mpage_add_bh_to_extent(mpd, logical,
 +                                                             bh->b_state);
 +                                      if (mpd->io_done)
 +                                              goto ret_extent_tail;
 +                              } else if (buffer_dirty(bh) &&
 +                                         buffer_mapped(bh)) {
                                        /*
 -                                       * We need to try to allocate
 -                                       * unmapped blocks in the same page.
 -                                       * Otherwise we won't make progress
 -                                       * with the page in ext4_writepage
 +                                       * mapped dirty buffer. We need to
 +                                       * update the b_state because we look
 +                                       * at b_state in mpage_da_map_blocks.
 +                                       * We don't update b_size because if we
 +                                       * find an unmapped buffer_head later
 +                                       * we need to use the b_state flag of
 +                                       * that buffer_head.
                                         */
 -                                      if (ext4_bh_delay_or_unwritten(NULL, bh)) {
 -                                              mpage_add_bh_to_extent(mpd, logical,
 -                                                                     bh->b_size,
 -                                                                     bh->b_state);
 -                                              if (mpd->io_done)
 -                                                      goto ret_extent_tail;
 -                                      } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
 -                                              /*
 -                                               * mapped dirty buffer. We need
 -                                               * to update the b_state
 -                                               * because we look at b_state
 -                                               * in mpage_da_map_blocks.  We
 -                                               * don't update b_size because
 -                                               * if we find an unmapped
 -                                               * buffer_head later we need to
 -                                               * use the b_state flag of that
 -                                               * buffer_head.
 -                                               */
 -                                              if (mpd->b_size == 0)
 -                                                      mpd->b_state = bh->b_state & BH_FLAGS;
 -                                      }
 -                                      logical++;
 -                              } while ((bh = bh->b_this_page) != head);
 -                      }
 +                                      if (mpd->b_size == 0)
 +                                              mpd->b_state =
 +                                                      bh->b_state & BH_FLAGS;
 +                              }
 +                              logical++;
 +                      } while ((bh = bh->b_this_page) != head);
  
                        if (nr_to_write > 0) {
                                nr_to_write--;
@@@ -2412,8 -2413,7 +2412,8 @@@ retry
                needed_blocks = ext4_da_writepages_trans_blocks(inode);
  
                /* start a new transaction*/
 -              handle = ext4_journal_start(inode, needed_blocks);
 +              handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 +                                          needed_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@@ -2512,12 -2512,8 +2512,8 @@@ static int ext4_nonda_switch(struct sup
        /*
         * Start pushing delalloc when 1/2 of free blocks are dirty.
         */
-       if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
-           !writeback_in_progress(sb->s_bdi) &&
-           down_read_trylock(&sb->s_umount)) {
-               writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
-               up_read(&sb->s_umount);
-       }
+       if (dirty_blocks && (free_blocks < 2 * dirty_blocks))
+               try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
  
        if (2 * free_blocks < 3 * dirty_blocks ||
                free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
@@@ -2555,52 -2551,42 +2551,52 @@@ static int ext4_da_write_begin(struct f
                                                      pos, len, flags,
                                                      pagep, fsdata);
                if (ret < 0)
 -                      goto out;
 -              if (ret == 1) {
 -                      ret = 0;
 -                      goto out;
 -              }
 +                      return ret;
 +              if (ret == 1)
 +                      return 0;
        }
  
 -retry:
 +      /*
 +       * grab_cache_page_write_begin() can take a long time if the
 +       * system is thrashing due to memory pressure, or if the page
 +       * is being written back.  So grab it first before we start
 +       * the transaction handle.  This also allows us to allocate
 +       * the page (if needed) without using GFP_NOFS.
 +       */
 +retry_grab:
 +      page = grab_cache_page_write_begin(mapping, index, flags);
 +      if (!page)
 +              return -ENOMEM;
 +      unlock_page(page);
 +
        /*
         * With delayed allocation, we don't log the i_disksize update
         * if there is delayed block allocation. But we still need
         * to journalling the i_disksize update if writes to the end
         * of file which has an already mapped buffer.
         */
 -      handle = ext4_journal_start(inode, 1);
 +retry_journal:
 +      handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
        if (IS_ERR(handle)) {
 -              ret = PTR_ERR(handle);
 -              goto out;
 +              page_cache_release(page);
 +              return PTR_ERR(handle);
        }
 -      /* We cannot recurse into the filesystem as the transaction is already
 -       * started */
 -      flags |= AOP_FLAG_NOFS;
  
 -      page = grab_cache_page_write_begin(mapping, index, flags);
 -      if (!page) {
 +      lock_page(page);
 +      if (page->mapping != mapping) {
 +              /* The page got truncated from under us */
 +              unlock_page(page);
 +              page_cache_release(page);
                ext4_journal_stop(handle);
 -              ret = -ENOMEM;
 -              goto out;
 +              goto retry_grab;
        }
 -      *pagep = page;
 +      /* In case writeback began while the page was unlocked */
 +      wait_on_page_writeback(page);
  
        ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
        if (ret < 0) {
                unlock_page(page);
                ext4_journal_stop(handle);
 -              page_cache_release(page);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 */
                if (pos + len > inode->i_size)
                        ext4_truncate_failed_write(inode);
 +
 +              if (ret == -ENOSPC &&
 +                  ext4_should_retry_alloc(inode->i_sb, &retries))
 +                      goto retry_journal;
 +
 +              page_cache_release(page);
 +              return ret;
        }
  
 -      if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 -              goto retry;
 -out:
 +      *pagep = page;
        return ret;
  }
  
@@@ -2873,10 -2854,36 +2869,10 @@@ ext4_readpages(struct file *file, struc
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
  }
  
 -static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
 -{
 -      struct buffer_head *head, *bh;
 -      unsigned int curr_off = 0;
 -
 -      if (!page_has_buffers(page))
 -              return;
 -      head = bh = page_buffers(page);
 -      do {
 -              if (offset <= curr_off && test_clear_buffer_uninit(bh)
 -                                      && bh->b_private) {
 -                      ext4_free_io_end(bh->b_private);
 -                      bh->b_private = NULL;
 -                      bh->b_end_io = NULL;
 -              }
 -              curr_off = curr_off + bh->b_size;
 -              bh = bh->b_this_page;
 -      } while (bh != head);
 -}
 -
  static void ext4_invalidatepage(struct page *page, unsigned long offset)
  {
        trace_ext4_invalidatepage(page, offset);
  
 -      /*
 -       * free any io_end structure allocated for buffers to be discarded
 -       */
 -      if (ext4_should_dioread_nolock(page->mapping->host))
 -              ext4_invalidatepage_free_endio(page, offset);
 -
        /* No journalling happens on data buffers when this function is used */
        WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
  
@@@ -2948,7 -2955,7 +2944,7 @@@ static void ext4_end_io_dio(struct kioc
                            ssize_t size, void *private, int ret,
                            bool is_async)
  {
 -      struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 +      struct inode *inode = file_inode(iocb->ki_filp);
          ext4_io_end_t *io_end = iocb->private;
  
        /* if not async direct IO or dio with 0 bytes write, just return */
        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                ext4_free_io_end(io_end);
  out:
 +              inode_dio_done(inode);
                if (is_async)
                        aio_complete(iocb, ret, 0);
 -              inode_dio_done(inode);
                return;
        }
  
        ext4_add_complete_io(io_end);
  }
  
 -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 -{
 -      ext4_io_end_t *io_end = bh->b_private;
 -      struct inode *inode;
 -
 -      if (!test_clear_buffer_uninit(bh) || !io_end)
 -              goto out;
 -
 -      if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
 -              ext4_msg(io_end->inode->i_sb, KERN_INFO,
 -                       "sb umounted, discard end_io request for inode %lu",
 -                       io_end->inode->i_ino);
 -              ext4_free_io_end(io_end);
 -              goto out;
 -      }
 -
 -      /*
 -       * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
 -       * but being more careful is always safe for the future change.
 -       */
 -      inode = io_end->inode;
 -      ext4_set_io_unwritten_flag(inode, io_end);
 -      ext4_add_complete_io(io_end);
 -out:
 -      bh->b_private = NULL;
 -      bh->b_end_io = NULL;
 -      clear_buffer_uninit(bh);
 -      end_buffer_async_write(bh, uptodate);
 -}
 -
 -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
 -{
 -      ext4_io_end_t *io_end;
 -      struct page *page = bh->b_page;
 -      loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
 -      size_t size = bh->b_size;
 -
 -retry:
 -      io_end = ext4_init_io_end(inode, GFP_ATOMIC);
 -      if (!io_end) {
 -              pr_warn_ratelimited("%s: allocation fail\n", __func__);
 -              schedule();
 -              goto retry;
 -      }
 -      io_end->offset = offset;
 -      io_end->size = size;
 -      /*
 -       * We need to hold a reference to the page to make sure it
 -       * doesn't get evicted before ext4_end_io_work() has a chance
 -       * to convert the extent from written to unwritten.
 -       */
 -      io_end->page = page;
 -      get_page(io_end->page);
 -
 -      bh->b_private = io_end;
 -      bh->b_end_io = ext4_end_io_buffer_write;
 -      return 0;
 -}
 -
  /*
   * For ext4 extent files, ext4 will do direct-io write to holes,
   * preallocated extents, and those write extend the file, no need to
@@@ -3483,20 -3549,20 +3479,20 @@@ int ext4_can_truncate(struct inode *ino
  
  int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
  {
 -      struct inode *inode = file->f_path.dentry->d_inode;
 +      struct inode *inode = file_inode(file);
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
  
 -      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 -              /* TODO: Add support for non extent hole punching */
 -              return -EOPNOTSUPP;
 -      }
 +      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 +              return ext4_ind_punch_hole(file, offset, length);
  
        if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
                /* TODO: Add support for bigalloc file systems */
                return -EOPNOTSUPP;
        }
  
 +      trace_ext4_punch_hole(inode, offset, length);
 +
        return ext4_ext_punch_hole(file, offset, length);
  }
  
@@@ -3590,8 -3656,11 +3586,8 @@@ static int __ext4_get_inode_loc(struct 
        iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
  
        bh = sb_getblk(sb, block);
 -      if (!bh) {
 -              EXT4_ERROR_INODE_BLOCK(inode, block,
 -                                     "unable to read itable block");
 -              return -EIO;
 -      }
 +      if (unlikely(!bh))
 +              return -ENOMEM;
        if (!buffer_uptodate(bh)) {
                lock_buffer(bh);
  
  
                        /* Is the inode bitmap in cache? */
                        bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
 -                      if (!bitmap_bh)
 +                      if (unlikely(!bitmap_bh))
                                goto make_io;
  
                        /*
@@@ -4331,9 -4400,8 +4327,9 @@@ int ext4_setattr(struct dentry *dentry
  
                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
 -              handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
 -                                      EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
 +              handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
 +                      (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
 +                       EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
            (attr->ia_size < inode->i_size)) {
                handle_t *handle;
  
 -              handle = ext4_journal_start(inode, 3);
 +              handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                                                            attr->ia_size);
                        if (error) {
                                /* Do as much error cleanup as possible */
 -                              handle = ext4_journal_start(inode, 3);
 +                              handle = ext4_journal_start(inode,
 +                                                          EXT4_HT_INODE, 3);
                                if (IS_ERR(handle)) {
                                        ext4_orphan_del(NULL, inode);
                                        goto err_out;
@@@ -4730,7 -4797,7 +4726,7 @@@ void ext4_dirty_inode(struct inode *ino
  {
        handle_t *handle;
  
 -      handle = ext4_journal_start(inode, 2);
 +      handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle))
                goto out;
  
@@@ -4831,7 -4898,7 +4827,7 @@@ int ext4_change_inode_journal_flag(stru
  
        /* Finally we can mark the inode as dirty. */
  
 -      handle = ext4_journal_start(inode, 1);
 +      handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);
  
@@@ -4855,7 -4922,7 +4851,7 @@@ int ext4_page_mkwrite(struct vm_area_st
        unsigned long len;
        int ret;
        struct file *file = vma->vm_file;
 -      struct inode *inode = file->f_path.dentry->d_inode;
 +      struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        handle_t *handle;
        get_block_t *get_block;
                                            0, len, NULL,
                                            ext4_bh_unmapped)) {
                        /* Wait so that we don't change page under IO */
 -                      wait_on_page_writeback(page);
 +                      wait_for_stable_page(page);
                        ret = VM_FAULT_LOCKED;
                        goto out;
                }
        else
                get_block = ext4_get_block;
  retry_alloc:
 -      handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
 +      handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 +                                  ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
diff --combined fs/fs-writeback.c
index 359494ea1bde2a4df306004155cd31cdd6e89252,ad3cc46a743aa8c1b347af78a2615f7868659feb..21f46fb3a10193a966c18879c5bea794fd9a494e
@@@ -318,14 -318,8 +318,14 @@@ static void queue_io(struct bdi_writeba
  
  static int write_inode(struct inode *inode, struct writeback_control *wbc)
  {
 -      if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
 -              return inode->i_sb->s_op->write_inode(inode, wbc);
 +      int ret;
 +
 +      if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
 +              trace_writeback_write_inode_start(inode, wbc);
 +              ret = inode->i_sb->s_op->write_inode(inode, wbc);
 +              trace_writeback_write_inode(inode, wbc);
 +              return ret;
 +      }
        return 0;
  }
  
@@@ -456,8 -450,6 +456,8 @@@ __writeback_single_inode(struct inode *
  
        WARN_ON(!(inode->i_state & I_SYNC));
  
 +      trace_writeback_single_inode_start(inode, wbc, nr_to_write);
 +
        ret = do_writepages(mapping, wbc);
  
        /*
@@@ -1158,12 -1150,8 +1158,12 @@@ void __mark_inode_dirty(struct inode *i
         * dirty the inode itself
         */
        if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 +              trace_writeback_dirty_inode_start(inode, flags);
 +
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode, flags);
 +
 +              trace_writeback_dirty_inode(inode, flags);
        }
  
        /*
@@@ -1344,47 -1332,43 +1344,43 @@@ void writeback_inodes_sb(struct super_b
  EXPORT_SYMBOL(writeback_inodes_sb);
  
  /**
-  * writeback_inodes_sb_if_idle        -       start writeback if none underway
+  * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
   * @sb: the superblock
-  * @reason: reason why some writeback work was initiated
+  * @nr: the number of pages to write
+  * @reason: the reason of writeback
   *
-  * Invoke writeback_inodes_sb if no writeback is currently underway.
+  * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
   * Returns 1 if writeback was started, 0 if not.
   */
- int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
+ int try_to_writeback_inodes_sb_nr(struct super_block *sb,
+                                 unsigned long nr,
+                                 enum wb_reason reason)
  {
-       if (!writeback_in_progress(sb->s_bdi)) {
-               down_read(&sb->s_umount);
-               writeback_inodes_sb(sb, reason);
-               up_read(&sb->s_umount);
+       if (writeback_in_progress(sb->s_bdi))
                return 1;
-       } else
+       if (!down_read_trylock(&sb->s_umount))
                return 0;
+       writeback_inodes_sb_nr(sb, nr, reason);
+       up_read(&sb->s_umount);
+       return 1;
  }
- EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
  
  /**
-  * writeback_inodes_sb_nr_if_idle     -       start writeback if none underway
+  * try_to_writeback_inodes_sb - try to start writeback if none underway
   * @sb: the superblock
-  * @nr: the number of pages to write
   * @reason: reason why some writeback work was initiated
   *
-  * Invoke writeback_inodes_sb if no writeback is currently underway.
+  * Implement by try_to_writeback_inodes_sb_nr()
   * Returns 1 if writeback was started, 0 if not.
   */
- int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
-                                  unsigned long nr,
-                                  enum wb_reason reason)
+ int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
  {
-       if (!writeback_in_progress(sb->s_bdi)) {
-               down_read(&sb->s_umount);
-               writeback_inodes_sb_nr(sb, nr, reason);
-               up_read(&sb->s_umount);
-               return 1;
-       } else
-               return 0;
+       return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
  }
- EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  
  /**
   * sync_inodes_sb     -       sync sb inode pages
diff --combined mm/page-writeback.c
index 742c40583159cba7736d87c4ec3bc5a7250613d5,1534ebd6e70f5d6ff5ca59a83f7ac0498b889976..efe68148f621959beb28987dd9430fff289f583f
@@@ -35,7 -35,6 +35,7 @@@
  #include <linux/buffer_head.h> /* __set_page_dirty_buffers */
  #include <linux/pagevec.h>
  #include <linux/timer.h>
 +#include <linux/sched/rt.h>
  #include <trace/events/writeback.h>
  
  /*
@@@ -241,9 -240,6 +241,9 @@@ static unsigned long global_dirtyable_m
        if (!vm_highmem_is_dirtyable)
                x -= highmem_dirtyable_memory(x);
  
 +      /* Subtract min_free_kbytes */
 +      x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
 +
        return x + 1;   /* Ensure that we never return 0 */
  }
  
@@@ -696,7 -692,7 +696,7 @@@ static unsigned long bdi_position_ratio
         *     => fast response on large errors; small oscillation near setpoint
         */
        setpoint = (freerun + limit) / 2;
-       x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
+       x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
                    limit - setpoint + 1);
        pos_ratio = x;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@@ -1986,8 -1982,6 +1986,8 @@@ int __set_page_dirty_no_writeback(struc
   */
  void account_page_dirtied(struct page *page, struct address_space *mapping)
  {
 +      trace_writeback_dirty_page(page, mapping);
 +
        if (mapping_cap_account_dirty(mapping)) {
                __inc_zone_page_state(page, NR_FILE_DIRTY);
                __inc_zone_page_state(page, NR_DIRTIED);
@@@ -2295,27 -2289,3 +2295,27 @@@ int mapping_tagged(struct address_spac
        return radix_tree_tagged(&mapping->page_tree, tag);
  }
  EXPORT_SYMBOL(mapping_tagged);
 +
 +/**
 + * wait_for_stable_page() - wait for writeback to finish, if necessary.
 + * @page:     The page to wait on.
 + *
 + * This function determines if the given page is related to a backing device
 + * that requires page contents to be held stable during writeback.  If so, then
 + * it will wait for any pending writeback to complete.
 + */
 +void wait_for_stable_page(struct page *page)
 +{
 +      struct address_space *mapping = page_mapping(page);
 +      struct backing_dev_info *bdi = mapping->backing_dev_info;
 +
 +      if (!bdi_cap_stable_pages_required(bdi))
 +              return;
 +#ifdef CONFIG_NEED_BOUNCE_POOL
 +      if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
 +              return;
 +#endif /* CONFIG_NEED_BOUNCE_POOL */
 +
 +      wait_on_page_writeback(page);
 +}
 +EXPORT_SYMBOL_GPL(wait_for_stable_page);
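
For context on the second fix, a sketch of the caller side, mirroring the
ext4_nonda_switch() hunk above (maybe_flush_delalloc and its parameters are
invented for illustration):

    #include <linux/fs.h>
    #include <linux/writeback.h>

    /* Unlike the old writeback_inodes_sb_if_idle(), the renamed helper
     * only trylocks s_umount: if the lock is contended it returns 0 and
     * the caller simply moves on, instead of sleeping in down_read()
     * and risking a deadlock against umount. */
    static void maybe_flush_delalloc(struct super_block *sb,
                                     u64 free_blocks, u64 dirty_blocks)
    {
            if (dirty_blocks && free_blocks < 2 * dirty_blocks)
                    try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
    }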