Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs

[karo-tx-linux.git] / fs / ext3 / inode.c
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c

index efce2bbfb5e580976e76d95791dd676a29d71dd0..f5157d0d1b43772e95aace234b4ee50fca9ade3b 100644 (file)
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
  }
  
  /*
- * Note that we always start a transaction even if we're not journalling
- * data.  This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext3_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
+ * Note that whenever we need to map blocks we start a transaction even if
+ * we're not journalling data.  This is to preserve ordering: any hole
+ * instantiation within __block_write_full_page -> ext3_get_block() should be
+ * journalled along with the data so we don't crash and then get metadata which
   * refers to old data.
   *
   * In all journalling modes block_write_full_page() will start the I/O.
   *
- * Problem:
- *
- *     ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- *             ext3_writepage()
- *
- * Similar for:
- *
- *     ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext3_get_block().  We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- *         non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- *   In journalled data mode, a data buffer may be metadata against the
- *   current transaction.  But the same file is part of a shared mapping
- *   and someone does a writepage() on it.
- *
- *   We will move the buffer onto the async_data list, but *after* it has
- *   been dirtied. So there's a small window where we have dirty data on
- *   BJ_Metadata.
- *
- *   Note that this only applies to the last partial page in the file.  The
- *   bit which block_write_full_page() uses prepare/commit for.  (That's
- *   broken code anyway: it's wrong for msync()).
- *
- *   It's a rare case: affects the final partial page, for journalled data
- *   where the file is subject to bith write() and writepage() in the same
- *   transction.  To fix it we'll need a custom block_write_full_page().
- *   We'll probably need that anyway for journalling writepage() output.
- *
   * We don't honour synchronous mounts for writepage().  That would be
   * disastrous.  Any write() or metadata operation will sync the fs for
   * us.
- *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
   */
  static int ext3_ordered_writepage(struct page *page,
                                 struct writeback_control *wbc)
@@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,
          * block_write_full_page() succeeded.  Otherwise they are unmapped,
          * and generally junk.
          */
-       if (ret == 0) {
-               err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+       if (ret == 0)
+               ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
                                         NULL, journal_dirty_data_fn);
-               if (!ret)
-                       ret = err;
-       }
         walk_page_buffers(handle, page_bufs, 0,
                         PAGE_CACHE_SIZE, NULL, bput_one);
         err = ext3_journal_stop(handle);
@@ -1925,6 +1883,8 @@ retry:
                          * and pretend the write failed... */
                         ext3_truncate_failed_direct_write(inode);
                         ret = PTR_ERR(handle);
+                       if (inode->i_nlink)
+                               ext3_orphan_del(NULL, inode);
                         goto out;
                 }
                 if (inode->i_nlink)
@@ -3212,21 +3172,20 @@ out_brelse:
   *
   * We are called from a few places:
   *
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
   *   Here, there will be no transaction running. We wait for any running
   *   transaction to commit.
   *
- * - Within sys_sync(), kupdate and such.
- *   We wait on commit, if tol to.
+ * - Within flush work (for sys_sync(), kupdate and such).
+ *   We wait on commit, if told to.
   *
- * - Within prune_icache() (PF_MEMALLOC == true)
- *   Here we simply return.  We can't afford to block kswapd on the
- *   journal commit.
+ * - Within iput_final() -> write_inode_now()
+ *   We wait on commit, if told to.
   *
   * In all cases it is actually safe for us to return without doing anything,
   * because the inode has been copied into a raw inode buffer in
- * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext3_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
+ * writeback.
   *
   * Note that we are absolutely dependent upon all inode dirtiers doing the
   * right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3238,13 +3197,13 @@ out_brelse:
   *     stuff();
   *     inode->i_size = expr;
   *
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost.  Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost.  Plus the inode will no longer be on the
+ * superblock's dirty inode list.
   */
  int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
  {
-       if (current->flags & PF_MEMALLOC)
+       if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
                 return 0;
  
         if (ext3_journal_current_handle()) {
@@ -3253,7 +3212,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
                 return -EIO;
         }
  
-       if (wbc->sync_mode != WB_SYNC_ALL)
+       /*
+        * No need to force a commit in WB_SYNC_NONE mode.  Nor when
+        * wbc->for_sync is set: ext3_sync_fs() will force the commit
+        * after everything is written.
+        */
+       if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
                 return 0;
  
         return ext3_force_commit(inode->i_sb);