diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f12ad0a8585b3d0f0788cfe5b88b6ab662a1547..8121e75352ee9bddd4726ca685d6d3e855256bdd 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -559,7 +559,7 @@ restart:
        if (error <= 0)
                return error;
 
-       error = xfs_break_layouts(inode, iolock);
+       error = xfs_break_layouts(inode, iolock, true);
        if (error)
                return error;
 
@@ -569,21 +569,42 @@ restart:
         * write.  If zeroing is needed and we are currently holding the
         * iolock shared, we need to update it to exclusive which implies
         * having to redo all checks before.
+        *
+        * We need to serialise against EOF updates that occur on IO
+        * completion here. We want to make sure that nobody changes the
+        * size while we do this check, until we have placed an IO barrier
+        * (i.e. hold the XFS_IOLOCK_EXCL) that prevents new IO from being
+        * dispatched. The spinlock effectively forms a memory barrier once
+        * we hold the XFS_IOLOCK_EXCL, so we are guaranteed to see the
+        * latest EOF value and hence correctly decide if zeroing is needed.
         */
+       spin_lock(&ip->i_flags_lock);
        if (iocb->ki_pos > i_size_read(inode)) {
                bool    zero = false;
 
+               spin_unlock(&ip->i_flags_lock);
                if (*iolock == XFS_IOLOCK_SHARED) {
                        xfs_rw_iunlock(ip, *iolock);
                        *iolock = XFS_IOLOCK_EXCL;
                        xfs_rw_ilock(ip, *iolock);
                        iov_iter_reexpand(from, count);
+
+                       /*
+                        * We now have an IO submission barrier in place, but
+                        * AIO can do EOF updates during IO completion and hence
+                        * we need to wait for all of them to drain. Non-AIO
+                        * DIO will have drained before we are given the
+                        * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+                        * no-op.
+                        */
+                       inode_dio_wait(inode);
                        goto restart;
                }
                error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
                if (error)
                        return error;
-       }
+       } else
+               spin_unlock(&ip->i_flags_lock);
 
        /*
         * Updating the timestamps will grab the ilock again from
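
A note on the pattern in the hunk above: the EOF check runs under
ip->i_flags_lock, but that lock must be dropped before the iolock can be
upgraded, so after the upgrade (and after inode_dio_wait() has drained AIO
completions) every check is redone from restart. Below is a minimal
user-space sketch of that check/upgrade/retry shape using pthreads; the
names (file_state, eof_zero_needed) are invented for illustration, and
this is an analogy, not XFS code.

/*
 * Sketch of the check/upgrade/retry pattern, assuming invented names:
 * file_state stands in for xfs_inode, iolock for XFS_IOLOCK, flags_lock
 * for ip->i_flags_lock. Build: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct file_state {
	pthread_rwlock_t	iolock;		/* XFS_IOLOCK analogue */
	pthread_spinlock_t	flags_lock;	/* i_flags_lock analogue */
	long long		size;		/* i_size analogue */
};

/* Call with iolock held; returns true iff zeroing beyond EOF is needed. */
static bool eof_zero_needed(struct file_state *fs, long long pos, bool *excl)
{
restart:
	pthread_spin_lock(&fs->flags_lock);
	if (pos <= fs->size) {
		/* The spinlock makes this size sample stable vs. updaters. */
		pthread_spin_unlock(&fs->flags_lock);
		return false;
	}
	pthread_spin_unlock(&fs->flags_lock);

	if (!*excl) {
		/*
		 * Upgrade shared -> exclusive. The size may change while no
		 * lock is held (XFS also runs inode_dio_wait() here to drain
		 * AIO completions), so redo all checks from the top.
		 */
		pthread_rwlock_unlock(&fs->iolock);
		pthread_rwlock_wrlock(&fs->iolock);
		*excl = true;
		goto restart;
	}
	return true;
}

int main(void)
{
	struct file_state fs = { .size = 4096 };
	bool excl = false;

	pthread_rwlock_init(&fs.iolock, NULL);
	pthread_spin_init(&fs.flags_lock, 0);

	pthread_rwlock_rdlock(&fs.iolock);
	printf("zeroing needed: %d, excl: %d\n",
	       (int)eof_zero_needed(&fs, 8192, &excl), (int)excl);
	pthread_rwlock_unlock(&fs.iolock);
	return 0;
}
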
@@ -645,6 +666,8 @@ xfs_file_dio_aio_write(
        int                     iolock;
        size_t                  count = iov_iter_count(from);
        loff_t                  pos = iocb->ki_pos;
+       loff_t                  end;
+       struct iov_iter         data;
        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -685,10 +708,11 @@ xfs_file_dio_aio_write(
                goto out;
        count = iov_iter_count(from);
        pos = iocb->ki_pos;
+       end = pos + count - 1;
 
        if (mapping->nrpages) {
                ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-                                                   pos, pos + count - 1);
+                                                  pos, end);
                if (ret)
                        goto out;
                /*
@@ -698,7 +722,7 @@ xfs_file_dio_aio_write(
                 */
                ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
                                        pos >> PAGE_CACHE_SHIFT,
-                                       (pos + count - 1) >> PAGE_CACHE_SHIFT);
+                                       end >> PAGE_CACHE_SHIFT);
                WARN_ON_ONCE(ret);
                ret = 0;
        }
@@ -715,8 +739,22 @@ xfs_file_dio_aio_write(
        }
 
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-       ret = generic_file_direct_write(iocb, from, pos);
 
+       data = *from;
+       ret = mapping->a_ops->direct_IO(iocb, &data, pos);
+
+       /* see generic_file_direct_write() for why this is necessary */
+       if (mapping->nrpages) {
+               invalidate_inode_pages2_range(mapping,
+                                             pos >> PAGE_CACHE_SHIFT,
+                                             end >> PAGE_CACHE_SHIFT);
+       }
+
+       if (ret > 0) {
+               pos += ret;
+               iov_iter_advance(from, ret);
+               iocb->ki_pos = pos;
+       }
 out:
        xfs_rw_iunlock(ip, iolock);
 
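The hunk above open-codes the tail of generic_file_direct_write():
->direct_IO is handed a copy of the iterator (data = *from), and the
caller's iterator and ki_pos are advanced only by the bytes actually
written, and only on success. A toy user-space sketch of that
copy-then-advance idiom, with an invented struct iter standing in for
struct iov_iter:

#include <stdio.h>

struct iter {			/* invented stand-in for struct iov_iter */
	const char	*buf;
	size_t		count;
};

/* Consumes its iterator unconditionally, like a low-level DIO path. */
static long toy_direct_io(struct iter *it)
{
	long done = (long)it->count;

	it->buf += it->count;
	it->count = 0;
	return done;
}

static long dio_write(struct iter *from, long long *pos)
{
	struct iter	data = *from;	/* work on a copy, as XFS does */
	long		ret = toy_direct_io(&data);

	if (ret > 0) {
		/* Only now advance the caller's iterator and position. */
		from->buf += ret;
		from->count -= (size_t)ret;
		*pos += ret;
	}
	return ret;
}

int main(void)
{
	struct iter it = { .buf = "hello", .count = 5 };
	long long pos = 0;
	long ret = dio_write(&it, &pos);

	printf("ret=%ld pos=%lld left=%zu\n", ret, pos, it.count);
	return 0;
}
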
@@ -822,6 +860,11 @@ xfs_file_write_iter(
        return ret;
 }
 
+#define        XFS_FALLOC_FL_SUPPORTED                                         \
+               (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
+                FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
+                FALLOC_FL_INSERT_RANGE)
+
 STATIC long
 xfs_file_fallocate(
        struct file             *file,
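
XFS_FALLOC_FL_SUPPORTED collects every supported mode bit into one mask,
so the EOPNOTSUPP test below stays a single branch as new flags get added.
A short user-space sketch of the same mask test, assuming a uapi
<linux/falloc.h> new enough to define FALLOC_FL_INSERT_RANGE:

#include <errno.h>
#include <linux/falloc.h>
#include <stdio.h>

#define SUPPORTED_MODES	(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |	\
			 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
			 FALLOC_FL_INSERT_RANGE)

static int check_mode(int mode)
{
	/* Any bit outside the supported set rejects the whole request. */
	if (mode & ~SUPPORTED_MODES)
		return -EOPNOTSUPP;
	return 0;
}

int main(void)
{
	printf("%d\n", check_mode(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE));
	printf("%d\n", check_mode(1 << 30));	/* unknown bit: -EOPNOTSUPP */
	return 0;
}
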
@@ -835,18 +878,21 @@ xfs_file_fallocate(
        enum xfs_prealloc_flags flags = 0;
        uint                    iolock = XFS_IOLOCK_EXCL;
        loff_t                  new_size = 0;
+       bool                    do_file_insert = false;
 
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
-       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+       if (mode & ~XFS_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;
 
        xfs_ilock(ip, iolock);
-       error = xfs_break_layouts(inode, &iolock);
+       error = xfs_break_layouts(inode, &iolock, false);
        if (error)
                goto out_unlock;
 
+       xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+       iolock |= XFS_MMAPLOCK_EXCL;
+
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                error = xfs_free_file_space(ip, offset, len);
                if (error)
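
Note how the freshly taken XFS_MMAPLOCK_EXCL is ORed into iolock above, so
the single xfs_iunlock() at out_unlock releases exactly the locks held at
whatever point an error path jumps there. A minimal sketch of that
accumulation idiom; take_lock, drop_locks and the LOCK_* bits are invented
for illustration, not XFS interfaces:

#include <stdbool.h>
#include <stdio.h>

#define LOCK_IO		0x1	/* stands in for XFS_IOLOCK_EXCL */
#define LOCK_MMAP	0x2	/* stands in for XFS_MMAPLOCK_EXCL */

static void take_lock(unsigned *held, unsigned which)
{
	/* Take `which` here, then record it so the unlock path sees it. */
	*held |= which;
}

static void drop_locks(unsigned held)
{
	printf("releasing locks 0x%x\n", held);
}

static int do_fallocate(bool fail_early)
{
	unsigned iolock = 0;
	int error = 0;

	take_lock(&iolock, LOCK_IO);
	if (fail_early) {		/* e.g. breaking layouts failed */
		error = -1;
		goto out_unlock;	/* only LOCK_IO gets released */
	}
	take_lock(&iolock, LOCK_MMAP);
	/* ... the actual space manipulation would go here ... */
out_unlock:
	drop_locks(iolock);
	return error;
}

int main(void)
{
	do_fallocate(true);	/* releases 0x1 */
	do_fallocate(false);	/* releases 0x3 */
	return 0;
}
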
@@ -873,6 +919,27 @@ xfs_file_fallocate(
                error = xfs_collapse_file_space(ip, offset, len);
                if (error)
                        goto out_unlock;
+       } else if (mode & FALLOC_FL_INSERT_RANGE) {
+               unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+               new_size = i_size_read(inode) + len;
+               if (offset & blksize_mask || len & blksize_mask) {
+                       error = -EINVAL;
+                       goto out_unlock;
+               }
+
+               /* check the new inode size does not exceed the max file size */
+               if (new_size > inode->i_sb->s_maxbytes) {
+                       error = -EFBIG;
+                       goto out_unlock;
+               }
+
+               /* Offset should be less than i_size */
+               if (offset >= i_size_read(inode)) {
+                       error = -EINVAL;
+                       goto out_unlock;
+               }
+               do_file_insert = true;
        } else {
                flags |= XFS_PREALLOC_SET;
 
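The three FALLOC_FL_INSERT_RANGE checks above reject unaligned ranges,
growth past the maximum file size, and insertion at or beyond EOF. A
hypothetical user-space sketch of the same checks, with blkbits and
max_bytes standing in for i_blkbits and s_maxbytes:

#include <errno.h>
#include <stdio.h>

static int check_insert_range(long long offset, long long len,
			      long long isize, unsigned blkbits,
			      long long max_bytes)
{
	long long blksize_mask = (1LL << blkbits) - 1;
	long long new_size = isize + len;

	if ((offset & blksize_mask) || (len & blksize_mask))
		return -EINVAL;		/* both must be block aligned */
	if (new_size > max_bytes)
		return -EFBIG;		/* grown size exceeds the maximum */
	if (offset >= isize)
		return -EINVAL;		/* must insert inside the file */
	return 0;
}

int main(void)
{
	/* 4k blocks, 1MiB file, insert 8k at offset 4k: OK */
	printf("%d\n", check_insert_range(4096, 8192, 1 << 20, 12, 1LL << 40));
	/* unaligned offset: -EINVAL */
	printf("%d\n", check_insert_range(100, 8192, 1 << 20, 12, 1LL << 40));
	return 0;
}
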
@@ -907,8 +974,19 @@ xfs_file_fallocate(
                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = new_size;
                error = xfs_setattr_size(ip, &iattr);
+               if (error)
+                       goto out_unlock;
        }
 
+       /*
+        * Perform hole insertion now that the file size has been
+        * updated, so that if we crash during the operation we don't
+        * leave shifted extents past EOF and hence lose access to the
+        * data contained within them.
+        */
+       if (do_file_insert)
+               error = xfs_insert_file_space(ip, offset, len);
+
 out_unlock:
        xfs_iunlock(ip, iolock);
        return error;
@@ -996,20 +1074,6 @@ xfs_file_mmap(
        return 0;
 }
 
-/*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
-       struct vm_area_struct   *vma,
-       struct vm_fault         *vmf)
-{
-       return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1385,6 +1449,55 @@ xfs_file_llseek(
        }
 }
 
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_sem (MM)
+ *   i_mmap_lock (XFS - truncate serialisation)
+ *     page_lock (MM)
+ *       i_lock (XFS - extent map serialisation)
+ */
+STATIC int
+xfs_filemap_fault(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
+       int                     error;
+
+       trace_xfs_filemap_fault(ip);
+
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       error = filemap_fault(vma, vmf);
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+       return error;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
+ */
+STATIC int
+xfs_filemap_page_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
+       int                     error;
+
+       trace_xfs_filemap_page_mkwrite(ip);
+
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+       return error;
+}
+
 const struct file_operations xfs_file_operations = {
        .llseek         = xfs_file_llseek,
        .read_iter      = xfs_file_read_iter,
@@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = {
 };
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
-       .fault          = filemap_fault,
+       .fault          = xfs_filemap_fault,
        .map_pages      = filemap_map_pages,
-       .page_mkwrite   = xfs_vm_page_mkwrite,
+       .page_mkwrite   = xfs_filemap_page_mkwrite,
 };
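
Both new handlers share one wrapper shape: take the filesystem's own mmap
lock shared around the generic handler, so a truncate that takes the same
lock exclusive can never run concurrently with a page fault. A user-space
sketch of that pattern with a pthread rwlock; generic_fault() and
truncate_file() are invented stand-ins, not kernel interfaces:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmaplock = PTHREAD_RWLOCK_INITIALIZER;

static int generic_fault(void)		/* stands in for filemap_fault() */
{
	return 0;
}

static int wrapped_fault(void)
{
	int error;

	pthread_rwlock_rdlock(&mmaplock);	/* XFS_MMAPLOCK_SHARED */
	error = generic_fault();
	pthread_rwlock_unlock(&mmaplock);
	return error;
}

static void truncate_file(void)
{
	pthread_rwlock_wrlock(&mmaplock);	/* XFS_MMAPLOCK_EXCL */
	/* ... remove pages and extents with no fault racing in ... */
	pthread_rwlock_unlock(&mmaplock);
}

int main(void)
{
	truncate_file();
	printf("fault returned %d\n", wrapped_fault());
	return 0;
}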