Merge branch 'xfs-dax-updates' into for-next

author Dave Chinner <david@fromorbit.com>

Tue, 3 Nov 2015 02:28:41 +0000 (13:28 +1100)

committer Dave Chinner <david@fromorbit.com>

Tue, 3 Nov 2015 02:28:41 +0000 (13:28 +1100)
author Dave Chinner <david@fromorbit.com>
Tue, 3 Nov 2015 02:28:41 +0000 (13:28 +1100)
committer Dave Chinner <david@fromorbit.com>
Tue, 3 Nov 2015 02:28:41 +0000 (13:28 +1100)
diff --git a/fs/dax.c b/fs/dax.c

index 7ae6df7ea1d2d04962ef4554a6a2fa1efb977006..74033ad1bc9291e540fe17d4a367818b9b557aa0 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -29,6 +29,11 @@
  #include <linux/uio.h>
  #include <linux/vmstat.h>
  
+/*
+ * dax_clear_blocks() is called from within transaction context from XFS,
+ * and hence this means the stack from this point must follow GFP_NOFS
+ * semantics for all operations.
+ */
  int dax_clear_blocks(struct inode *inode, sector_t block, long size)
  {
         struct block_device *bdev = inode->i_sb->s_bdev;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c

index e926197e0620991ad7f2ca67502e0e0673d45254..3479294c1d586603d75c432267a91ff744a31006 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2509,7 +2509,7 @@ xfs_alloc_vextent(
                  * Try near allocation first, then anywhere-in-ag after
                  * the first a.g. fails.
                  */
-               if ((args->userdata  == XFS_ALLOC_INITIAL_USER_DATA) &&
+               if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
                     (mp->m_flags & XFS_MOUNT_32BITINODES)) {
                         args->fsbno = XFS_AGB_TO_FSB(mp,
                                         ((mp->m_agfrotor / rotorstep) %
@@ -2640,6 +2640,14 @@ xfs_alloc_vextent(
                 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
                         args->len);
  #endif
+
+               /* Zero the extent if we were asked to do so */
+               if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+                       error = xfs_zero_extent(args->ip, args->fsbno, args->len);
+                       if (error)
+                               goto error0;
+               }
+
         }
         xfs_perag_put(args->pag);
         return 0;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h

index ca1c8168373aa444ebb2ef7eb2865e1d859c1ceb..0ecde4d5cac8ff2ceaf2c373efd87583fcb8152a 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg {
         struct xfs_mount *mp;           /* file system mount point */
         struct xfs_buf  *agbp;          /* buffer for a.g. freelist header */
         struct xfs_perag *pag;          /* per-ag struct for this agno */
+       struct xfs_inode *ip;           /* for userdata zeroing method */
         xfs_fsblock_t   fsbno;          /* file system block number */
         xfs_agnumber_t  agno;           /* allocation group number */
         xfs_agblock_t   agbno;          /* allocation group-relative block # */
@@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg {
         char            wasdel;         /* set if allocation was prev delayed */
         char            wasfromfl;      /* set if allocation is from freelist */
         char            isfl;           /* set if is freelist blocks - !acctg */
-       char            userdata;       /* set if this is user data */
+       char            userdata;       /* mask defining userdata treatment */
         xfs_fsblock_t   firstblock;     /* io first block allocated */
  } xfs_alloc_arg_t;
  
  /*
   * Defines for userdata
   */
-#define XFS_ALLOC_USERDATA             1       /* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA    2       /* special case start of file */
+#define XFS_ALLOC_USERDATA             (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA    (1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO                (1 << 2)/* zero extent on allocation */
  
  xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
                 struct xfs_perag *pag, xfs_extlen_t need);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c

index ab92d10890ed743d3a3cffae2d361e9cf732d0ae..119c2422aac78bcbfb65b8af527efff0aa8e5951 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3802,8 +3802,13 @@ xfs_bmap_btalloc(
         args.wasdel = ap->wasdel;
         args.isfl = 0;
         args.userdata = ap->userdata;
-       if ((error = xfs_alloc_vextent(&args)))
+       if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+               args.ip = ap->ip;
+
+       error = xfs_alloc_vextent(&args);
+       if (error)
                 return error;
+
         if (tryagain && args.fsbno == NULLFSBLOCK) {
                 /*
                  * Exact allocation failed. Now try with alignment
@@ -4302,11 +4307,14 @@ xfs_bmapi_allocate(
  
         /*
          * Indicate if this is the first user data in the file, or just any
-        * user data.
+        * user data. And if it is userdata, indicate whether it needs to
+        * be initialised to zero during allocation.
          */
         if (!(bma->flags & XFS_BMAPI_METADATA)) {
                 bma->userdata = (bma->offset == 0) ?
                         XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+               if (bma->flags & XFS_BMAPI_ZERO)
+                       bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
         }
  
         bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4421,6 +4429,17 @@ xfs_bmapi_convert_unwritten(
         mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
                                 ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
  
+       /*
+        * Before insertion into the bmbt, zero the range being converted
+        * if required.
+        */
+       if (flags & XFS_BMAPI_ZERO) {
+               error = xfs_zero_extent(bma->ip, mval->br_startblock,
+                                       mval->br_blockcount);
+               if (error)
+                       return error;
+       }
+
         error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
                         &bma->cur, mval, bma->firstblock, bma->flist,
                         &tmp_logflags);
@@ -4514,6 +4533,18 @@ xfs_bmapi_write(
         ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
  
+       /* zeroing is currently only for data extents, not metadata */
+       ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+                       (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+       /*
+        * we can allocate unwritten extents or pre-zero allocated blocks,
+        * but it makes no sense to do both at once. This would result in
+        * zeroing the unwritten extent twice, but it still being an
+        * unwritten extent....
+        */
+       ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+                       (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
         if (unlikely(XFS_TEST_ERROR(
             (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
              XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h

index 6aaa0c1c7200594de983a02b40d61900ced8e200..a160f8a5a3fcd280ac1af5e02063fd3818710fe5 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_bmalloca {
         xfs_extlen_t            minleft; /* amount must be left after alloc */
         bool                    eof;    /* set if allocating past last extent */
         bool                    wasdel; /* replacing a delayed allocation */
-       bool                    userdata;/* set if is user data */
         bool                    aeof;   /* allocated space at eof */
         bool                    conv;   /* overwriting unwritten extents */
+       char                    userdata;/* userdata mask */
         int                     flags;
  };
  
@@ -109,6 +109,14 @@ typedef    struct xfs_bmap_free
   */
  #define XFS_BMAPI_CONVERT      0x040
  
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * during the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO         0x080
+
  #define XFS_BMAPI_FLAGS \
         { XFS_BMAPI_ENTIRE,     "ENTIRE" }, \
         { XFS_BMAPI_METADATA,   "METADATA" }, \
@@ -116,7 +124,8 @@ typedef     struct xfs_bmap_free
         { XFS_BMAPI_PREALLOC,   "PREALLOC" }, \
         { XFS_BMAPI_IGSTATE,    "IGSTATE" }, \
         { XFS_BMAPI_CONTIG,     "CONTIG" }, \
-       { XFS_BMAPI_CONVERT,    "CONVERT" }
+       { XFS_BMAPI_CONVERT,    "CONVERT" }, \
+       { XFS_BMAPI_ZERO,       "ZERO" }
  
  
  static inline int xfs_bmapi_aflag(int w)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c

index e4fff5898c1c0eefd4fb053b028bd357e6dd69d8..29e7e5dd5178ef84638752f31ce6149c64398061 100644 (file)
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1259,13 +1259,28 @@ xfs_vm_releasepage(
   * the DIO. There is only going to be one reference to the ioend and its life
   * cycle is constrained by the DIO completion code. hence we don't need
   * reference counting here.
+ *
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
+ *
+ * For DAX page faults, we are preparing to never see unwritten extents here,
+ * nor should we ever extend the inode size. Hence we will soon have nothing to
+ * do here for this case, ensuring we don't have to provide an IO completion
+ * callback to free an ioend that we don't actually need for a fault into the
+ * page at offset (2^63 - 1FSB) bytes.
   */
+
  static void
  xfs_map_direct(
         struct inode            *inode,
         struct buffer_head      *bh_result,
         struct xfs_bmbt_irec    *imap,
-       xfs_off_t               offset)
+       xfs_off_t               offset,
+       bool                    dax_fault)
  {
         struct xfs_ioend        *ioend;
         xfs_off_t               size = bh_result->b_size;
@@ -1278,6 +1293,13 @@ xfs_map_direct(
  
         trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
  
+       if (dax_fault) {
+               ASSERT(type == XFS_IO_OVERWRITE);
+               trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+                                           imap);
+               return;
+       }
+
         if (bh_result->b_private) {
                 ioend = bh_result->b_private;
                 ASSERT(ioend->io_size > 0);
@@ -1292,7 +1314,8 @@ xfs_map_direct(
                                               ioend->io_size, ioend->io_type,
                                               imap);
         } else if (type == XFS_IO_UNWRITTEN ||
-                  offset + size > i_size_read(inode)) {
+                  offset + size > i_size_read(inode) ||
+                  offset + size < 0) {
                 ioend = xfs_alloc_ioend(inode, type);
                 ioend->io_offset = offset;
                 ioend->io_size = size;
@@ -1354,7 +1377,8 @@ __xfs_get_blocks(
         sector_t                iblock,
         struct buffer_head      *bh_result,
         int                     create,
-       bool                    direct)
+       bool                    direct,
+       bool                    dax_fault)
  {
         struct xfs_inode        *ip = XFS_I(inode);
         struct xfs_mount        *mp = ip->i_mount;
@@ -1402,10 +1426,12 @@ __xfs_get_blocks(
         if (error)
                 goto out_unlock;
  
+       /* for DAX, we convert unwritten extents directly */
         if (create &&
             (!nimaps ||
              (imap.br_startblock == HOLESTARTBLOCK ||
-             imap.br_startblock == DELAYSTARTBLOCK))) {
+             imap.br_startblock == DELAYSTARTBLOCK) ||
+            (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
                 if (direct || xfs_get_extsz_hint(ip)) {
                         /*
                          * xfs_iomap_write_direct() expects the shared lock. It
@@ -1450,6 +1476,12 @@ __xfs_get_blocks(
                 goto out_unlock;
         }
  
+       if (IS_DAX(inode) && create) {
+               ASSERT(!ISUNWRITTEN(&imap));
+               /* zeroing is not needed at a higher layer */
+               new = 0;
+       }
+
         /* trim mapping down to size requested */
         if (direct || size > (1 << inode->i_blkbits))
                 xfs_map_trim_size(inode, iblock, bh_result,
@@ -1467,7 +1499,8 @@ __xfs_get_blocks(
                         set_buffer_unwritten(bh_result);
                 /* direct IO needs special help */
                 if (create && direct)
-                       xfs_map_direct(inode, bh_result, &imap, offset);
+                       xfs_map_direct(inode, bh_result, &imap, offset,
+                                      dax_fault);
         }
  
         /*
@@ -1514,7 +1547,7 @@ xfs_get_blocks(
         struct buffer_head      *bh_result,
         int                     create)
  {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, false);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
  }
  
  int
@@ -1524,7 +1557,17 @@ xfs_get_blocks_direct(
         struct buffer_head      *bh_result,
         int                     create)
  {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+}
+
+int
+xfs_get_blocks_dax_fault(
+       struct inode            *inode,
+       sector_t                iblock,
+       struct buffer_head      *bh_result,
+       int                     create)
+{
+       return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
  }
  
  static void
@@ -1623,45 +1666,6 @@ xfs_end_io_direct_write(
         __xfs_end_io_direct_write(inode, ioend, offset, size);
  }
  
-/*
- * For DAX we need a mapping buffer callback for unwritten extent conversion
- * when page faults allocate blocks and then zero them. Note that in this
- * case the mapping indicated by the ioend may extend beyond EOF. We most
- * definitely do not want to extend EOF here, so we trim back the ioend size to
- * EOF.
- */
-#ifdef CONFIG_FS_DAX
-void
-xfs_end_io_dax_write(
-       struct buffer_head      *bh,
-       int                     uptodate)
-{
-       struct xfs_ioend        *ioend = bh->b_private;
-       struct inode            *inode = ioend->io_inode;
-       ssize_t                 size = ioend->io_size;
-
-       ASSERT(IS_DAX(ioend->io_inode));
-
-       /* if there was an error zeroing, then don't convert it */
-       if (!uptodate)
-               ioend->io_error = -EIO;
-
-       /*
-        * Trim update to EOF, so we don't extend EOF during unwritten extent
-        * conversion of partial EOF blocks.
-        */
-       spin_lock(&XFS_I(inode)->i_flags_lock);
-       if (ioend->io_offset + size > i_size_read(inode))
-               size = i_size_read(inode) - ioend->io_offset;
-       spin_unlock(&XFS_I(inode)->i_flags_lock);
-
-       __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
-
-}
-#else
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
-#endif
-
  static inline ssize_t
  xfs_vm_do_dio(
         struct inode            *inode,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h

index 86afd1ac7895f8d225fa2da2285c03f7014666e3..f6ffc9ae5cebeae7ebf8c2cc7c3598c07c4c68f5 100644 (file)
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -58,7 +58,8 @@ int   xfs_get_blocks(struct inode *inode, sector_t offset,
                        struct buffer_head *map_bh, int create);
  int    xfs_get_blocks_direct(struct inode *inode, sector_t offset,
                               struct buffer_head *map_bh, int create);
-void   xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
+int    xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
+                                struct buffer_head *map_bh, int create);
  
  extern void xfs_count_page_state(struct page *, int *, int *);
  
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c

index eca325e4226147db653495f02bc3c6ddb32774ae..dbae6490a79a5f41e5407b57f82477e5ed7582b8 100644 (file)
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -56,6 +56,35 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
                  XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
  }
  
+/*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+       struct xfs_inode *ip,
+       xfs_fsblock_t   start_fsb,
+       xfs_off_t       count_fsb)
+{
+       struct xfs_mount *mp = ip->i_mount;
+       xfs_daddr_t     sector = xfs_fsb_to_db(ip, start_fsb);
+       sector_t        block = XFS_BB_TO_FSBT(mp, sector);
+       ssize_t         size = XFS_FSB_TO_B(mp, count_fsb);
+
+       if (IS_DAX(VFS_I(ip)))
+               return dax_clear_blocks(VFS_I(ip), block, size);
+
+       /*
+        * let the block layer decide on the fastest method of
+        * implementing the zeroing.
+        */
+       return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+
+}
+
  /*
   * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
   * caller.  Frees all the extents that need freeing, which must be done
@@ -229,6 +258,13 @@ xfs_bmap_rtalloc(
                 xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
                         ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
                                         XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+
+               /* Zero the extent if we were asked to do so */
+               if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+                       error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
+                       if (error)
+                               return error;
+               }
         } else {
                 ap->length = 0;
         }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index a3bf4c099dd2e71edf3faa7c5779293c5e0fee8c..39743efae79501f3d590b05722ebd0c30deb50f4 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1493,7 +1493,7 @@ xfs_file_llseek(
   *
   * mmap_sem (MM)
   *   sb_start_pagefault(vfs, freeze)
- *     i_mmap_lock (XFS - truncate serialisation)
+ *     i_mmaplock (XFS - truncate serialisation)
   *       page_lock (MM)
   *         i_lock (XFS - extent map serialisation)
   */
@@ -1519,8 +1519,7 @@ xfs_filemap_page_mkwrite(
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
         if (IS_DAX(inode)) {
-               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
         } else {
                 ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
                 ret = block_page_mkwrite_return(ret);
@@ -1554,7 +1553,7 @@ xfs_filemap_fault(
                  * changes to xfs_get_blocks_direct() to map unwritten extent
                  * ioend for conversion on read-only mappings.
                  */
-               ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
         } else
                 ret = filemap_fault(vma, vmf);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1562,6 +1561,13 @@ xfs_filemap_fault(
         return ret;
  }
  
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is no
+ * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * handle both cases. @flags carries the information on the type of fault
+ * occurring.
+ */
  STATIC int
  xfs_filemap_pmd_fault(
         struct vm_area_struct   *vma,
@@ -1578,15 +1584,54 @@ xfs_filemap_pmd_fault(
  
         trace_xfs_filemap_pmd_fault(ip);
  
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vma->vm_file);
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+       }
+
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+                             NULL);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       sb_end_pagefault(inode->i_sb);
  
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(inode->i_sb);
+
+       return ret;
+}
+
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp
+ * updates on write faults. In reality, it needs to serialise against
+ * truncate, similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
+ * barrier in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+
+       struct inode            *inode = file_inode(vma->vm_file);
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     ret = VM_FAULT_NOPAGE;
+       loff_t                  size;
+
+       trace_xfs_filemap_pfn_mkwrite(ip);
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+
+       /* check if the faulting page hasn't raced with truncate */
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
         return ret;
+
  }
  
  static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1594,6 +1639,7 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
         .pmd_fault      = xfs_filemap_pmd_fault,
         .map_pages      = filemap_map_pages,
         .page_mkwrite   = xfs_filemap_page_mkwrite,
+       .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
  };
  
  STATIC int
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c

index c3cb5a552c4eb2fae269bbed95e40c86fa8a93f5..f4f5b43cf64712cf8fe46924820142fc75a33e86 100644 (file)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -132,6 +132,7 @@ xfs_iomap_write_direct(
         int             committed;
         int             error;
         int             lockmode;
+       int             bmapi_flags = XFS_BMAPI_PREALLOC;
  
         rt = XFS_IS_REALTIME_INODE(ip);
         extsz = xfs_get_extsz_hint(ip);
@@ -195,6 +196,23 @@ xfs_iomap_write_direct(
          * Allocate and setup the transaction
          */
         tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+       /*
+        * For DAX, we do not allocate unwritten extents, but instead we zero
+        * the block before we commit the transaction.  Ideally we'd like to do
+        * this outside the transaction context, but if we commit and then crash
+        * we may not have zeroed the blocks and this will be exposed on
+        * recovery of the allocation. Hence we must zero before commit.
+        * Further, if we are mapping unwritten extents here, we need to zero
+        * and convert them to written so that we don't need an unwritten extent
+        * callback for DAX. This also means that we need to be able to dip into
+        * the reserve block pool if there is no space left but we need to do
+        * unwritten extent conversion.
+        */
+       if (IS_DAX(VFS_I(ip))) {
+               bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+               tp->t_flags |= XFS_TRANS_RESERVE;
+       }
         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                                   resblks, resrtextents);
         /*
@@ -221,7 +239,7 @@ xfs_iomap_write_direct(
         xfs_bmap_init(&free_list, &firstfsb);
         nimaps = 1;
         error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
-                               XFS_BMAPI_PREALLOC, &firstfsb, resblks, imap,
+                               bmapi_flags, &firstfsb, resblks, imap,
                                 &nimaps, &free_list);
         if (error)
                 goto out_bmap_cancel;
@@ -232,6 +250,7 @@ xfs_iomap_write_direct(
         error = xfs_bmap_finish(&tp, &free_list, &committed);
         if (error)
                 goto out_bmap_cancel;
+
         error = xfs_trans_commit(tp);
         if (error)
                 goto out_unlock;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h

index 6db5fc041d4f2550d06582827d73255c8f102f96..b57098481c10a2a55a05bf6e75e6e43f7e224401 100644 (file)
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -338,4 +338,7 @@ extern int  xfs_dev_is_read_only(struct xfs_mount *, char *);
  
  extern void    xfs_set_low_space_thresholds(struct xfs_mount *);
  
+int    xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+                       xfs_off_t count_fsb);
+
  #endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index 957f5ccdd84fcc4606f3427dbc39c9f5636f032d..877079eb0f8f0e5d286e4c797761df6bf2f55228 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -689,6 +689,7 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
  DEFINE_INODE_EVENT(xfs_filemap_fault);
  DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
  DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
  
  DECLARE_EVENT_CLASS(xfs_iref_class,
         TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
author	Dave Chinner <david@fromorbit.com>
	Tue, 3 Nov 2015 02:28:41 +0000 (13:28 +1100)
committer	Dave Chinner <david@fromorbit.com>
	Tue, 3 Nov 2015 02:28:41 +0000 (13:28 +1100)
fs/dax.c		patch \| blob \| history
fs/xfs/libxfs/xfs_alloc.c		patch \| blob \| history
fs/xfs/libxfs/xfs_alloc.h		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.c		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.h		patch \| blob \| history
fs/xfs/xfs_aops.c		patch \| blob \| history
fs/xfs/xfs_aops.h		patch \| blob \| history
fs/xfs/xfs_bmap_util.c		patch \| blob \| history
fs/xfs/xfs_file.c		patch \| blob \| history
fs/xfs/xfs_iomap.c		patch \| blob \| history
fs/xfs/xfs_mount.h		patch \| blob \| history
fs/xfs/xfs_trace.h		patch \| blob \| history