index 43bbd6d1037d20099bc50393ab2b255f02e188a7..2a6889b3585f068c73091d8895639b7e941d702a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -460,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
        return ret;
 }
 
-/*
- * Invalidate exceptional DAX entry if easily possible. This handles DAX
- * entries for invalidate_inode_pages() so we evict the entry only if we can
- * do so without blocking.
- */
-int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-       int ret = 0;
-       void *entry, **slot;
-       struct radix_tree_root *page_tree = &mapping->page_tree;
-
-       spin_lock_irq(&mapping->tree_lock);
-       entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
-       if (!entry || !radix_tree_exceptional_entry(entry) ||
-           slot_locked(mapping, slot))
-               goto out;
-       if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
-           radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
-               goto out;
-       radix_tree_delete(page_tree, index);
-       mapping->nrexceptional--;
-       ret = 1;
-out:
-       spin_unlock_irq(&mapping->tree_lock);
-       if (ret)
-               dax_wake_mapping_entry_waiter(mapping, index, entry, true);
-       return ret;
-}
-
 /*
  * Invalidate exceptional DAX entry if it is clean.
  */
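
The clean-only helper that this comment introduces, dax_invalidate_mapping_entry_sync(), is not visible in the hunk. A minimal sketch of what such a helper looks like, modeled on the removed function above and on the get/put_unlocked_mapping_entry() pairing used elsewhere in this file; the real function in this tree may be factored differently:

/*
 * Sketch only: drop the DAX radix tree entry at @index, but never a dirty
 * entry or one tagged for writeback, so data still waiting to be flushed
 * is not lost.
 */
static int dax_invalidate_entry_if_clean(struct address_space *mapping,
					 pgoff_t index)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	void *entry;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_mapping_entry(mapping, index, entry);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}
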
@@ -509,21 +480,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 static int dax_load_hole(struct address_space *mapping, void **entry,
                         struct vm_fault *vmf)
 {
+       struct inode *inode = mapping->host;
        struct page *page;
        int ret;
 
        /* Hole page already exists? Return it...  */
        if (!radix_tree_exceptional_entry(*entry)) {
                page = *entry;
-               goto out;
+               goto finish_fault;
        }
 
        /* This will replace locked radix tree entry with a hole page */
        page = find_or_create_page(mapping, vmf->pgoff,
                                   vmf->gfp_mask | __GFP_ZERO);
-       if (!page)
-               return VM_FAULT_OOM;
- out:
+       if (!page) {
+               ret = VM_FAULT_OOM;
+               goto out;
+       }
+
+finish_fault:
        vmf->page = page;
        ret = finish_fault(vmf);
        vmf->page = NULL;
@@ -531,8 +506,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
        if (!ret) {
                /* Grab reference for PTE that is now referencing the page */
                get_page(page);
-               return VM_FAULT_NOPAGE;
+               ret = VM_FAULT_NOPAGE;
        }
+out:
+       trace_dax_load_hole(inode, vmf, ret);
        return ret;
 }
 
@@ -817,6 +794,7 @@ static int dax_writeback_one(struct block_device *bdev,
        spin_lock_irq(&mapping->tree_lock);
        radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
        spin_unlock_irq(&mapping->tree_lock);
+       trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
  dax_unlock:
        dax_read_unlock(id);
        put_locked_mapping_entry(mapping, index, entry);
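
The trace_dax_load_hole() and trace_dax_writeback_one() calls above, like the further trace_dax_* calls in the hunks below, rely on tracepoint declarations that live outside this diff, in include/trace/events/fs_dax.h. A hedged sketch of the shape such declarations typically take for the (inode, vmf, result) style of event; the field selection and format string here are illustrative, not this tree's actual definitions:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM fs_dax

#if !defined(_TRACE_FS_DAX_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FS_DAX_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(dax_pte_fault_class,
	TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
	TP_ARGS(inode, vmf, result),
	TP_STRUCT__entry(
		__field(unsigned long, ino)
		__field(unsigned long, vm_flags)
		__field(unsigned long, address)
		__field(pgoff_t, pgoff)
		__field(int, result)
	),
	TP_fast_assign(
		__entry->ino = inode->i_ino;
		__entry->vm_flags = vmf->vma->vm_flags;
		__entry->address = vmf->address;
		__entry->pgoff = vmf->pgoff;
		__entry->result = result;
	),
	TP_printk("ino %#lx vm_flags %#lx address %#lx pgoff %#lx result %#x",
		__entry->ino, __entry->vm_flags, __entry->address,
		__entry->pgoff, __entry->result)
)

#define DEFINE_PTE_FAULT_EVENT(name) \
DEFINE_EVENT(dax_pte_fault_class, name, \
	TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \
	TP_ARGS(inode, vmf, result))

DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
DEFINE_PTE_FAULT_EVENT(dax_load_hole);
DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite);
DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry);

#endif /* _TRACE_FS_DAX_H */

/* This part must be outside the include guard. */
#include <trace/define_trace.h>

Events that take different arguments (the writeback-range and insert-mapping trace calls further down) would use their own event classes; once built in, all of them show up under the fs_dax group in tracefs and can be toggled at runtime.
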
@@ -857,6 +835,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
        start_index = wbc->range_start >> PAGE_SHIFT;
        end_index = wbc->range_end >> PAGE_SHIFT;
 
+       trace_dax_writeback_range(inode, start_index, end_index);
+
        tag_pages_for_writeback(mapping, start_index, end_index);
 
        pagevec_init(&pvec, 0);
@@ -876,14 +856,14 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
                        ret = dax_writeback_one(bdev, dax_dev, mapping,
                                        indices[i], pvec.pages[i]);
-                       if (ret < 0) {
-                               put_dax(dax_dev);
-                               return ret;
-                       }
+                       if (ret < 0)
+                               goto out;
                }
        }
+out:
        put_dax(dax_dev);
-       return 0;
+       trace_dax_writeback_range_done(inode, start_index, end_index);
+       return (ret < 0 ? ret : 0);
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
@@ -916,6 +896,7 @@ static int dax_insert_mapping(struct address_space *mapping,
                return PTR_ERR(ret);
        *entryp = ret;
 
+       trace_dax_insert_mapping(mapping->host, vmf, ret);
        return vm_insert_mixed(vma, vaddr, pfn);
 }
 
@@ -927,6 +908,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
 {
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
        void *entry, **slot;
        pgoff_t index = vmf->pgoff;
 
@@ -936,6 +918,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
                if (entry)
                        put_unlocked_mapping_entry(mapping, index, entry);
                spin_unlock_irq(&mapping->tree_lock);
+               trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
                return VM_FAULT_NOPAGE;
        }
        radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -948,6 +931,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
         */
        finish_mkwrite_fault(vmf);
        put_locked_mapping_entry(mapping, index, entry);
+       trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -980,12 +964,12 @@ int __dax_zero_page_range(struct block_device *bdev,
                void *kaddr;
                pfn_t pfn;
 
-               rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+               rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
                if (rc)
                        return rc;
 
                id = dax_read_lock();
-               rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+               rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
                                &pfn);
                if (rc < 0) {
                        dax_read_unlock(id);
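
The switch from size / PHYS_PFN(size) to PAGE_SIZE / 1 matters because __dax_zero_page_range() is used for partial-page zeroing, so size can be well below a page. A small userspace illustration of the arithmetic, assuming the common 4K page size:

#include <stdio.h>

#define PAGE_SHIFT	12				/* assumes 4K pages */
#define PHYS_PFN(x)	((unsigned long)((x) >> PAGE_SHIFT))

int main(void)
{
	unsigned long size = 512;	/* e.g. zeroing a single 512-byte sector */

	/* The old code asked dax_direct_access() for PHYS_PFN(size) pages: */
	printf("PHYS_PFN(%lu) = %lu\n", size, PHYS_PFN(size));	/* prints 0 */

	/* ...whereas the single-page mapping this function memsets needs exactly 1. */
	return 0;
}
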
@@ -1031,7 +1015,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
         * into page tables. We have to tear down these mappings so that data
         * written by write(2) is visible in mmap.
         */
-       if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+       if (iomap->flags & IOMAP_F_NEW) {
                invalidate_inode_pages2_range(inode->i_mapping,
                                              pos >> PAGE_SHIFT,
                                              (end - 1) >> PAGE_SHIFT);
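
The comment above describes a stale-mapping hazard that is easiest to see from userspace. A hedged illustration of the scenario; the path and sizes are placeholders, and a file on a DAX-capable filesystem is assumed:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/dax/file", O_CREAT | O_RDWR, 0644);
	char *map;

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 4096))	/* sets the size without allocating a block */
		return 1;

	map = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	printf("before write: %d\n", map[0]);	/* read fault over the hole */

	/* write(2) allocates the block; the iomap actor sees IOMAP_F_NEW. */
	if (pwrite(fd, "data", 4, 0) != 4)
		return 1;

	/*
	 * The zero/hole page mapped by the read fault above must be torn
	 * down here, otherwise this load could keep returning 0 instead of
	 * the freshly written 'd'.
	 */
	printf("after write:  %c\n", map[0]);
	return 0;
}
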
@@ -1150,34 +1134,50 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
        int vmf_ret = 0;
        void *entry;
 
+       trace_dax_pte_fault(inode, vmf, vmf_ret);
        /*
         * Check whether offset isn't beyond end of file now. Caller is supposed
         * to hold locks serializing us with truncate / punch hole so this is
         * a reliable test.
         */
-       if (pos >= i_size_read(inode))
-               return VM_FAULT_SIGBUS;
+       if (pos >= i_size_read(inode)) {
+               vmf_ret = VM_FAULT_SIGBUS;
+               goto out;
+       }
 
        if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
                flags |= IOMAP_WRITE;
 
+       entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+       if (IS_ERR(entry)) {
+               vmf_ret = dax_fault_return(PTR_ERR(entry));
+               goto out;
+       }
+
+       /*
+        * It is possible, particularly with mixed reads & writes to private
+        * mappings, that we have raced with a PMD fault that overlaps with
+        * the PTE we need to set up.  If so just return and the fault will be
+        * retried.
+        */
+       if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
+               vmf_ret = VM_FAULT_NOPAGE;
+               goto unlock_entry;
+       }
+
        /*
         * Note that we don't bother to use iomap_apply here: DAX required
         * the file system block size to be equal the page size, which means
         * that we never have to deal with more than a single extent here.
         */
        error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
-       if (error)
-               return dax_fault_return(error);
-       if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-               vmf_ret = dax_fault_return(-EIO);       /* fs corruption? */
-               goto finish_iomap;
+       if (error) {
+               vmf_ret = dax_fault_return(error);
+               goto unlock_entry;
        }
-
-       entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-       if (IS_ERR(entry)) {
-               vmf_ret = dax_fault_return(PTR_ERR(entry));
-               goto finish_iomap;
+       if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+               error = -EIO;   /* fs corruption? */
+               goto error_finish_iomap;
        }
 
        sector = dax_iomap_sector(&iomap, pos);
@@ -1199,13 +1199,13 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                }
 
                if (error)
-                       goto error_unlock_entry;
+                       goto error_finish_iomap;
 
                __SetPageUptodate(vmf->cow_page);
                vmf_ret = finish_fault(vmf);
                if (!vmf_ret)
                        vmf_ret = VM_FAULT_DONE_COW;
-               goto unlock_entry;
+               goto finish_iomap;
        }
 
        switch (iomap.type) {
@@ -1225,7 +1225,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
        case IOMAP_HOLE:
                if (!(vmf->flags & FAULT_FLAG_WRITE)) {
                        vmf_ret = dax_load_hole(mapping, &entry, vmf);
-                       goto unlock_entry;
+                       goto finish_iomap;
                }
                /*FALLTHRU*/
        default:
@@ -1234,10 +1234,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                break;
        }
 
- error_unlock_entry:
+ error_finish_iomap:
        vmf_ret = dax_fault_return(error) | major;
- unlock_entry:
-       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
        if (ops->iomap_end) {
                int copied = PAGE_SIZE;
@@ -1252,6 +1250,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                 */
                ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
        }
+ unlock_entry:
+       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+       trace_dax_pte_fault_done(inode, vmf, vmf_ret);
        return vmf_ret;
 }
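
Several of the reworked error paths above funnel through dax_fault_return(), which sits outside these hunks. A sketch of its usual errno-to-VM_FAULT translation; the exact mapping is an assumption, not taken from this tree:

static int dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;
}
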
 
@@ -1396,6 +1398,28 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
        if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
                goto fallback;
 
+       /*
+        * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+        * PMD or a HZP entry.  If it can't (because a 4k page is already in
+        * the tree, for instance), it will return -EEXIST and we just fall
+        * back to 4k entries.
+        */
+       entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+       if (IS_ERR(entry))
+               goto fallback;
+
+       /*
+        * It is possible, particularly with mixed reads & writes to private
+        * mappings, that we have raced with a PTE fault that overlaps with
+        * the PMD we need to set up.  If so just return and the fault will be
+        * retried.
+        */
+       if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
+                       !pmd_devmap(*vmf->pmd)) {
+               result = 0;
+               goto unlock_entry;
+       }
+
        /*
         * Note that we don't use iomap_apply here.  We aren't doing I/O, only
         * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1404,21 +1428,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
        pos = (loff_t)pgoff << PAGE_SHIFT;
        error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
        if (error)
-               goto fallback;
+               goto unlock_entry;
 
        if (iomap.offset + iomap.length < pos + PMD_SIZE)
                goto finish_iomap;
 
-       /*
-        * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-        * PMD or a HZP entry.  If it can't (because a 4k page is already in
-        * the tree, for instance), it will return -EEXIST and we just fall
-        * back to 4k entries.
-        */
-       entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-       if (IS_ERR(entry))
-               goto finish_iomap;
-
        switch (iomap.type) {
        case IOMAP_MAPPED:
                result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
@@ -1426,7 +1440,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
                if (WARN_ON_ONCE(write))
-                       goto unlock_entry;
+                       break;
                result = dax_pmd_load_hole(vmf, &iomap, &entry);
                break;
        default:
@@ -1434,8 +1448,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
                break;
        }
 
- unlock_entry:
-       put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
        if (ops->iomap_end) {
                int copied = PMD_SIZE;
@@ -1451,6 +1463,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
                ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
                                &iomap);
        }
+ unlock_entry:
+       put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
        if (result == VM_FAULT_FALLBACK) {
                split_huge_pmd(vma, vmf->pmd, vmf->address);
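
Earlier in dax_iomap_pmd_fault(), the context line `if ((pgoff | PG_PMD_COLOUR) > max_pgoff) goto fallback;` decides whether a huge mapping can be attempted at all. A small worked example of that check, assuming the common x86-64 configuration (4K pages, 2M PMDs, so PG_PMD_COLOUR == 511); the constants are illustrative:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PG_PMD_COLOUR	((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1)

int main(void)
{
	unsigned long pgoff = 1000;	/* faulting page index (illustrative) */
	unsigned long max_pgoff = 1100;	/* last page index inside i_size */

	/*
	 * pgoff | PG_PMD_COLOUR is the index of the last 4K page that the
	 * aligned 2M mapping around pgoff would cover; if it lies beyond
	 * max_pgoff, the fault falls back to 4K PTEs.
	 */
	printf("last page of 2M mapping: %lu (max %lu) -> %s\n",
	       pgoff | PG_PMD_COLOUR, max_pgoff,
	       (pgoff | PG_PMD_COLOUR) > max_pgoff ? "fallback" : "ok");
	return 0;
}
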