diff --git a/fs/block_dev.c b/fs/block_dev.c
index 01b8e0d4b4ff0aec98725485463f5c8b2a362102..7b9cd49622b132f5f71e6557f05d0a2591a37c16 100644
@@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
 {
        struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-       if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                return;
 
        invalidate_bh_lrus();
@@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
        return 0;
 }
 
+static struct inode *bdev_file_inode(struct file *file)
+{
+       return file->f_mapping->host;
+}
+
 static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 {
        struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
+       struct inode *inode = bdev_file_inode(file);
 
        if (IS_DAX(inode))
                return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
@@ -338,18 +343,18 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
  */
 static loff_t block_llseek(struct file *file, loff_t offset, int whence)
 {
-       struct inode *bd_inode = file->f_mapping->host;
+       struct inode *bd_inode = bdev_file_inode(file);
        loff_t retval;
 
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
        retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
        return retval;
 }
        
 int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
-       struct inode *bd_inode = filp->f_mapping->host;
+       struct inode *bd_inode = bdev_file_inode(filp);
        struct block_device *bdev = I_BDEV(bd_inode);
        int error;
        
@@ -395,7 +400,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return result;
 
-       result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL);
+       result = blk_queue_enter(bdev->bd_queue, false);
        if (result)
                return result;
        result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
@@ -432,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 
        if (!ops->rw_page || bdev_get_integrity(bdev))
                return -EOPNOTSUPP;
-       result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL);
+       result = blk_queue_enter(bdev->bd_queue, false);
        if (result)
                return result;
 
@@ -450,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
 /**
  * bdev_direct_access() - Get the address for directly-accessible memory
  * @bdev: The device containing the memory
- * @sector: The offset within the device
- * @addr: Where to put the address of the memory
- * @pfn: The Page Frame Number for the memory
- * @size: The number of bytes requested
+ * @dax: control and output parameters for ->direct_access
  *
  * If a block device is made up of directly addressable memory, this function
  * will tell the caller the PFN and the address of the memory.  The address
@@ -464,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
  * Return: negative errno if an error occurs, otherwise the number of bytes
  * accessible at this address.
  */
-long bdev_direct_access(struct block_device *bdev, sector_t sector,
-                       void __pmem **addr, unsigned long *pfn, long size)
+long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
-       long avail;
+       sector_t sector = dax->sector;
+       long avail, size = dax->size;
        const struct block_device_operations *ops = bdev->bd_disk->fops;
 
        /*
@@ -486,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
        sector += get_start_sect(bdev);
        if (sector % (PAGE_SIZE / 512))
                return -EINVAL;
-       avail = ops->direct_access(bdev, sector, addr, pfn);
+       avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
        if (!avail)
                return -ERANGE;
+       if (avail > 0 && avail & ~PAGE_MASK)
+               return -ENXIO;
        return min(avail, size);
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
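With this change the caller packs the inputs (sector, size) into a struct blk_dax_ctl and reads the outputs (kernel address, PFN) back out of it, instead of passing four separate arguments. A minimal caller sketch under those assumptions follows; the helper name is hypothetical and only the blk_dax_ctl fields visible in this hunk are used.

/* Hypothetical caller: map the first page of a DAX-capable block device. */
static long example_map_first_page(struct block_device *bdev,
				   void __pmem **kaddr, unsigned long *pfn)
{
	struct blk_dax_ctl dax = {
		.sector = 0,
		.size = PAGE_SIZE,
	};
	long len = bdev_direct_access(bdev, &dax);

	if (len < 0)
		return len;	/* -EINVAL, -ERANGE, -ENXIO, ... */
	*kaddr = dax.addr;	/* kernel virtual address of the memory */
	*pfn = dax.pfn;		/* page frame number for the same range */
	return len;		/* number of bytes accessible from dax.addr */
}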
@@ -590,7 +594,7 @@ void __init bdev_cache_init(void)
 
        bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
                        0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-                               SLAB_MEM_SPREAD|SLAB_PANIC),
+                               SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
                        init_once);
        err = register_filesystem(&bd_type);
        if (err)
@@ -696,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode)
        spin_lock(&bdev_lock);
        bdev = inode->i_bdev;
        if (bdev) {
-               ihold(bdev->bd_inode);
+               bdgrab(bdev);
                spin_unlock(&bdev_lock);
                return bdev;
        }
@@ -712,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode)
                         * So, we can access it via ->i_mapping always
                         * without igrab().
                         */
-                       ihold(bdev->bd_inode);
+                       bdgrab(bdev);
                        inode->i_bdev = bdev;
                        inode->i_mapping = bdev->bd_inode->i_mapping;
                        list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -735,7 +739,7 @@ void bd_forget(struct inode *inode)
        spin_unlock(&bdev_lock);
 
        if (bdev)
-               iput(bdev->bd_inode);
+               bdput(bdev);
 }
 
 /**
@@ -1138,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 {
        unsigned bsize = bdev_logical_block_size(bdev);
 
-       mutex_lock(&bdev->bd_inode->i_mutex);
+       inode_lock(bdev->bd_inode);
        i_size_write(bdev->bd_inode, size);
-       mutex_unlock(&bdev->bd_inode->i_mutex);
+       inode_unlock(bdev->bd_inode);
        while (bsize < PAGE_CACHE_SIZE) {
                if (size & bsize)
                        break;
@@ -1224,8 +1228,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                }
                        }
 
-                       if (!ret)
+                       if (!ret) {
                                bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+                               if (!blkdev_dax_capable(bdev))
+                                       bdev->bd_inode->i_flags &= ~S_DAX;
+                       }
 
                        /*
                         * If the device is invalidated, rescan partition
@@ -1239,6 +1246,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                else if (ret == -ENOMEDIUM)
                                        invalidate_partitions(disk, bdev);
                        }
+
                        if (ret)
                                goto out_clear;
                } else {
@@ -1259,12 +1267,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                goto out_clear;
                        }
                        bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-                       /*
-                        * If the partition is not aligned on a page
-                        * boundary, we can't do dax I/O to it.
-                        */
-                       if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
-                           (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+                       if (!blkdev_dax_capable(bdev))
                                bdev->bd_inode->i_flags &= ~S_DAX;
                }
        } else {
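The open-coded partition alignment test deleted here (and the matching whole-disk case earlier in this change) moves behind blkdev_dax_capable(). The sketch below only illustrates the kind of test such a helper performs, reconstructed from the removed lines plus an assumed check that the driver implements ->direct_access; it is not the actual implementation.

/* Illustrative only; not the real blkdev_dax_capable(). */
static bool example_dax_capable(struct block_device *bdev)
{
	/* assumed: DAX needs a driver that implements ->direct_access */
	if (!bdev->bd_disk->fops->direct_access)
		return false;

	/* from the removed check: partition must be page-aligned for DAX I/O */
	if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
	    (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
		return false;

	return true;
}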
@@ -1599,14 +1602,14 @@ EXPORT_SYMBOL(blkdev_put);
 
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
-       struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+       struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
        blkdev_put(bdev, filp->f_mode);
        return 0;
 }
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-       struct block_device *bdev = I_BDEV(file->f_mapping->host);
+       struct block_device *bdev = I_BDEV(bdev_file_inode(file));
        fmode_t mode = file->f_mode;
 
        /*
@@ -1631,7 +1634,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
        struct file *file = iocb->ki_filp;
-       struct inode *bd_inode = file->f_mapping->host;
+       struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        struct blk_plug plug;
        ssize_t ret;
@@ -1663,7 +1666,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
 ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
        struct file *file = iocb->ki_filp;
-       struct inode *bd_inode = file->f_mapping->host;
+       struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        loff_t pos = iocb->ki_pos;
 
@@ -1702,13 +1705,101 @@ static const struct address_space_operations def_blk_aops = {
        .is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
+#ifdef CONFIG_FS_DAX
+/*
+ * In the raw block case we do not need to contend with truncation nor
+ * unwritten file extents.  Without those concerns there is no need for
+ * additional locking beyond the mmap_sem context that these routines
+ * are already executing under.
+ *
+ * Note, there is no protection if the block device is dynamically
+ * resized (partition grow/shrink) during a fault. A stable block device
+ * size is already not enforced in the blkdev_direct_IO path.
+ *
+ * For DAX, it is the responsibility of the block device driver to
+ * ensure the whole-disk device size is stable while requests are in
+ * flight.
+ *
+ * Finally, unlike the filemap_page_mkwrite() case there is no
+ * filesystem superblock to sync against freezing.  We still include a
+ * pfn_mkwrite callback for dax drivers to receive write fault
+ * notifications.
+ */
+static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       return __dax_fault(vma, vmf, blkdev_get_block, NULL);
+}
+
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+               pmd_t *pmd, unsigned int flags)
+{
+       return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+}
+
+static void blkdev_vm_open(struct vm_area_struct *vma)
+{
+       struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+       struct block_device *bdev = I_BDEV(bd_inode);
+
+       inode_lock(bd_inode);
+       bdev->bd_map_count++;
+       inode_unlock(bd_inode);
+}
+
+static void blkdev_vm_close(struct vm_area_struct *vma)
+{
+       struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+       struct block_device *bdev = I_BDEV(bd_inode);
+
+       inode_lock(bd_inode);
+       bdev->bd_map_count--;
+       inode_unlock(bd_inode);
+}
+
+static const struct vm_operations_struct blkdev_dax_vm_ops = {
+       .open           = blkdev_vm_open,
+       .close          = blkdev_vm_close,
+       .fault          = blkdev_dax_fault,
+       .pmd_fault      = blkdev_dax_pmd_fault,
+       .pfn_mkwrite    = blkdev_dax_fault,
+};
+
+static const struct vm_operations_struct blkdev_default_vm_ops = {
+       .open           = blkdev_vm_open,
+       .close          = blkdev_vm_close,
+       .fault          = filemap_fault,
+       .map_pages      = filemap_map_pages,
+};
+
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct inode *bd_inode = bdev_file_inode(file);
+       struct block_device *bdev = I_BDEV(bd_inode);
+
+       file_accessed(file);
+       inode_lock(bd_inode);
+       bdev->bd_map_count++;
+       if (IS_DAX(bd_inode)) {
+               vma->vm_ops = &blkdev_dax_vm_ops;
+               vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+       } else {
+               vma->vm_ops = &blkdev_default_vm_ops;
+       }
+       inode_unlock(bd_inode);
+
+       return 0;
+}
+#else
+#define blkdev_mmap generic_file_mmap
+#endif
+
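The blkdev_mmap() path added above installs the DAX-aware vm_ops whenever the backing inode has S_DAX set, so faults on a shared mapping of the raw device resolve through __dax_fault()/__dax_pmd_fault() rather than the page cache, and bd_map_count tracks live mappings under the inode lock. A small userspace sketch of what this enables is below; the /dev/pmem0 device node is an assumption for illustration and error handling is abbreviated.

/* Hypothetical userspace consumer of a raw block device mapping. */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/pmem0", O_RDWR);	/* assumed pmem device node */
	char *p;

	if (fd < 0)
		return 1;

	/* MAP_SHARED mapping; served by blkdev_dax_vm_ops when IS_DAX() */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	memcpy(p, "hello", 5);		/* on the DAX path, stores hit device-backed pages */
	msync(p, 4096, MS_SYNC);	/* flush; still required for durability */

	munmap(p, 4096);
	close(fd);
	return 0;
}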
 const struct file_operations def_blk_fops = {
        .open           = blkdev_open,
        .release        = blkdev_close,
        .llseek         = block_llseek,
        .read_iter      = blkdev_read_iter,
        .write_iter     = blkdev_write_iter,
-       .mmap           = generic_file_mmap,
+       .mmap           = blkdev_mmap,
        .fsync          = blkdev_fsync,
        .unlocked_ioctl = block_ioctl,
 #ifdef CONFIG_COMPAT