git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'akpm' (updates from Andrew Morton)
authorLinus Torvalds <torvalds@linux-foundation.org>
Tue, 9 Jul 2013 20:33:36 +0000 (13:33 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 9 Jul 2013 20:33:36 +0000 (13:33 -0700)
Merge second patch-bomb from Andrew Morton:
 - misc fixes
 - audit stuff
 - fanotify/inotify/dnotify things
 - most of the rest of MM.  The new cache shrinker code from Glauber and
   Dave Chinner probably isn't quite stabilized yet.
 - ptrace
 - ipc
 - partitions
 - reboot cleanups
 - add LZ4 decompressor, use it for kernel compression

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  lib/scatterlist: error handling in __sg_alloc_table()
  scsi_debug: fix do_device_access() with wrap around range
  crypto: talitos: use sg_pcopy_to_buffer()
  lib/scatterlist: introduce sg_pcopy_from_buffer() and sg_pcopy_to_buffer()
  lib/scatterlist: factor out sg_miter_get_next_page() from sg_miter_next()
  crypto: add lz4 Cryptographic API
  lib: add lz4 compressor module
  arm: add support for LZ4-compressed kernel
  lib: add support for LZ4-compressed kernel
  decompressor: add LZ4 decompressor module
  lib: add weak clz/ctz functions
  reboot: move arch/x86 reboot= handling to generic kernel
  reboot: arm: change reboot_mode to use enum reboot_mode
  reboot: arm: prepare reboot_mode for moving to generic kernel code
  reboot: arm: remove unused restart_mode fields from some arm subarchs
  reboot: unicore32: prepare reboot_mode for moving to generic kernel code
  reboot: x86: prepare reboot_mode for moving to generic kernel code
  reboot: checkpatch.pl the new kernel/reboot.c file
  reboot: move shutdown/reboot related functions to kernel/reboot.c
  reboot: remove -stable friendly PF_THREAD_BOUND define
  ...

172 files changed:
MAINTAINERS
drivers/block/rbd.c
drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
drivers/staging/lustre/lustre/include/linux/lvfs.h
drivers/staging/lustre/lustre/include/lprocfs_status.h
drivers/staging/lustre/lustre/llite/dcache.c
drivers/staging/lustre/lustre/llite/llite_internal.h
drivers/staging/lustre/lustre/llite/llite_lib.c
drivers/staging/lustre/lustre/llite/namei.c
drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
fs/autofs4/expire.c
fs/autofs4/root.c
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/export.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-cache.h
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/lzo.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/qgroup.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/ulist.c
fs/btrfs/version.h [deleted file]
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/ceph/addr.c
fs/ceph/caps.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/locks.c
fs/ceph/mds_client.c
fs/ceph/mdsmap.c
fs/ceph/super.c
fs/ceph/super.h
fs/ceph/xattr.c
fs/coda/dir.c
fs/configfs/dir.c
fs/ecryptfs/inode.c
fs/ext3/fsync.c
fs/ext3/super.c
fs/f2fs/dir.c
fs/locks.c
fs/nfs/Kconfig
fs/nfs/Makefile
fs/nfs/blocklayout/blocklayout.c
fs/nfs/callback.c
fs/nfs/callback.h
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/client.c
fs/nfs/dir.c
fs/nfs/dns_resolve.c
fs/nfs/getroot.c
fs/nfs/idmap.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/mount_clnt.c
fs/nfs/namespace.c
fs/nfs/nfs3proc.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4file.c
fs/nfs/nfs4filelayout.c
fs/nfs/nfs4filelayout.h
fs/nfs/nfs4filelayoutdev.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4session.c
fs/nfs/nfs4session.h
fs/nfs/nfs4state.c
fs/nfs/nfs4super.c
fs/nfs/nfs4xdr.c
fs/nfs/objlayout/objlayout.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/proc.c
fs/nfs/super.c
fs/nfs/unlink.c
fs/nfsd/nfsd.h
fs/nilfs2/super.c
fs/quota/dquot.c
fs/seq_file.c
fs/xfs/Makefile
fs/xfs/xfs_alloc.c
fs/xfs/xfs_bmap_btree.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_buf_item.h
fs/xfs/xfs_dfrag.c
fs/xfs/xfs_dir2_leaf.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot.h
fs/xfs/xfs_fsops.c
fs/xfs/xfs_ialloc.c
fs/xfs/xfs_ialloc.h
fs/xfs/xfs_icache.c
fs/xfs/xfs_icache.h
fs/xfs/xfs_icreate_item.c [new file with mode: 0644]
fs/xfs/xfs_icreate_item.h [new file with mode: 0644]
fs/xfs/xfs_inode.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm.h
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_quota.h
fs/xfs/xfs_quotaops.c
fs/xfs/xfs_sb.h
fs/xfs/xfs_super.c
fs/xfs/xfs_symlink.c
fs/xfs/xfs_symlink.h
fs/xfs/xfs_sysctl.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_trans_dquot.c
fs/xfs/xfs_trans_inode.c
fs/xfs/xfs_vnodeops.c
include/linux/ceph/decode.h
include/linux/ceph/osd_client.h
include/linux/dcache.h
include/linux/fs.h
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/security.h
include/linux/seq_file.h
include/linux/sunrpc/sched.h
include/trace/events/9p.h
include/trace/events/btrfs.h
include/uapi/linux/btrfs.h
net/9p/client.c
net/ceph/auth_none.c
net/ceph/osd_client.c
net/sunrpc/clnt.c
net/sunrpc/rpc_pipe.c
net/sunrpc/sched.c
security/capability.c
security/security.c
security/selinux/hooks.c
security/selinux/include/security.h
security/selinux/ss/policydb.c
security/smack/smack_lsm.c

index 70cf679d3904d4b442fca0771d42d77fadb430a1..9623bc5a39d56da279fec7674582946b23db7d82 100644 (file)
@@ -180,6 +180,11 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git
 S:     Maintained
 F:     Documentation/filesystems/9p.txt
 F:     fs/9p/
+F:     net/9p/
+F:     include/net/9p/
+F:     include/uapi/linux/virtio_9p.h
+F:     include/trace/events/9p.h
+
 
 A8293 MEDIA DRIVER
 M:     Antti Palosaari <crope@iki.fi>
index aff789d6fccd35b7f0a0c3314e5c04951c6c774c..4ad2ad9a5bb01448d6d2206a3387575718c47b87 100644 (file)
@@ -372,7 +372,7 @@ enum rbd_dev_flags {
        RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
 };
 
-static DEFINE_MUTEX(ctl_mutex);          /* Serialize open/close/setup/teardown */
+static DEFINE_MUTEX(client_mutex);     /* Serialize client creation */
 
 static LIST_HEAD(rbd_dev_list);    /* devices */
 static DEFINE_SPINLOCK(rbd_dev_list_lock);
@@ -489,10 +489,8 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
        if (removing)
                return -ENOENT;
 
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        (void) get_device(&rbd_dev->dev);
        set_device_ro(bdev, rbd_dev->mapping.read_only);
-       mutex_unlock(&ctl_mutex);
 
        return 0;
 }
@@ -507,9 +505,7 @@ static void rbd_release(struct gendisk *disk, fmode_t mode)
        spin_unlock_irq(&rbd_dev->lock);
        rbd_assert(open_count_before > 0);
 
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        put_device(&rbd_dev->dev);
-       mutex_unlock(&ctl_mutex);
 }
 
 static const struct block_device_operations rbd_bd_ops = {
@@ -520,7 +516,7 @@ static const struct block_device_operations rbd_bd_ops = {
 
 /*
  * Initialize an rbd client instance.  Success or not, this function
- * consumes ceph_opts.
+ * consumes ceph_opts.  Caller holds client_mutex.
  */
 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 {
@@ -535,30 +531,25 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
        kref_init(&rbdc->kref);
        INIT_LIST_HEAD(&rbdc->node);
 
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
        rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
        if (IS_ERR(rbdc->client))
-               goto out_mutex;
+               goto out_rbdc;
        ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 
        ret = ceph_open_session(rbdc->client);
        if (ret < 0)
-               goto out_err;
+               goto out_client;
 
        spin_lock(&rbd_client_list_lock);
        list_add_tail(&rbdc->node, &rbd_client_list);
        spin_unlock(&rbd_client_list_lock);
 
-       mutex_unlock(&ctl_mutex);
        dout("%s: rbdc %p\n", __func__, rbdc);
 
        return rbdc;
-
-out_err:
+out_client:
        ceph_destroy_client(rbdc->client);
-out_mutex:
-       mutex_unlock(&ctl_mutex);
+out_rbdc:
        kfree(rbdc);
 out_opt:
        if (ceph_opts)
@@ -682,11 +673,13 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 {
        struct rbd_client *rbdc;
 
+       mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
        rbdc = rbd_client_find(ceph_opts);
        if (rbdc)       /* using an existing client */
                ceph_destroy_options(ceph_opts);
        else
                rbdc = rbd_client_create(ceph_opts);
+       mutex_unlock(&client_mutex);
 
        return rbdc;
 }
@@ -840,7 +833,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
 
        /* We won't fail any more, fill in the header */
 
-       down_write(&rbd_dev->header_rwsem);
        if (first_time) {
                header->object_prefix = object_prefix;
                header->obj_order = ondisk->options.order;
@@ -869,8 +861,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
                if (rbd_dev->mapping.size != header->image_size)
                        rbd_dev->mapping.size = header->image_size;
 
-       up_write(&rbd_dev->header_rwsem);
-
        return 0;
 out_2big:
        ret = -EIO;
@@ -1126,6 +1116,7 @@ static void zero_bio_chain(struct bio *chain, int start_ofs)
                                buf = bvec_kmap_irq(bv, &flags);
                                memset(buf + remainder, 0,
                                       bv->bv_len - remainder);
+                               flush_dcache_page(bv->bv_page);
                                bvec_kunmap_irq(buf, &flags);
                        }
                        pos += bv->bv_len;
@@ -1153,11 +1144,12 @@ static void zero_pages(struct page **pages, u64 offset, u64 end)
                unsigned long flags;
                void *kaddr;
 
-               page_offset = (size_t)(offset & ~PAGE_MASK);
-               length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
+               page_offset = offset & ~PAGE_MASK;
+               length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
                local_irq_save(flags);
                kaddr = kmap_atomic(*page);
                memset(kaddr + page_offset, 0, length);
+               flush_dcache_page(*page);
                kunmap_atomic(kaddr);
                local_irq_restore(flags);
 
@@ -2171,9 +2163,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
        struct rbd_obj_request *obj_request = NULL;
        struct rbd_obj_request *next_obj_request;
        bool write_request = img_request_write_test(img_request);
-       struct bio *bio_list;
+       struct bio *bio_list = 0;
        unsigned int bio_offset = 0;
-       struct page **pages;
+       struct page **pages = 0;
        u64 img_offset;
        u64 resid;
        u16 opcode;
@@ -2535,6 +2527,7 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
         */
        orig_request = obj_request->obj_request;
        obj_request->obj_request = NULL;
+       rbd_obj_request_put(orig_request);
        rbd_assert(orig_request);
        rbd_assert(orig_request->img_request);
 
@@ -2555,7 +2548,6 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
        if (!rbd_dev->parent_overlap) {
                struct ceph_osd_client *osdc;
 
-               rbd_obj_request_put(orig_request);
                osdc = &rbd_dev->rbd_client->client->osdc;
                result = rbd_obj_request_submit(osdc, orig_request);
                if (!result)
@@ -2585,7 +2577,6 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
 out:
        if (orig_request->result)
                rbd_obj_request_complete(orig_request);
-       rbd_obj_request_put(orig_request);
 }
 
 static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
@@ -2859,7 +2850,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
                (unsigned int)opcode);
        ret = rbd_dev_refresh(rbd_dev);
        if (ret)
-               rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
+               rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
 
        rbd_obj_notify_ack(rbd_dev, notify_id);
 }
@@ -3339,8 +3330,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
        int ret;
 
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+       down_write(&rbd_dev->header_rwsem);
        mapping_size = rbd_dev->mapping.size;
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        if (rbd_dev->image_format == 1)
                ret = rbd_dev_v1_header_info(rbd_dev);
        else
@@ -3349,7 +3340,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
        /* If it's a mapped snapshot, validate its EXISTS flag */
 
        rbd_exists_validate(rbd_dev);
-       mutex_unlock(&ctl_mutex);
+       up_write(&rbd_dev->header_rwsem);
+
        if (mapping_size != rbd_dev->mapping.size) {
                sector_t size;
 
@@ -3813,6 +3805,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
        void *end;
        u64 pool_id;
        char *image_id;
+       u64 snap_id;
        u64 overlap;
        int ret;
 
@@ -3872,24 +3865,56 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
                        (unsigned long long)pool_id, U32_MAX);
                goto out_err;
        }
-       parent_spec->pool_id = pool_id;
 
        image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
        if (IS_ERR(image_id)) {
                ret = PTR_ERR(image_id);
                goto out_err;
        }
-       parent_spec->image_id = image_id;
-       ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
+       ceph_decode_64_safe(&p, end, snap_id, out_err);
        ceph_decode_64_safe(&p, end, overlap, out_err);
 
-       if (overlap) {
-               rbd_spec_put(rbd_dev->parent_spec);
+       /*
+        * The parent won't change (except when the clone is
+        * flattened, already handled that).  So we only need to
+        * record the parent spec we have not already done so.
+        */
+       if (!rbd_dev->parent_spec) {
+               parent_spec->pool_id = pool_id;
+               parent_spec->image_id = image_id;
+               parent_spec->snap_id = snap_id;
                rbd_dev->parent_spec = parent_spec;
                parent_spec = NULL;     /* rbd_dev now owns this */
-               rbd_dev->parent_overlap = overlap;
-       } else {
-               rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0\n");
+       }
+
+       /*
+        * We always update the parent overlap.  If it's zero we
+        * treat it specially.
+        */
+       rbd_dev->parent_overlap = overlap;
+       smp_mb();
+       if (!overlap) {
+
+               /* A null parent_spec indicates it's the initial probe */
+
+               if (parent_spec) {
+                       /*
+                        * The overlap has become zero, so the clone
+                        * must have been resized down to 0 at some
+                        * point.  Treat this the same as a flatten.
+                        */
+                       rbd_dev_parent_put(rbd_dev);
+                       pr_info("%s: clone image now standalone\n",
+                               rbd_dev->disk->disk_name);
+               } else {
+                       /*
+                        * For the initial probe, if we find the
+                        * overlap is zero we just pretend there was
+                        * no parent image.
+                        */
+                       rbd_warn(rbd_dev, "ignoring parent of "
+                                               "clone with overlap 0\n");
+               }
        }
 out:
        ret = 0;
@@ -4245,16 +4270,14 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
        bool first_time = rbd_dev->header.object_prefix == NULL;
        int ret;
 
-       down_write(&rbd_dev->header_rwsem);
-
        ret = rbd_dev_v2_image_size(rbd_dev);
        if (ret)
-               goto out;
+               return ret;
 
        if (first_time) {
                ret = rbd_dev_v2_header_onetime(rbd_dev);
                if (ret)
-                       goto out;
+                       return ret;
        }
 
        /*
@@ -4269,7 +4292,7 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
 
                ret = rbd_dev_v2_parent_info(rbd_dev);
                if (ret)
-                       goto out;
+                       return ret;
 
                /*
                 * Print a warning if this is the initial probe and
@@ -4290,8 +4313,6 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
 
        ret = rbd_dev_v2_snap_context(rbd_dev);
        dout("rbd_dev_v2_snap_context returned %d\n", ret);
-out:
-       up_write(&rbd_dev->header_rwsem);
 
        return ret;
 }
@@ -4301,8 +4322,6 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
        struct device *dev;
        int ret;
 
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
        dev = &rbd_dev->dev;
        dev->bus = &rbd_bus_type;
        dev->type = &rbd_device_type;
@@ -4311,8 +4330,6 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
        dev_set_name(dev, "%d", rbd_dev->dev_id);
        ret = device_register(dev);
 
-       mutex_unlock(&ctl_mutex);
-
        return ret;
 }
 
@@ -5059,23 +5076,6 @@ err_out_module:
        return (ssize_t)rc;
 }
 
-static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
-{
-       struct list_head *tmp;
-       struct rbd_device *rbd_dev;
-
-       spin_lock(&rbd_dev_list_lock);
-       list_for_each(tmp, &rbd_dev_list) {
-               rbd_dev = list_entry(tmp, struct rbd_device, node);
-               if (rbd_dev->dev_id == dev_id) {
-                       spin_unlock(&rbd_dev_list_lock);
-                       return rbd_dev;
-               }
-       }
-       spin_unlock(&rbd_dev_list_lock);
-       return NULL;
-}
-
 static void rbd_dev_device_release(struct device *dev)
 {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
@@ -5120,8 +5120,10 @@ static ssize_t rbd_remove(struct bus_type *bus,
                          size_t count)
 {
        struct rbd_device *rbd_dev = NULL;
-       int target_id;
+       struct list_head *tmp;
+       int dev_id;
        unsigned long ul;
+       bool already = false;
        int ret;
 
        ret = strict_strtoul(buf, 10, &ul);
@@ -5129,37 +5131,40 @@ static ssize_t rbd_remove(struct bus_type *bus,
                return ret;
 
        /* convert to int; abort if we lost anything in the conversion */
-       target_id = (int) ul;
-       if (target_id != ul)
+       dev_id = (int)ul;
+       if (dev_id != ul)
                return -EINVAL;
 
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-       rbd_dev = __rbd_get_dev(target_id);
-       if (!rbd_dev) {
-               ret = -ENOENT;
-               goto done;
+       ret = -ENOENT;
+       spin_lock(&rbd_dev_list_lock);
+       list_for_each(tmp, &rbd_dev_list) {
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               if (rbd_dev->dev_id == dev_id) {
+                       ret = 0;
+                       break;
+               }
+       }
+       if (!ret) {
+               spin_lock_irq(&rbd_dev->lock);
+               if (rbd_dev->open_count)
+                       ret = -EBUSY;
+               else
+                       already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
+                                                       &rbd_dev->flags);
+               spin_unlock_irq(&rbd_dev->lock);
        }
+       spin_unlock(&rbd_dev_list_lock);
+       if (ret < 0 || already)
+               return ret;
 
-       spin_lock_irq(&rbd_dev->lock);
-       if (rbd_dev->open_count)
-               ret = -EBUSY;
-       else
-               set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
-       spin_unlock_irq(&rbd_dev->lock);
-       if (ret < 0)
-               goto done;
        rbd_bus_del_dev(rbd_dev);
        ret = rbd_dev_header_watch_sync(rbd_dev, false);
        if (ret)
                rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
        rbd_dev_image_release(rbd_dev);
        module_put(THIS_MODULE);
-       ret = count;
-done:
-       mutex_unlock(&ctl_mutex);
 
-       return ret;
+       return count;
 }
 
 /*
@@ -5267,6 +5272,7 @@ static void __exit rbd_exit(void)
 module_init(rbd_init);
 module_exit(rbd_exit);
 
+MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
 MODULE_DESCRIPTION("rados block device");
index f0508084e8c554dcfecbf9d57712ed6bf18d2e35..a8e9c0c8ffd235d0965ef2cf1cd76cdd0bb8801e 100644 (file)
@@ -60,8 +60,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
        ll_delete_from_page_cache(page);
 }
 
-#  define d_refcount(d)                 ((d)->d_count)
-
 #ifdef ATTR_OPEN
 # define ATTR_FROM_OPEN ATTR_OPEN
 #else
index b4db6cb581bdb855912f44c40ed55fec3f8e4ebd..eb59ac7d5946b3dda15b3f909ad88833c2ccbbfd 100644 (file)
@@ -99,7 +99,7 @@ static inline void l_dput(struct dentry *de)
        if (!de || IS_ERR(de))
                return;
        //shrink_dcache_parent(de);
-       LASSERT(d_refcount(de) > 0);
+       LASSERT(d_count(de) > 0);
        dput(de);
 }
 
index e770d0260576a95af516511d90fdc62c94170c0b..55f182205d7850618bb8d9fd1a0f18cff0e7f6ee 100644 (file)
@@ -53,7 +53,7 @@ struct lprocfs_vars {
        /**
         * /proc file mode.
         */
-       mode_t                  proc_mode;
+       umode_t                 proc_mode;
 };
 
 struct lprocfs_static_vars {
@@ -600,11 +600,11 @@ extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
 extern int lprocfs_obd_cleanup(struct obd_device *obd);
 
 extern int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name,
-                             mode_t mode,
+                             umode_t mode,
                              const struct file_operations *seq_fops,
                              void *data);
 extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
-                                 mode_t mode,
+                                 umode_t mode,
                                  const struct file_operations *seq_fops,
                                  void *data);
 
index 7d6abfff9740d36da38e3751652b083e05bb23cd..ff0d085077c845627bc42d8b9714459f134af633 100644 (file)
@@ -98,7 +98,7 @@ int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
 
        CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n",
               name->len, name->name, dentry, dentry->d_flags,
-              d_refcount(dentry));
+              d_count(dentry));
 
        /* mountpoint is always valid */
        if (d_mountpoint((struct dentry *)dentry))
@@ -165,7 +165,7 @@ static int ll_ddelete(const struct dentry *de)
               list_empty(&de->d_subdirs) ? "" : "subdirs");
 
        /* kernel >= 2.6.38 last refcount is decreased after this function. */
-       LASSERT(d_refcount(de) == 1);
+       LASSERT(d_count(de) == 1);
 
        /* Disable this piece of code temproarily because this is called
         * inside dcache_lock so it's not appropriate to do lots of work
@@ -190,7 +190,7 @@ static int ll_set_dd(struct dentry *de)
 
        CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n",
                de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
-               d_refcount(de));
+               d_count(de));
 
        if (de->d_fsdata == NULL) {
                struct ll_dentry_data *lld;
@@ -540,7 +540,7 @@ out:
                CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
                       "inode %p refc %d\n", de->d_name.len,
                       de->d_name.name, de, de->d_parent, de->d_inode,
-                      d_refcount(de));
+                      d_count(de));
 
                ll_set_lock_data(exp, de->d_inode, it, &bits);
 
index 992cd203ca1a1121c8f42f65479588b42015c255..5227c5c4ebe21845905104d1f99bdc2ced46f6d1 100644 (file)
@@ -1529,12 +1529,12 @@ static inline void d_lustre_invalidate(struct dentry *dentry, int nested)
 {
        CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p "
               "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry,
-              dentry->d_parent, dentry->d_inode, d_refcount(dentry));
+              dentry->d_parent, dentry->d_inode, d_count(dentry));
 
        spin_lock_nested(&dentry->d_lock,
                         nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL);
        __d_lustre_invalidate(dentry);
-       if (d_refcount(dentry) == 0)
+       if (d_count(dentry) == 0)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
 }
index 2311b20ee99a29f74d5c474703229fa0c6f135f3..afae8010623d5ddbd90e21aac9ecb9a22453e58e 100644 (file)
@@ -659,7 +659,7 @@ void lustre_dump_dentry(struct dentry *dentry, int recur)
               " flags=0x%x, fsdata=%p, %d subdirs\n", dentry,
               dentry->d_name.len, dentry->d_name.name,
               dentry->d_parent->d_name.len, dentry->d_parent->d_name.name,
-              dentry->d_parent, dentry->d_inode, d_refcount(dentry),
+              dentry->d_parent, dentry->d_inode, d_count(dentry),
               dentry->d_flags, dentry->d_fsdata, subdirs);
        if (dentry->d_inode != NULL)
                ll_dump_inode(dentry->d_inode);
index 58d59aa126191cea965a6338428ada342448e3af..ff8f63de5612a03f7baae80efcb252ab865719ce 100644 (file)
@@ -409,7 +409,7 @@ struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de)
                        iput(inode);
                        CDEBUG(D_DENTRY,
                               "Reuse dentry %p inode %p refc %d flags %#x\n",
-                             new, new->d_inode, d_refcount(new), new->d_flags);
+                             new, new->d_inode, d_count(new), new->d_flags);
                        return new;
                }
        }
@@ -417,7 +417,7 @@ struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de)
        __d_lustre_invalidate(de);
        d_add(de, inode);
        CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n",
-              de, de->d_inode, d_refcount(de), de->d_flags);
+              de, de->d_inode, d_count(de), de->d_flags);
        return de;
 }
 
index 1e6f32c3549b321e56d0bc154539932dec5be6d3..e70d8fe99888fd82c44c94eea26d3c2226af2c3b 100644 (file)
@@ -121,8 +121,8 @@ void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
        OBD_SET_CTXT_MAGIC(save);
 
        save->fs = get_fs();
-       LASSERT(d_refcount(cfs_fs_pwd(current->fs)));
-       LASSERT(d_refcount(new_ctx->pwd));
+       LASSERT(d_count(cfs_fs_pwd(current->fs)));
+       LASSERT(d_count(new_ctx->pwd));
        save->pwd = dget(cfs_fs_pwd(current->fs));
        save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
        save->luc.luc_umask = current_umask();
index 3b157f89c3008912d9c94248b0c030485c1aea1f..f7af3d6a4efcbf52f1cba02c6ad717c6efdbb879 100644 (file)
@@ -73,7 +73,7 @@ proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
                                     struct file_operations *fops)
 {
        proc_dir_entry_t *proc;
-       mode_t mode = 0;
+       umode_t mode = 0;
 
        if (root == NULL || name == NULL || fops == NULL)
                return ERR_PTR(-EINVAL);
@@ -140,7 +140,7 @@ int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
 
        while (list->name != NULL) {
                struct proc_dir_entry *proc;
-               mode_t mode = 0;
+               umode_t mode = 0;
 
                if (list->proc_mode != 0000) {
                        mode = list->proc_mode;
@@ -1899,7 +1899,7 @@ EXPORT_SYMBOL(lprocfs_find_named_value);
 
 int lprocfs_seq_create(proc_dir_entry_t *parent,
                       const char *name,
-                      mode_t mode,
+                      umode_t mode,
                       const struct file_operations *seq_fops,
                       void *data)
 {
@@ -1919,7 +1919,7 @@ EXPORT_SYMBOL(lprocfs_seq_create);
 
 int lprocfs_obd_seq_create(struct obd_device *dev,
                           const char *name,
-                          mode_t mode,
+                          umode_t mode,
                           const struct file_operations *seq_fops,
                           void *data)
 {
index 13ddec92341cdaaa63cf811ee1037a2038994191..3d9d3f5d5dda688bcd9bfc666210553ffa73be0d 100644 (file)
@@ -109,7 +109,7 @@ cont:
 
        spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
        /* Already gone or negative dentry (under construction) - try next */
-       if (q->d_count == 0 || !simple_positive(q)) {
+       if (!d_count(q) || !simple_positive(q)) {
                spin_unlock(&q->d_lock);
                next = q->d_u.d_child.next;
                goto cont;
@@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
                        else
                                ino_count++;
 
-                       if (p->d_count > ino_count) {
+                       if (d_count(p) > ino_count) {
                                top_ino->last_used = jiffies;
                                dput(p);
                                return 1;
@@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                if (!exp_leaves) {
                        /* Path walk currently on this dentry? */
                        ino_count = atomic_read(&ino->count) + 1;
-                       if (dentry->d_count > ino_count)
+                       if (d_count(dentry) > ino_count)
                                goto next;
 
                        if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                } else {
                        /* Path walk currently on this dentry? */
                        ino_count = atomic_read(&ino->count) + 1;
-                       if (dentry->d_count > ino_count)
+                       if (d_count(dentry) > ino_count)
                                goto next;
 
                        expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
index ca8e55548d9893eebf82e4261e79dc84c3b06409..92ef341ba0cf35c1bd1b5f001b9b39068979f6ff 100644 (file)
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
                spin_lock(&active->d_lock);
 
                /* Already gone? */
-               if (active->d_count == 0)
+               if (!d_count(active))
                        goto next;
 
                qstr = &active->d_name;
index 290e347b6db3f925f414fd9be4e6ea394da6f887..eaf133384a8f97497098ab82cc94a67862116705 100644 (file)
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
  * to a logical address
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
-                                       int search_commit_root,
-                                       u64 time_seq,
-                                       struct __prelim_ref *ref,
-                                       struct ulist *parents,
-                                       const u64 *extent_item_pos)
+                                 struct btrfs_path *path, u64 time_seq,
+                                 struct __prelim_ref *ref,
+                                 struct ulist *parents,
+                                 const u64 *extent_item_pos)
 {
-       struct btrfs_path *path;
        struct btrfs_root *root;
        struct btrfs_key root_key;
        struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
        int root_level;
        int level = ref->level;
 
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       path->search_commit_root = !!search_commit_root;
-
        root_key.objectid = ref->root_id;
        root_key.type = BTRFS_ROOT_ITEM_KEY;
        root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                                time_seq, ref->wanted_disk_byte,
                                extent_item_pos);
 out:
-       btrfs_free_path(path);
+       path->lowest_level = 0;
+       btrfs_release_path(path);
        return ret;
 }
 
@@ -322,7 +316,7 @@ out:
  * resolve all indirect backrefs from the list
  */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-                                  int search_commit_root, u64 time_seq,
+                                  struct btrfs_path *path, u64 time_seq,
                                   struct list_head *head,
                                   const u64 *extent_item_pos)
 {
@@ -349,9 +343,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                        continue;
                if (ref->count == 0)
                        continue;
-               err = __resolve_indirect_ref(fs_info, search_commit_root,
-                                            time_seq, ref, parents,
-                                            extent_item_pos);
+               err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+                                            parents, extent_item_pos);
                if (err == -ENOMEM)
                        goto out;
                if (err)
@@ -604,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_key key;
+       struct btrfs_key found_key;
        unsigned long ptr;
        unsigned long end;
        struct btrfs_extent_item *ei;
@@ -621,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);
+       btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;
 
-       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+       if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+           flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                struct btrfs_tree_block_info *info;
 
                info = (struct btrfs_tree_block_info *)ptr;
                *info_level = btrfs_tree_block_level(leaf, info);
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
+       } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+               *info_level = found_key.offset;
        } else {
                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
        }
@@ -795,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_head *head;
        int info_level = 0;
        int ret;
-       int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
        struct list_head prefs_delayed;
        struct list_head prefs;
        struct __prelim_ref *ref;
@@ -804,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
        INIT_LIST_HEAD(&prefs_delayed);
 
        key.objectid = bytenr;
-       key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = (u64)-1;
+       if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+               key.type = BTRFS_METADATA_ITEM_KEY;
+       else
+               key.type = BTRFS_EXTENT_ITEM_KEY;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-       path->search_commit_root = !!search_commit_root;
+       if (!trans)
+               path->search_commit_root = 1;
 
        /*
         * grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +826,7 @@ again:
                goto out;
        BUG_ON(ret == 0);
 
-       if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+       if (trans) {
                /*
                 * look if there are updates for this ref queued and lock the
                 * head
@@ -869,7 +870,8 @@ again:
                slot = path->slots[0];
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid == bytenr &&
-                   key.type == BTRFS_EXTENT_ITEM_KEY) {
+                   (key.type == BTRFS_EXTENT_ITEM_KEY ||
+                    key.type == BTRFS_METADATA_ITEM_KEY)) {
                        ret = __add_inline_refs(fs_info, path, bytenr,
                                                &info_level, &prefs);
                        if (ret)
@@ -890,8 +892,8 @@ again:
 
        __merge_refs(&prefs, 1);
 
-       ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
-                                     &prefs, extent_item_pos);
+       ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+                                     extent_item_pos);
        if (ret)
                goto out;
 
@@ -1283,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 {
        int ret;
        u64 flags;
+       u64 size = 0;
        u32 item_size;
        struct extent_buffer *eb;
        struct btrfs_extent_item *ei;
        struct btrfs_key key;
 
-       key.type = BTRFS_EXTENT_ITEM_KEY;
+       if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+               key.type = BTRFS_METADATA_ITEM_KEY;
+       else
+               key.type = BTRFS_EXTENT_ITEM_KEY;
        key.objectid = logical;
        key.offset = (u64)-1;
 
@@ -1301,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
                return ret;
 
        btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
-       if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+       if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+               size = fs_info->extent_root->leafsize;
+       else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+               size = found_key->offset;
+
+       if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+            found_key->type != BTRFS_METADATA_ITEM_KEY) ||
            found_key->objectid > logical ||
-           found_key->objectid + found_key->offset <= logical) {
+           found_key->objectid + size <= logical) {
                pr_debug("logical %llu is not within any extent\n",
                         (unsigned long long)logical);
                return -ENOENT;
@@ -1459,7 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
        int ret;
-       struct btrfs_trans_handle *trans;
+       struct btrfs_trans_handle *trans = NULL;
        struct ulist *refs = NULL;
        struct ulist *roots = NULL;
        struct ulist_node *ref_node = NULL;
@@ -1471,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
        pr_debug("resolving all inodes for extent %llu\n",
                        extent_item_objectid);
 
-       if (search_commit_root) {
-               trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
-       } else {
+       if (!search_commit_root) {
                trans = btrfs_join_transaction(fs_info->extent_root);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
index 0f446d7ca2c0d62b85e560e93574cb44efffc655..8f2e767029322d2d3d0ac2baf996c3964e999cdf 100644 (file)
@@ -23,8 +23,6 @@
 #include "ulist.h"
 #include "extent_io.h"
 
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
 struct inode_fs_paths {
        struct btrfs_path               *btrfs_path;
        struct btrfs_root               *fs_root;
index 17dffe33e8d0d570dd64eb5717e3f2d69f1db7fc..5bf4c39e2ad625f2e90bfbbc765fd4e13db47a2b 100644 (file)
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
-               tree_mod_log_free_eb(root->fs_info, buf);
+               if (last_ref)
+                       tree_mod_log_free_eb(root->fs_info, buf);
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
  * time_seq).
  */
 static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
-                     struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+                     u64 time_seq, struct tree_mod_elem *first_tm)
 {
        u32 n;
        struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
        unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
        n = btrfs_header_nritems(eb);
+       tree_mod_log_read_lock(fs_info);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
                if (tm->index != first_tm->index)
                        break;
        }
+       tree_mod_log_read_unlock(fs_info);
        btrfs_set_header_nritems(eb, n);
 }
 
@@ -1274,7 +1277,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 
        extent_buffer_get(eb_rewin);
        btrfs_tree_read_lock(eb_rewin);
-       __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+       __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
@@ -1350,7 +1353,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
                btrfs_set_header_generation(eb, old_generation);
        }
        if (tm)
-               __tree_mod_log_rewind(eb, time_seq, tm);
+               __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
        }
 }
 
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
-                                     struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+                                      struct btrfs_path *path, int level)
 {
        int slot;
        int nritems;
@@ -2192,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        u64 gen;
        u64 block1 = 0;
        u64 block2 = 0;
-       int ret = 0;
        int blocksize;
 
        parent = path->nodes[level + 1];
        if (!parent)
-               return 0;
+               return;
 
        nritems = btrfs_header_nritems(parent);
        slot = path->slots[level + 1];
@@ -2224,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                        block2 = 0;
                free_extent_buffer(eb);
        }
-       if (block1 || block2) {
-               ret = -EAGAIN;
-
-               /* release the whole path */
-               btrfs_release_path(path);
-
-               /* read the blocks */
-               if (block1)
-                       readahead_tree_block(root, block1, blocksize, 0);
-               if (block2)
-                       readahead_tree_block(root, block2, blocksize, 0);
 
-               if (block1) {
-                       eb = read_tree_block(root, block1, blocksize, 0);
-                       free_extent_buffer(eb);
-               }
-               if (block2) {
-                       eb = read_tree_block(root, block2, blocksize, 0);
-                       free_extent_buffer(eb);
-               }
-       }
-       return ret;
+       if (block1)
+               readahead_tree_block(root, block1, blocksize, 0);
+       if (block2)
+               readahead_tree_block(root, block2, blocksize, 0);
 }
 
 
@@ -2359,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        tmp = btrfs_find_tree_block(root, blocknr, blocksize);
        if (tmp) {
                /* first we do an atomic uptodate check */
-               if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
-                       if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-                               /*
-                                * we found an up to date block without
-                                * sleeping, return
-                                * right away
-                                */
-                               *eb_ret = tmp;
-                               return 0;
-                       }
-                       /* the pages were up to date, but we failed
-                        * the generation number check.  Do a full
-                        * read for the generation number that is correct.
-                        * We must do this without dropping locks so
-                        * we can trust our generation number
-                        */
-                       free_extent_buffer(tmp);
-                       btrfs_set_path_blocking(p);
+               if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+                       *eb_ret = tmp;
+                       return 0;
+               }
 
-                       /* now we're allowed to do a blocking uptodate check */
-                       tmp = read_tree_block(root, blocknr, blocksize, gen);
-                       if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
-                               *eb_ret = tmp;
-                               return 0;
-                       }
-                       free_extent_buffer(tmp);
-                       btrfs_release_path(p);
-                       return -EIO;
+               /* the pages were up to date, but we failed
+                * the generation number check.  Do a full
+                * read for the generation number that is correct.
+                * We must do this without dropping locks so
+                * we can trust our generation number
+                */
+               btrfs_set_path_blocking(p);
+
+               /* now we're allowed to do a blocking uptodate check */
+               ret = btrfs_read_buffer(tmp, gen);
+               if (!ret) {
+                       *eb_ret = tmp;
+                       return 0;
                }
+               free_extent_buffer(tmp);
+               btrfs_release_path(p);
+               return -EIO;
        }
 
        /*
@@ -2448,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
                        goto again;
                }
 
-               sret = reada_for_balance(root, p, level);
-               if (sret)
-                       goto again;
-
                btrfs_set_path_blocking(p);
+               reada_for_balance(root, p, level);
                sret = split_node(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2472,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
                        goto again;
                }
 
-               sret = reada_for_balance(root, p, level);
-               if (sret)
-                       goto again;
-
                btrfs_set_path_blocking(p);
+               reada_for_balance(root, p, level);
                sret = balance_level(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -3143,7 +3111,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
  */
 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
-                          struct btrfs_path *path, int level, int log_removal)
+                          struct btrfs_path *path, int level)
 {
        u64 lower_gen;
        struct extent_buffer *lower;
@@ -3194,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(c);
 
        old = root->node;
-       tree_mod_log_set_root_pointer(root, c, log_removal);
+       tree_mod_log_set_root_pointer(root, c, 0);
        rcu_assign_pointer(root->node, c);
 
        /* the super has an extra ref to root->node */
@@ -3278,14 +3246,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
                /*
                 * trying to split the root, lets make a new one
                 *
-                * tree mod log: We pass 0 as log_removal parameter to
+                * tree mod log: We don't log_removal old root in
                 * insert_new_root, because that root buffer will be kept as a
                 * normal node. We are going to log removal of half of the
                 * elements below with tree_mod_log_eb_copy. We're holding a
                 * tree lock on the buffer, which is why we cannot race with
                 * other tree_mod_log users.
                 */
-               ret = insert_new_root(trans, root, path, level + 1, 0);
+               ret = insert_new_root(trans, root, path, level + 1);
                if (ret)
                        return ret;
        } else {
@@ -3986,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
                return -EOVERFLOW;
 
        /* first try to make some room by pushing left and right */
-       if (data_size) {
+       if (data_size && path->nodes[1]) {
                wret = push_leaf_right(trans, root, path, data_size,
                                       data_size, 0, 0);
                if (wret < 0)
@@ -4005,7 +3973,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        }
 
        if (!path->nodes[1]) {
-               ret = insert_new_root(trans, root, path, 1, 1);
+               ret = insert_new_root(trans, root, path, 1);
                if (ret)
                        return ret;
        }
index d6dd49b51ba8dfe27069a139ab42cfd88e58c117..e795bf135e809fa473190e0169edf21ef7acfb2d 100644 (file)
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1                (1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7)
-#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)
+#define BTRFS_BLOCK_GROUP_RAID5         (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6         (1ULL << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED     BTRFS_AVAIL_ALLOC_BIT_SINGLE
 
 enum btrfs_raid_types {
@@ -1101,6 +1101,18 @@ struct btrfs_space_info {
        u64 disk_total;         /* total bytes on disk, takes mirrors into
                                   account */
 
+       /*
+        * bytes_pinned is kept in line with what is actually pinned, as in
+        * we've called update_block_group and dropped the bytes_used counter
+        * and increased the bytes_pinned counter.  However this means that
+        * bytes_pinned does not reflect the bytes that will be pinned once the
+        * delayed refs are flushed, so this counter is inc'ed everytime we call
+        * btrfs_free_extent so it is a realtime count of what will be freed
+        * once the transaction is committed.  It will be zero'ed everytime the
+        * transaction commits.
+        */
+       struct percpu_counter total_bytes_pinned;
+
        /*
         * we bump reservation progress every time we decrement
         * bytes_reserved.  This way people waiting for reservations
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
        atomic_t open_ioctl_trans;
 
        /*
-        * this is used by the balancing code to wait for all the pending
-        * ordered extents
+        * this is used to protect the following list -- ordered_roots.
         */
-       spinlock_t ordered_extent_lock;
+       spinlock_t ordered_root_lock;
 
        /*
-        * all of the data=ordered extents pending writeback
+        * all fs/file tree roots in which there are data=ordered extents
+        * pending writeback are added into this list.
+        *
         * these can span multiple transactions and basically include
         * every dirty data page that isn't from nodatacow
         */
-       struct list_head ordered_extents;
+       struct list_head ordered_roots;
 
-       spinlock_t delalloc_lock;
-       /*
-        * all of the inodes that have delalloc bytes.  It is possible for
-        * this list to be empty even when there is still dirty data=ordered
-        * extents waiting to finish IO.
-        */
-       struct list_head delalloc_inodes;
+       spinlock_t delalloc_root_lock;
+       /* all fs/file tree roots that have delalloc inodes. */
+       struct list_head delalloc_roots;
 
        /*
         * there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
        int do_barriers;
        int closing;
        int log_root_recovering;
-       int enospc_unlink;
-       int trans_no_join;
 
        u64 total_pinned;
 
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
        struct rb_root qgroup_tree;
        spinlock_t qgroup_lock;
 
+       /*
+        * used to avoid frequently calling ulist_alloc()/ulist_free()
+        * when doing qgroup accounting, it must be protected by qgroup_lock.
+        */
+       struct ulist *qgroup_ulist;
+
        /* protect user change for quota operations */
        struct mutex qgroup_ioctl_lock;
 
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
        struct mutex qgroup_rescan_lock; /* protects the progress item */
        struct btrfs_key qgroup_rescan_progress;
        struct btrfs_workers qgroup_rescan_workers;
+       struct completion qgroup_rescan_completion;
+       struct btrfs_work qgroup_rescan_work;
 
        /* filesystem state */
        unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
        int force_cow;
 
        spinlock_t root_item_lock;
+       atomic_t refs;
+
+       spinlock_t delalloc_lock;
+       /*
+        * all of the inodes that have delalloc bytes.  It is possible for
+        * this list to be empty even when there is still dirty data=ordered
+        * extents waiting to finish IO.
+        */
+       struct list_head delalloc_inodes;
+       struct list_head delalloc_root;
+       u64 nr_delalloc_inodes;
+       /*
+        * this is used by the balancing code to wait for all the pending
+        * ordered extents
+        */
+       spinlock_t ordered_extent_lock;
+
+       /*
+        * all of the data=ordered extents pending writeback
+        * these can span multiple transactions and basically include
+        * every dirty data page that isn't from nodatacow
+        */
+       struct list_head ordered_extents;
+       struct list_head ordered_root;
+       u64 nr_ordered_extents;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
                num_items;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
                     u64 bytenr, u64 num, int reserved);
 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
                                    u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+                                struct extent_buffer *eb);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+                            struct btrfs_block_rsv *dest, u64 num_bytes,
+                            int min_factor);
 void btrfs_block_rsv_release(struct btrfs_root *root,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
        smp_mb();
        return fs_info->closing;
 }
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+       return (root->fs_info->sb->s_flags & MS_RDONLY ||
+               btrfs_fs_closing(root->fs_info));
+}
+
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
        kfree(fs_info->balance_ctl);
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
                                   struct btrfs_root_item *item);
 void btrfs_read_root_item(struct extent_buffer *eb, int slot,
                          struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
-                        btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+                   struct btrfs_path *path, struct btrfs_root_item *root_item,
+                   struct btrfs_key *root_key);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 void btrfs_set_root_node(struct btrfs_root_item *item,
                         struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
                                           size_t pg_offset, u64 start, u64 len,
                                           int create);
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+                             struct inode *inode, u64 offset, u64 *len,
+                             u64 *orig_start, u64 *orig_block_len,
+                             u64 *ram_bytes);
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
 #if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+                                   int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
index eb34438ddedbc8ca0377fd6410d831fb3e824f7e..375510913fe744784f8f56966ed29693ee8e3612 100644 (file)
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
        return next;
 }
 
-static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
-                                                  u64 root_id)
-{
-       struct btrfs_key root_key;
-
-       if (root->objectid == root_id)
-               return root;
-
-       root_key.objectid = root_id;
-       root_key.type = BTRFS_ROOT_ITEM_KEY;
-       root_key.offset = (u64)-1;
-       return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
-}
-
 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
                                               struct btrfs_root *root,
                                               struct btrfs_delayed_item *item)
index 65241f32d3f8aec282a80d443c7acc677f68726a..4253ad580e391489c71c671df92450f048d29aae 100644 (file)
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
        btrfs_dev_replace_unlock(dev_replace);
 
-       btrfs_wait_ordered_extents(root, 0);
+       btrfs_wait_all_ordered_extents(root->fs_info, 0);
 
        /* force writing the updated state information to disk */
        trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
         * flush all outstanding I/O and inode extent mappings before the
         * copy operation is declared as being finished
         */
-       ret = btrfs_start_delalloc_inodes(root, 0);
+       ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
        if (ret) {
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                return ret;
        }
-       btrfs_wait_ordered_extents(root, 0);
+       btrfs_wait_all_ordered_extents(root->fs_info, 0);
 
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
index b0292b3ead54d1651ba47d7e9efcc567566dd1ed..6b092a1c4e37bab47adb0e9fc35ae6ec3e6081f8 100644 (file)
@@ -1192,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->objectid = objectid;
        root->last_trans = 0;
        root->highest_objectid = 0;
+       root->nr_delalloc_inodes = 0;
+       root->nr_ordered_extents = 0;
        root->name = NULL;
        root->inode_tree = RB_ROOT;
        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1200,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
+       INIT_LIST_HEAD(&root->delalloc_inodes);
+       INIT_LIST_HEAD(&root->delalloc_root);
+       INIT_LIST_HEAD(&root->ordered_extents);
+       INIT_LIST_HEAD(&root->ordered_root);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
+       spin_lock_init(&root->delalloc_lock);
+       spin_lock_init(&root->ordered_extent_lock);
        spin_lock_init(&root->accounting_lock);
        spin_lock_init(&root->log_extents_lock[0]);
        spin_lock_init(&root->log_extents_lock[1]);
@@ -1217,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
        atomic_set(&root->orphan_inodes, 0);
+       atomic_set(&root->refs, 1);
        root->log_transid = 0;
        root->last_log_commit = 0;
        extent_io_tree_init(&root->dirty_log_pages,
@@ -1235,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        spin_lock_init(&root->root_item_lock);
 }
 
-static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
-                                           struct btrfs_fs_info *fs_info,
-                                           u64 objectid,
-                                           struct btrfs_root *root)
-{
-       int ret;
-       u32 blocksize;
-       u64 generation;
-
-       __setup_root(tree_root->nodesize, tree_root->leafsize,
-                    tree_root->sectorsize, tree_root->stripesize,
-                    root, fs_info, objectid);
-       ret = btrfs_find_last_root(tree_root, objectid,
-                                  &root->root_item, &root->root_key);
-       if (ret > 0)
-               return -ENOENT;
-       else if (ret < 0)
-               return ret;
-
-       generation = btrfs_root_generation(&root->root_item);
-       blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
-       root->commit_root = NULL;
-       root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-                                    blocksize, generation);
-       if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
-               free_extent_buffer(root->node);
-               root->node = NULL;
-               return -EIO;
-       }
-       root->commit_root = btrfs_root_node(root);
-       return 0;
-}
-
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
 {
        struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1452,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
-                                              struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+                                       struct btrfs_key *key)
 {
        struct btrfs_root *root;
        struct btrfs_fs_info *fs_info = tree_root->fs_info;
        struct btrfs_path *path;
-       struct extent_buffer *l;
        u64 generation;
        u32 blocksize;
-       int ret = 0;
-       int slot;
+       int ret;
 
-       root = btrfs_alloc_root(fs_info);
-       if (!root)
+       path = btrfs_alloc_path();
+       if (!path)
                return ERR_PTR(-ENOMEM);
-       if (location->offset == (u64)-1) {
-               ret = find_and_setup_root(tree_root, fs_info,
-                                         location->objectid, root);
-               if (ret) {
-                       kfree(root);
-                       return ERR_PTR(ret);
-               }
-               goto out;
+
+       root = btrfs_alloc_root(fs_info);
+       if (!root) {
+               ret = -ENOMEM;
+               goto alloc_fail;
        }
 
        __setup_root(tree_root->nodesize, tree_root->leafsize,
                     tree_root->sectorsize, tree_root->stripesize,
-                    root, fs_info, location->objectid);
+                    root, fs_info, key->objectid);
 
-       path = btrfs_alloc_path();
-       if (!path) {
-               kfree(root);
-               return ERR_PTR(-ENOMEM);
-       }
-       ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
-       if (ret == 0) {
-               l = path->nodes[0];
-               slot = path->slots[0];
-               btrfs_read_root_item(l, slot, &root->root_item);
-               memcpy(&root->root_key, location, sizeof(*location));
-       }
-       btrfs_free_path(path);
+       ret = btrfs_find_root(tree_root, key, path,
+                             &root->root_item, &root->root_key);
        if (ret) {
-               kfree(root);
                if (ret > 0)
                        ret = -ENOENT;
-               return ERR_PTR(ret);
+               goto find_fail;
        }
 
        generation = btrfs_root_generation(&root->root_item);
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
-       if (!root->node || !extent_buffer_uptodate(root->node)) {
-               ret = (!root->node) ? -ENOMEM : -EIO;
-
-               free_extent_buffer(root->node);
-               kfree(root);
-               return ERR_PTR(ret);
+       if (!root->node) {
+               ret = -ENOMEM;
+               goto find_fail;
+       } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+               ret = -EIO;
+               goto read_fail;
        }
-
        root->commit_root = btrfs_root_node(root);
 out:
-       if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+       btrfs_free_path(path);
+       return root;
+
+read_fail:
+       free_extent_buffer(root->node);
+find_fail:
+       kfree(root);
+alloc_fail:
+       root = ERR_PTR(ret);
+       goto out;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+                                     struct btrfs_key *location)
+{
+       struct btrfs_root *root;
+
+       root = btrfs_read_tree_root(tree_root, location);
+       if (IS_ERR(root))
+               return root;
+
+       if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                root->ref_cows = 1;
                btrfs_check_and_init_root_item(&root->root_item);
        }
@@ -1523,6 +1502,66 @@ out:
        return root;
 }
 
+int btrfs_init_fs_root(struct btrfs_root *root)
+{
+       int ret;
+
+       root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+       root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+                                       GFP_NOFS);
+       if (!root->free_ino_pinned || !root->free_ino_ctl) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       btrfs_init_free_ino_ctl(root);
+       mutex_init(&root->fs_commit_mutex);
+       spin_lock_init(&root->cache_lock);
+       init_waitqueue_head(&root->cache_wait);
+
+       ret = get_anon_bdev(&root->anon_dev);
+       if (ret)
+               goto fail;
+       return 0;
+fail:
+       kfree(root->free_ino_ctl);
+       kfree(root->free_ino_pinned);
+       return ret;
+}
+
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+                                       u64 root_id)
+{
+       struct btrfs_root *root;
+
+       spin_lock(&fs_info->fs_roots_radix_lock);
+       root = radix_tree_lookup(&fs_info->fs_roots_radix,
+                                (unsigned long)root_id);
+       spin_unlock(&fs_info->fs_roots_radix_lock);
+       return root;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+                        struct btrfs_root *root)
+{
+       int ret;
+
+       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       if (ret)
+               return ret;
+
+       spin_lock(&fs_info->fs_roots_radix_lock);
+       ret = radix_tree_insert(&fs_info->fs_roots_radix,
+                               (unsigned long)root->root_key.objectid,
+                               root);
+       if (ret == 0)
+               root->in_radix = 1;
+       spin_unlock(&fs_info->fs_roots_radix_lock);
+       radix_tree_preload_end();
+
+       return ret;
+}
+
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                                              struct btrfs_key *location)
 {
@@ -1543,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                return fs_info->quota_root ? fs_info->quota_root :
                                             ERR_PTR(-ENOENT);
 again:
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       root = radix_tree_lookup(&fs_info->fs_roots_radix,
-                                (unsigned long)location->objectid);
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       root = btrfs_lookup_fs_root(fs_info, location->objectid);
        if (root)
                return root;
 
-       root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+       root = btrfs_read_fs_root(fs_info->tree_root, location);
        if (IS_ERR(root))
                return root;
 
-       root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
-       root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
-                                       GFP_NOFS);
-       if (!root->free_ino_pinned || !root->free_ino_ctl) {
-               ret = -ENOMEM;
+       if (btrfs_root_refs(&root->root_item) == 0) {
+               ret = -ENOENT;
                goto fail;
        }
 
-       btrfs_init_free_ino_ctl(root);
-       mutex_init(&root->fs_commit_mutex);
-       spin_lock_init(&root->cache_lock);
-       init_waitqueue_head(&root->cache_wait);
-
-       ret = get_anon_bdev(&root->anon_dev);
+       ret = btrfs_init_fs_root(root);
        if (ret)
                goto fail;
 
-       if (btrfs_root_refs(&root->root_item) == 0) {
-               ret = -ENOENT;
-               goto fail;
-       }
-
        ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
        if (ret < 0)
                goto fail;
        if (ret == 0)
                root->orphan_item_inserted = 1;
 
-       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
-       if (ret)
-               goto fail;
-
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       ret = radix_tree_insert(&fs_info->fs_roots_radix,
-                               (unsigned long)root->root_key.objectid,
-                               root);
-       if (ret == 0)
-               root->in_radix = 1;
-
-       spin_unlock(&fs_info->fs_roots_radix_lock);
-       radix_tree_preload_end();
+       ret = btrfs_insert_fs_root(fs_info, root);
        if (ret) {
                if (ret == -EEXIST) {
                        free_fs_root(root);
@@ -1602,10 +1613,6 @@ again:
                }
                goto fail;
        }
-
-       ret = btrfs_find_dead_roots(fs_info->tree_root,
-                                   root->root_key.objectid);
-       WARN_ON(ret);
        return root;
 fail:
        free_fs_root(root);
@@ -1677,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
 static int cleaner_kthread(void *arg)
 {
        struct btrfs_root *root = arg;
+       int again;
 
        do {
-               int again = 0;
-
-               if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
-                   down_read_trylock(&root->fs_info->sb->s_umount)) {
-                       if (mutex_trylock(&root->fs_info->cleaner_mutex)) {
-                               btrfs_run_delayed_iputs(root);
-                               again = btrfs_clean_one_deleted_snapshot(root);
-                               mutex_unlock(&root->fs_info->cleaner_mutex);
-                       }
-                       btrfs_run_defrag_inodes(root->fs_info);
-                       up_read(&root->fs_info->sb->s_umount);
+               again = 0;
+
+               /* Make the cleaner go to sleep early. */
+               if (btrfs_need_cleaner_sleep(root))
+                       goto sleep;
+
+               if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+                       goto sleep;
+
+               /*
+                * Avoid the problem that we change the status of the fs
+                * during the above check and trylock.
+                */
+               if (btrfs_need_cleaner_sleep(root)) {
+                       mutex_unlock(&root->fs_info->cleaner_mutex);
+                       goto sleep;
                }
 
+               btrfs_run_delayed_iputs(root);
+               again = btrfs_clean_one_deleted_snapshot(root);
+               mutex_unlock(&root->fs_info->cleaner_mutex);
+
+               /*
+                * The defragger has dealt with the R/O remount and umount,
+                * needn't do anything special here.
+                */
+               btrfs_run_defrag_inodes(root->fs_info);
+sleep:
                if (!try_to_freeze() && !again) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (!kthread_should_stop())
@@ -1725,7 +1748,7 @@ static int transaction_kthread(void *arg)
                }
 
                now = get_seconds();
-               if (!cur->blocked &&
+               if (cur->state < TRANS_STATE_BLOCKED &&
                    (now < cur->start_time || now - cur->start_time < 30)) {
                        spin_unlock(&root->fs_info->trans_lock);
                        delay = HZ * 5;
@@ -2035,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
                list_del(&gang[0]->root_list);
 
                if (gang[0]->in_radix) {
-                       btrfs_free_fs_root(fs_info, gang[0]);
+                       btrfs_drop_and_free_fs_root(fs_info, gang[0]);
                } else {
                        free_extent_buffer(gang[0]->node);
                        free_extent_buffer(gang[0]->commit_root);
-                       kfree(gang[0]);
+                       btrfs_put_fs_root(gang[0]);
                }
        }
 
@@ -2050,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
                if (!ret)
                        break;
                for (i = 0; i < ret; i++)
-                       btrfs_free_fs_root(fs_info, gang[i]);
+                       btrfs_drop_and_free_fs_root(fs_info, gang[i]);
        }
 }
 
@@ -2082,14 +2105,8 @@ int open_ctree(struct super_block *sb,
        int backup_index = 0;
 
        tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
-       extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
-       csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
        chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
-       dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
-       quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
-
-       if (!tree_root || !extent_root || !csum_root ||
-           !chunk_root || !dev_root || !quota_root) {
+       if (!tree_root || !chunk_root) {
                err = -ENOMEM;
                goto fail;
        }
@@ -2132,9 +2149,9 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
-       INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+       INIT_LIST_HEAD(&fs_info->delalloc_roots);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
-       spin_lock_init(&fs_info->delalloc_lock);
+       spin_lock_init(&fs_info->delalloc_root_lock);
        spin_lock_init(&fs_info->trans_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2170,7 +2187,6 @@ int open_ctree(struct super_block *sb,
        fs_info->max_inline = 8192 * 1024;
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
-       fs_info->trans_no_join = 0;
        fs_info->free_chunk_space = 0;
        fs_info->tree_mod_log = RB_ROOT;
 
@@ -2181,8 +2197,8 @@ int open_ctree(struct super_block *sb,
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
 
-       INIT_LIST_HEAD(&fs_info->ordered_extents);
-       spin_lock_init(&fs_info->ordered_extent_lock);
+       INIT_LIST_HEAD(&fs_info->ordered_roots);
+       spin_lock_init(&fs_info->ordered_root_lock);
        fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
                                        GFP_NOFS);
        if (!fs_info->delayed_root) {
@@ -2275,6 +2291,7 @@ int open_ctree(struct super_block *sb,
        fs_info->qgroup_seq = 1;
        fs_info->quota_enabled = 0;
        fs_info->pending_quota_state = 0;
+       fs_info->qgroup_ulist = NULL;
        mutex_init(&fs_info->qgroup_rescan_lock);
 
        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2639,33 +2656,44 @@ retry_root_backup:
        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
        tree_root->commit_root = btrfs_root_node(tree_root);
 
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
-       if (ret)
+       location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+       location.type = BTRFS_ROOT_ITEM_KEY;
+       location.offset = 0;
+
+       extent_root = btrfs_read_tree_root(tree_root, &location);
+       if (IS_ERR(extent_root)) {
+               ret = PTR_ERR(extent_root);
                goto recovery_tree_root;
+       }
        extent_root->track_dirty = 1;
+       fs_info->extent_root = extent_root;
 
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_DEV_TREE_OBJECTID, dev_root);
-       if (ret)
+       location.objectid = BTRFS_DEV_TREE_OBJECTID;
+       dev_root = btrfs_read_tree_root(tree_root, &location);
+       if (IS_ERR(dev_root)) {
+               ret = PTR_ERR(dev_root);
                goto recovery_tree_root;
+       }
        dev_root->track_dirty = 1;
+       fs_info->dev_root = dev_root;
+       btrfs_init_devices_late(fs_info);
 
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_CSUM_TREE_OBJECTID, csum_root);
-       if (ret)
+       location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+       csum_root = btrfs_read_tree_root(tree_root, &location);
+       if (IS_ERR(csum_root)) {
+               ret = PTR_ERR(csum_root);
                goto recovery_tree_root;
+       }
        csum_root->track_dirty = 1;
+       fs_info->csum_root = csum_root;
 
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_QUOTA_TREE_OBJECTID, quota_root);
-       if (ret) {
-               kfree(quota_root);
-               quota_root = fs_info->quota_root = NULL;
-       } else {
+       location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+       quota_root = btrfs_read_tree_root(tree_root, &location);
+       if (!IS_ERR(quota_root)) {
                quota_root->track_dirty = 1;
                fs_info->quota_enabled = 1;
                fs_info->pending_quota_state = 1;
+               fs_info->quota_root = quota_root;
        }
 
        fs_info->generation = generation;
@@ -2818,11 +2846,9 @@ retry_root_backup:
 
        location.objectid = BTRFS_FS_TREE_OBJECTID;
        location.type = BTRFS_ROOT_ITEM_KEY;
-       location.offset = (u64)-1;
+       location.offset = 0;
 
        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
-       if (!fs_info->fs_root)
-               goto fail_qgroup;
        if (IS_ERR(fs_info->fs_root)) {
                err = PTR_ERR(fs_info->fs_root);
                goto fail_qgroup;
@@ -2854,6 +2880,8 @@ retry_root_backup:
                return ret;
        }
 
+       btrfs_qgroup_rescan_resume(fs_info);
+
        return 0;
 
 fail_qgroup:
@@ -3259,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
                                            BTRFS_BLOCK_GROUP_RAID10)) {
                                                num_tolerated_disk_barrier_failures = 1;
                                        } else if (flags &
-                                                  BTRFS_BLOCK_GROUP_RAID5) {
+                                                  BTRFS_BLOCK_GROUP_RAID6) {
                                                num_tolerated_disk_barrier_failures = 2;
                                        }
                                }
@@ -3367,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_root *root)
 {
        spin_lock(&fs_info->fs_roots_radix_lock);
        radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3398,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
        kfree(root->free_ino_ctl);
        kfree(root->free_ino_pinned);
        kfree(root->name);
-       kfree(root);
+       btrfs_put_fs_root(root);
+}
+
+void btrfs_free_fs_root(struct btrfs_root *root)
+{
+       free_fs_root(root);
 }
 
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3654,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
        INIT_LIST_HEAD(&splice);
 
        mutex_lock(&root->fs_info->ordered_operations_mutex);
-       spin_lock(&root->fs_info->ordered_extent_lock);
+       spin_lock(&root->fs_info->ordered_root_lock);
 
        list_splice_init(&t->ordered_operations, &splice);
        while (!list_empty(&splice)) {
@@ -3662,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
                                         ordered_operations);
 
                list_del_init(&btrfs_inode->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_extent_lock);
+               spin_unlock(&root->fs_info->ordered_root_lock);
 
                btrfs_invalidate_inodes(btrfs_inode->root);
 
-               spin_lock(&root->fs_info->ordered_extent_lock);
+               spin_lock(&root->fs_info->ordered_root_lock);
        }
 
-       spin_unlock(&root->fs_info->ordered_extent_lock);
+       spin_unlock(&root->fs_info->ordered_root_lock);
        mutex_unlock(&root->fs_info->ordered_operations_mutex);
 }
 
@@ -3677,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
 {
        struct btrfs_ordered_extent *ordered;
 
-       spin_lock(&root->fs_info->ordered_extent_lock);
+       spin_lock(&root->ordered_extent_lock);
        /*
         * This will just short circuit the ordered completion stuff which will
         * make sure the ordered extent gets properly cleaned up.
         */
-       list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+       list_for_each_entry(ordered, &root->ordered_extents,
                            root_extent_list)
                set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
-       spin_unlock(&root->fs_info->ordered_extent_lock);
+       spin_unlock(&root->ordered_extent_lock);
+}
+
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *root;
+       struct list_head splice;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&fs_info->ordered_root_lock);
+       list_splice_init(&fs_info->ordered_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                       ordered_root);
+               list_del_init(&root->ordered_root);
+
+               btrfs_destroy_ordered_extents(root);
+
+               cond_resched_lock(&fs_info->ordered_root_lock);
+       }
+       spin_unlock(&fs_info->ordered_root_lock);
 }
 
 int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3707,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 
        while ((node = rb_first(&delayed_refs->root)) != NULL) {
                struct btrfs_delayed_ref_head *head = NULL;
+               bool pin_bytes = false;
 
                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
                atomic_set(&ref->refs, 1);
@@ -3727,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                        }
 
                        if (head->must_insert_reserved)
-                               btrfs_pin_extent(root, ref->bytenr,
-                                                ref->num_bytes, 1);
+                               pin_bytes = true;
                        btrfs_free_delayed_extent_op(head->extent_op);
                        delayed_refs->num_heads--;
                        if (list_empty(&head->cluster))
@@ -3739,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-               if (head)
-                       mutex_unlock(&head->mutex);
                spin_unlock(&delayed_refs->lock);
+               if (head) {
+                       if (pin_bytes)
+                               btrfs_pin_extent(root, ref->bytenr,
+                                                ref->num_bytes, 1);
+                       mutex_unlock(&head->mutex);
+               }
                btrfs_put_delayed_ref(ref);
 
                cond_resched();
@@ -3778,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 
        INIT_LIST_HEAD(&splice);
 
-       spin_lock(&root->fs_info->delalloc_lock);
-       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       spin_lock(&root->delalloc_lock);
+       list_splice_init(&root->delalloc_inodes, &splice);
 
        while (!list_empty(&splice)) {
-               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-                                   delalloc_inodes);
+               btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+                                              delalloc_inodes);
 
                list_del_init(&btrfs_inode->delalloc_inodes);
                clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
                          &btrfs_inode->runtime_flags);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&root->delalloc_lock);
 
                btrfs_invalidate_inodes(btrfs_inode->root);
 
-               spin_lock(&root->fs_info->delalloc_lock);
+               spin_lock(&root->delalloc_lock);
        }
 
-       spin_unlock(&root->fs_info->delalloc_lock);
+       spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *root;
+       struct list_head splice;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&fs_info->delalloc_root_lock);
+       list_splice_init(&fs_info->delalloc_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                        delalloc_root);
+               list_del_init(&root->delalloc_root);
+               root = btrfs_grab_fs_root(root);
+               BUG_ON(!root);
+               spin_unlock(&fs_info->delalloc_root_lock);
+
+               btrfs_destroy_delalloc_inodes(root);
+               btrfs_put_fs_root(root);
+
+               spin_lock(&fs_info->delalloc_root_lock);
+       }
+       spin_unlock(&fs_info->delalloc_root_lock);
 }
 
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3879,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
        btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
                                cur_trans->dirty_pages.dirty_bytes);
 
-       /* FIXME: cleanup wait for commit */
-       cur_trans->in_commit = 1;
-       cur_trans->blocked = 1;
+       cur_trans->state = TRANS_STATE_COMMIT_START;
        wake_up(&root->fs_info->transaction_blocked_wait);
 
        btrfs_evict_pending_snapshots(cur_trans);
 
-       cur_trans->blocked = 0;
+       cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
 
-       cur_trans->commit_done = 1;
-       wake_up(&cur_trans->commit_wait);
-
        btrfs_destroy_delayed_inodes(root);
        btrfs_assert_delayed_root_empty(root);
 
@@ -3900,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
        btrfs_destroy_pinned_extent(root,
                                    root->fs_info->pinned_extents);
 
+       cur_trans->state =TRANS_STATE_COMPLETED;
+       wake_up(&cur_trans->commit_wait);
+
        /*
        memset(cur_trans, 0, sizeof(*cur_trans));
        kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3915,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 
        spin_lock(&root->fs_info->trans_lock);
        list_splice_init(&root->fs_info->trans_list, &list);
-       root->fs_info->trans_no_join = 1;
+       root->fs_info->running_transaction = NULL;
        spin_unlock(&root->fs_info->trans_lock);
 
        while (!list_empty(&list)) {
@@ -3923,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 
                btrfs_destroy_ordered_operations(t, root);
 
-               btrfs_destroy_ordered_extents(root);
+               btrfs_destroy_all_ordered_extents(root->fs_info);
 
                btrfs_destroy_delayed_refs(t, root);
 
-               /* FIXME: cleanup wait for commit */
-               t->in_commit = 1;
-               t->blocked = 1;
+               /*
+                *  FIXME: cleanup wait for commit
+                *  We needn't acquire the lock here, because we are during
+                *  the umount, there is no other task which will change it.
+                */
+               t->state = TRANS_STATE_COMMIT_START;
                smp_mb();
                if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
                        wake_up(&root->fs_info->transaction_blocked_wait);
 
                btrfs_evict_pending_snapshots(t);
 
-               t->blocked = 0;
+               t->state = TRANS_STATE_UNBLOCKED;
                smp_mb();
                if (waitqueue_active(&root->fs_info->transaction_wait))
                        wake_up(&root->fs_info->transaction_wait);
 
-               t->commit_done = 1;
-               smp_mb();
-               if (waitqueue_active(&t->commit_wait))
-                       wake_up(&t->commit_wait);
-
                btrfs_destroy_delayed_inodes(root);
                btrfs_assert_delayed_root_empty(root);
 
-               btrfs_destroy_delalloc_inodes(root);
-
-               spin_lock(&root->fs_info->trans_lock);
-               root->fs_info->running_transaction = NULL;
-               spin_unlock(&root->fs_info->trans_lock);
+               btrfs_destroy_all_delalloc_inodes(root->fs_info);
 
                btrfs_destroy_marked_extents(root, &t->dirty_pages,
                                             EXTENT_DIRTY);
@@ -3961,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
                btrfs_destroy_pinned_extent(root,
                                            root->fs_info->pinned_extents);
 
+               t->state = TRANS_STATE_COMPLETED;
+               smp_mb();
+               if (waitqueue_active(&t->commit_wait))
+                       wake_up(&t->commit_wait);
+
                atomic_set(&t->use_count, 0);
                list_del_init(&t->list);
                memset(t, 0, sizeof(*t));
                kmem_cache_free(btrfs_transaction_cachep, t);
        }
 
-       spin_lock(&root->fs_info->trans_lock);
-       root->fs_info->trans_no_join = 0;
-       spin_unlock(&root->fs_info->trans_lock);
        mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
        return 0;
index be69ce1b07a22ac83a2dedd808be0e6d20040333..b71acd6e1e5b1941e75ed4c5c056d47268cdc407 100644 (file)
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
-                                              struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+                                     struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+                        struct btrfs_root *root);
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                                              struct btrfs_key *location);
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
 void btrfs_btree_balance_dirty(struct btrfs_root *root);
 void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+                                struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+/*
+ * This function is used to grab the root, and prevent it from being freed
+ * while we access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ *     fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+       if (atomic_inc_not_zero(&root->refs))
+               return root;
+       return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+       if (atomic_dec_and_test(&root->refs))
+               kfree(root);
+}
+
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
                          int atomic);
index 81ee29eeb7ca569dd53bea2b35789c1fe9c2c777..4b86916073737a175770605de4a25b862faa6929 100644 (file)
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
                goto fail;
        }
 
-       if (btrfs_root_refs(&root->root_item) == 0) {
-               err = -ENOENT;
-               goto fail;
-       }
-
        key.objectid = objectid;
        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
        key.offset = 0;
index df472ab1b5acca7b411b05bcc414913bcebcc244..0236de711989097bbf5191dbb6871281d840de22 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/kthread.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
        return 0;
 }
 
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+       u64 num_bytes;
+
+       num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+                            sizeof(struct btrfs_extent_inline_ref));
+       if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+               num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+       /*
+        * We don't ever fill up leaves all the way so multiply by 2 just to be
+        * closer to what we're really going to want to use.
+        */
+       return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root)
+{
+       struct btrfs_block_rsv *global_rsv;
+       u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+       u64 num_bytes;
+       int ret = 0;
+
+       num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+       num_heads = heads_to_leaves(root, num_heads);
+       if (num_heads > 1)
+               num_bytes += (num_heads - 1) * root->leafsize;
+       num_bytes <<= 1;
+       global_rsv = &root->fs_info->global_block_rsv;
+
+       /*
+        * If we can't allocate any more chunks lets make sure we have _lots_ of
+        * wiggle room since running delayed refs can create more delayed refs.
+        */
+       if (global_rsv->space_info->full)
+               num_bytes <<= 1;
+
+       spin_lock(&global_rsv->lock);
+       if (global_rsv->reserved <= num_bytes)
+               ret = 1;
+       spin_unlock(&global_rsv->lock);
+       return ret;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2573,7 +2619,8 @@ progress:
                old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
                if (old) {
                        DEFINE_WAIT(__wait);
-                       if (delayed_refs->num_entries < 16348)
+                       if (delayed_refs->flushing ||
+                           !btrfs_should_throttle_delayed_refs(trans, root))
                                return 0;
 
                        prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
 
        while (1) {
                if (!(run_all || run_most) &&
-                   delayed_refs->num_heads_ready < 64)
+                   !btrfs_should_throttle_delayed_refs(trans, root))
                        break;
 
                /*
@@ -2629,6 +2676,7 @@ again:
                        spin_unlock(&delayed_refs->lock);
                        btrfs_abort_transaction(trans, root, ret);
                        atomic_dec(&delayed_refs->procs_running_refs);
+                       wake_up(&delayed_refs->wait);
                        return ret;
                }
 
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        struct btrfs_space_info *found;
        int i;
        int factor;
+       int ret;
 
        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
                     BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        if (!found)
                return -ENOMEM;
 
+       ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+       if (ret) {
+               kfree(found);
+               return ret;
+       }
+
        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
@@ -3565,10 +3620,11 @@ alloc:
                }
 
                /*
-                * If we have less pinned bytes than we want to allocate then
-                * don't bother committing the transaction, it won't help us.
+                * If we don't have enough pinned space to deal with this
+                * allocation don't bother committing the transaction.
                 */
-               if (data_sinfo->bytes_pinned < bytes)
+               if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
+                                          bytes) < 0)
                        committed = 1;
                spin_unlock(&data_sinfo->lock);
 
@@ -3577,6 +3633,7 @@ commit_trans:
                if (!committed &&
                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
                        committed = 1;
+
                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
+       WARN_ON(data_sinfo->bytes_may_use < bytes);
        data_sinfo->bytes_may_use -= bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
                                         unsigned long nr_pages)
 {
        struct super_block *sb = root->fs_info->sb;
-       int started;
 
-       /* If we can not start writeback, just sync all the delalloc file. */
-       started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
-                                                     WB_REASON_FS_FREE_SPACE);
-       if (!started) {
+       if (down_read_trylock(&sb->s_umount)) {
+               writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+               up_read(&sb->s_umount);
+       } else {
                /*
                 * We needn't worry the filesystem going from r/w to r/o though
                 * we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
                 * the filesystem is readonly(all dirty pages are written to
                 * the disk).
                 */
-               btrfs_start_delalloc_inodes(root, 0);
+               btrfs_start_all_delalloc_inodes(root->fs_info, 0);
                if (!current->journal_info)
-                       btrfs_wait_ordered_extents(root, 0);
+                       btrfs_wait_all_ordered_extents(root->fs_info, 0);
        }
 }
 
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
        if (delalloc_bytes == 0) {
                if (trans)
                        return;
-               btrfs_wait_ordered_extents(root, 0);
+               btrfs_wait_all_ordered_extents(root->fs_info, 0);
                return;
        }
 
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
                loops++;
                if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_extents(root, 0);
+                       btrfs_wait_all_ordered_extents(root->fs_info, 0);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
 
        /* See if there is enough pinned space to make this reservation */
        spin_lock(&space_info->lock);
-       if (space_info->bytes_pinned >= bytes) {
+       if (percpu_counter_compare(&space_info->total_bytes_pinned,
+                                  bytes) >= 0) {
                spin_unlock(&space_info->lock);
                goto commit;
        }
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
 
        spin_lock(&space_info->lock);
        spin_lock(&delayed_rsv->lock);
-       if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
+       if (percpu_counter_compare(&space_info->total_bytes_pinned,
+                                  bytes - delayed_rsv->size) >= 0) {
                spin_unlock(&delayed_rsv->lock);
                spin_unlock(&space_info->lock);
                return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
        spin_unlock(&block_rsv->lock);
 }
 
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+                            struct btrfs_block_rsv *dest, u64 num_bytes,
+                            int min_factor)
+{
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       u64 min_bytes;
+
+       if (global_rsv->space_info != dest->space_info)
+               return -ENOSPC;
+
+       spin_lock(&global_rsv->lock);
+       min_bytes = div_factor(global_rsv->size, min_factor);
+       if (global_rsv->reserved < min_bytes + num_bytes) {
+               spin_unlock(&global_rsv->lock);
+               return -ENOSPC;
+       }
+       global_rsv->reserved -= num_bytes;
+       if (global_rsv->reserved < global_rsv->size)
+               global_rsv->full = 0;
+       spin_unlock(&global_rsv->lock);
+
+       block_rsv_add_bytes(dest, num_bytes, 1);
+       return 0;
+}
+
 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_rsv *block_rsv,
                                    struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
        int factor;
 
        /* block accounting for super block */
-       spin_lock(&info->delalloc_lock);
+       spin_lock(&info->delalloc_root_lock);
        old_val = btrfs_super_bytes_used(info->super_copy);
        if (alloc)
                old_val += num_bytes;
        else
                old_val -= num_bytes;
        btrfs_set_super_bytes_used(info->super_copy, old_val);
-       spin_unlock(&info->delalloc_lock);
+       spin_unlock(&info->delalloc_root_lock);
 
        while (total) {
                cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
        return ret;
 }
 
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+       int ret;
+       struct btrfs_block_group_cache *block_group;
+       struct btrfs_caching_control *caching_ctl;
+
+       block_group = btrfs_lookup_block_group(root->fs_info, start);
+       if (!block_group)
+               return -EINVAL;
+
+       cache_block_group(block_group, 0);
+       caching_ctl = get_caching_control(block_group);
+
+       if (!caching_ctl) {
+               /* Logic error */
+               BUG_ON(!block_group_cache_done(block_group));
+               ret = btrfs_remove_free_space(block_group, start, num_bytes);
+       } else {
+               mutex_lock(&caching_ctl->mutex);
+
+               if (start >= caching_ctl->progress) {
+                       ret = add_excluded_extent(root, start, num_bytes);
+               } else if (start + num_bytes <= caching_ctl->progress) {
+                       ret = btrfs_remove_free_space(block_group,
+                                                     start, num_bytes);
+               } else {
+                       num_bytes = caching_ctl->progress - start;
+                       ret = btrfs_remove_free_space(block_group,
+                                                     start, num_bytes);
+                       if (ret)
+                               goto out_lock;
+
+                       num_bytes = (start + num_bytes) -
+                               caching_ctl->progress;
+                       start = caching_ctl->progress;
+                       ret = add_excluded_extent(root, start, num_bytes);
+               }
+out_lock:
+               mutex_unlock(&caching_ctl->mutex);
+               put_caching_control(caching_ctl);
+       }
+       btrfs_put_block_group(block_group);
+       return ret;
+}
+
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+                                struct extent_buffer *eb)
+{
+       struct btrfs_file_extent_item *item;
+       struct btrfs_key key;
+       int found_type;
+       int i;
+
+       if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+               return 0;
+
+       for (i = 0; i < btrfs_header_nritems(eb); i++) {
+               btrfs_item_key_to_cpu(eb, &key, i);
+               if (key.type != BTRFS_EXTENT_DATA_KEY)
+                       continue;
+               item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+               found_type = btrfs_file_extent_type(eb, item);
+               if (found_type == BTRFS_FILE_EXTENT_INLINE)
+                       continue;
+               if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+                       continue;
+               key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+               key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+               __exclude_logged_extent(log, key.objectid, key.offset);
+       }
+
+       return 0;
+}
+
 /**
  * btrfs_update_reserved_bytes - update the block_group and space info counters
  * @cache:     The cache we are manipulating
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
        struct btrfs_caching_control *next;
        struct btrfs_caching_control *caching_ctl;
        struct btrfs_block_group_cache *cache;
+       struct btrfs_space_info *space_info;
 
        down_write(&fs_info->extent_commit_sem);
 
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 
        up_write(&fs_info->extent_commit_sem);
 
+       list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
+               percpu_counter_set(&space_info->total_bytes_pinned, 0);
+
        update_global_block_rsv(fs_info);
 }
 
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+                            u64 owner, u64 root_objectid)
+{
+       struct btrfs_space_info *space_info;
+       u64 flags;
+
+       if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+               if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+                       flags = BTRFS_BLOCK_GROUP_SYSTEM;
+               else
+                       flags = BTRFS_BLOCK_GROUP_METADATA;
+       } else {
+               flags = BTRFS_BLOCK_GROUP_DATA;
+       }
+
+       space_info = __find_space_info(fs_info, flags);
+       BUG_ON(!space_info); /* Logic bug */
+       percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
+
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                goto out;
                        }
                }
+               add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+                                root_objectid);
        } else {
                if (found_extent) {
                        BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           u64 parent, int last_ref)
 {
        struct btrfs_block_group_cache *cache = NULL;
+       int pin = 1;
        int ret;
 
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
                btrfs_add_free_space(cache, buf->start, buf->len);
                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+               pin = 0;
        }
 out:
+       if (pin)
+               add_pinned_bytes(root->fs_info, buf->len,
+                                btrfs_header_level(buf),
+                                root->root_key.objectid);
+
        /*
         * Deleting the buffer, clear the corrupt flag since it doesn't matter
         * anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
+       add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
+
        /*
         * tree log blocks never actually go into the extent allocation
         * tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
        struct btrfs_block_group_cache *block_group;
-       struct btrfs_caching_control *caching_ctl;
-       u64 start = ins->objectid;
-       u64 num_bytes = ins->offset;
-
-       block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-       cache_block_group(block_group, 0);
-       caching_ctl = get_caching_control(block_group);
-
-       if (!caching_ctl) {
-               BUG_ON(!block_group_cache_done(block_group));
-               ret = btrfs_remove_free_space(block_group, start, num_bytes);
-               if (ret)
-                       goto out;
-       } else {
-               mutex_lock(&caching_ctl->mutex);
 
-               if (start >= caching_ctl->progress) {
-                       ret = add_excluded_extent(root, start, num_bytes);
-               } else if (start + num_bytes <= caching_ctl->progress) {
-                       ret = btrfs_remove_free_space(block_group,
-                                                     start, num_bytes);
-               } else {
-                       num_bytes = caching_ctl->progress - start;
-                       ret = btrfs_remove_free_space(block_group,
-                                                     start, num_bytes);
-                       if (ret)
-                               goto out_lock;
-
-                       start = caching_ctl->progress;
-                       num_bytes = ins->objectid + ins->offset -
-                                   caching_ctl->progress;
-                       ret = add_excluded_extent(root, start, num_bytes);
-               }
-out_lock:
-               mutex_unlock(&caching_ctl->mutex);
-               put_caching_control(caching_ctl);
+       /*
+        * Mixed block groups will exclude before processing the log so we only
+        * need to do the exclude dance if this fs isn't mixed.
+        */
+       if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+               ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
                if (ret)
-                       goto out;
+                       return ret;
        }
 
+       block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+       if (!block_group)
+               return -EINVAL;
+
        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
                                          RESERVE_ALLOC_NO_ACCOUNT);
        BUG_ON(ret); /* logic error */
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
                                         0, owner, offset, ins, 1);
-out:
        btrfs_put_block_group(block_group);
        return ret;
 }
@@ -7384,7 +7552,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
-               if (!for_reloc && btrfs_fs_closing(root->fs_info)) {
+               if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
                        pr_debug("btrfs: drop snapshot early exit\n");
                        err = -EAGAIN;
                        goto out_end_trans;
@@ -7447,8 +7615,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        }
 
        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
-               ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
-                                          NULL, NULL);
+               ret = btrfs_find_root(tree_root, &root->root_key, path,
+                                     NULL, NULL);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, tree_root, ret);
                        err = ret;
@@ -7465,11 +7633,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        }
 
        if (root->in_radix) {
-               btrfs_free_fs_root(tree_root->fs_info, root);
+               btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
        } else {
                free_extent_buffer(root->node);
                free_extent_buffer(root->commit_root);
-               kfree(root);
+               btrfs_put_fs_root(root);
        }
 out_end_trans:
        btrfs_end_transaction_throttle(trans, tree_root);
@@ -7782,6 +7950,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        struct btrfs_space_info *space_info;
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_device *device;
+       struct btrfs_trans_handle *trans;
        u64 min_free;
        u64 dev_min = 1;
        u64 dev_nr = 0;
@@ -7868,6 +8037,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                do_div(min_free, dev_min);
        }
 
+       /* We need to do this so that we can look at pending chunks */
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out;
+       }
+
        mutex_lock(&root->fs_info->chunk_mutex);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                u64 dev_offset;
@@ -7878,7 +8054,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 */
                if (device->total_bytes > device->bytes_used + min_free &&
                    !device->is_tgtdev_for_dev_replace) {
-                       ret = find_free_dev_extent(device, min_free,
+                       ret = find_free_dev_extent(trans, device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
                                dev_nr++;
@@ -7890,6 +8066,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                }
        }
        mutex_unlock(&root->fs_info->chunk_mutex);
+       btrfs_end_transaction(trans, root);
 out:
        btrfs_put_block_group(block_group);
        return ret;
@@ -8032,6 +8209,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                                dump_space_info(space_info, 0, 0);
                        }
                }
+               percpu_counter_destroy(&space_info->total_bytes_pinned);
                list_del(&space_info->list);
                kfree(space_info);
        }
@@ -8254,6 +8432,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                        sizeof(item));
                if (ret)
                        btrfs_abort_transaction(trans, extent_root, ret);
+               ret = btrfs_finish_chunk_alloc(trans, extent_root,
+                                              key.objectid, key.offset);
+               if (ret)
+                       btrfs_abort_transaction(trans, extent_root, ret);
        }
 }
 
@@ -8591,8 +8773,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
                if (end - start >= range->minlen) {
                        if (!block_group_cache_done(cache)) {
                                ret = cache_block_group(cache, 0);
-                               if (!ret)
-                                       wait_block_group_cache_done(cache);
+                               if (ret) {
+                                       btrfs_put_block_group(cache);
+                                       break;
+                               }
+                               ret = wait_block_group_cache_done(cache);
+                               if (ret) {
+                                       btrfs_put_block_group(cache);
+                                       break;
+                               }
                        }
                        ret = btrfs_trim_block_group(cache,
                                                     &group_trimmed,
index 6bca9472f313cda2cb7ad1f230dda69bf4b1e8a9..583d98bd065ed83ca979a2786b59ae4342380c47 100644 (file)
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
                kmem_cache_free(extent_buffer_cache, eb);
        }
 }
+
+#define btrfs_debug_check_extent_io_range(inode, start, end)           \
+       __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+               struct inode *inode, u64 start, u64 end)
+{
+       u64 isize = i_size_read(inode);
+
+       if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+               printk_ratelimited(KERN_DEBUG
+                   "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+                               caller,
+                               (unsigned long long)btrfs_ino(inode),
+                               (unsigned long long)isize,
+                               (unsigned long long)start,
+                               (unsigned long long)end);
+       }
+}
 #else
 #define btrfs_leak_debug_add(new, head)        do {} while (0)
 #define btrfs_leak_debug_del(entry)    do {} while (0)
 #define btrfs_leak_debug_check()       do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e)     do {} while (0)
 #endif
 
 #define BUFFER_LRU_MAX 64
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        int err;
        int clear = 0;
 
+       btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
+       if (bits & EXTENT_DELALLOC)
+               bits |= EXTENT_NORESERVE;
+
        if (delete)
                bits |= ~EXTENT_CTLBITS;
        bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        struct extent_state *state;
        struct rb_node *node;
 
+       btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
        spin_lock(&tree->lock);
 again:
        while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        u64 last_start;
        u64 last_end;
 
+       btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
        bits |= EXTENT_FIRST_DELALLOC;
 again:
        if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        u64 last_start;
        u64 last_end;
 
+       btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
 again:
        if (!prealloc && (mask & __GFP_WAIT)) {
                prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                struct extent_state *cached = NULL;
                struct extent_state *state;
                struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+               struct inode *inode = page->mapping->host;
 
                pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
                         "mirror=%lu\n", (u64)bio->bi_sector, err,
                         io_bio->mirror_num);
-               tree = &BTRFS_I(page->mapping->host)->io_tree;
+               tree = &BTRFS_I(inode)->io_tree;
 
                /* We always issue full-page reads, but if some block
                 * in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
 
                if (uptodate) {
+                       loff_t i_size = i_size_read(inode);
+                       pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+                       unsigned offset;
+
+                       /* Zero out the end if this page straddles i_size */
+                       offset = i_size & (PAGE_CACHE_SIZE-1);
+                       if (page->index == end_index && offset)
+                               zero_user_segment(page, offset, PAGE_CACHE_SIZE);
                        SetPageUptodate(page);
                } else {
                        ClearPageUptodate(page);
index 41fb81e7ec53c3fda80caf28a039af7b4c1a0682..3b8c4e26e1da08f69e081a75a518d452b221bbf4 100644 (file)
@@ -19,6 +19,7 @@
 #define EXTENT_FIRST_DELALLOC (1 << 12)
 #define EXTENT_NEED_WAIT (1 << 13)
 #define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_NORESERVE (1 << 15)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
index b193bf324a4123685483a7754537938267813734..a7bfc954180336348273f8e8f0f145b40581012e 100644 (file)
@@ -34,8 +34,7 @@
 
 #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
                                   sizeof(struct btrfs_ordered_sum)) / \
-                                  sizeof(struct btrfs_sector_sum) * \
-                                  (r)->sectorsize - (r)->sectorsize)
+                                  sizeof(u32) * (r)->sectorsize)
 
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_ordered_sum *sums;
-       struct btrfs_sector_sum *sector_sum;
        struct btrfs_csum_item *item;
        LIST_HEAD(tmplist);
        unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
                                      struct btrfs_csum_item);
                while (start < csum_end) {
                        size = min_t(size_t, csum_end - start,
-                                       MAX_ORDERED_SUM_BYTES(root));
+                                    MAX_ORDERED_SUM_BYTES(root));
                        sums = kzalloc(btrfs_ordered_sum_size(root, size),
-                                       GFP_NOFS);
+                                      GFP_NOFS);
                        if (!sums) {
                                ret = -ENOMEM;
                                goto fail;
                        }
 
-                       sector_sum = sums->sums;
                        sums->bytenr = start;
-                       sums->len = size;
+                       sums->len = (int)size;
 
                        offset = (start - key.offset) >>
                                root->fs_info->sb->s_blocksize_bits;
                        offset *= csum_size;
+                       size >>= root->fs_info->sb->s_blocksize_bits;
 
-                       while (size > 0) {
-                               read_extent_buffer(path->nodes[0],
-                                               &sector_sum->sum,
-                                               ((unsigned long)item) +
-                                               offset, csum_size);
-                               sector_sum->bytenr = start;
-
-                               size -= root->sectorsize;
-                               start += root->sectorsize;
-                               offset += csum_size;
-                               sector_sum++;
-                       }
+                       read_extent_buffer(path->nodes[0],
+                                          sums->sums,
+                                          ((unsigned long)item) + offset,
+                                          csum_size * size);
+
+                       start += root->sectorsize * size;
                        list_add_tail(&sums->list, &tmplist);
                }
                path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                       struct bio *bio, u64 file_start, int contig)
 {
        struct btrfs_ordered_sum *sums;
-       struct btrfs_sector_sum *sector_sum;
        struct btrfs_ordered_extent *ordered;
        char *data;
        struct bio_vec *bvec = bio->bi_io_vec;
        int bio_index = 0;
+       int index;
        unsigned long total_bytes = 0;
        unsigned long this_sum_bytes = 0;
        u64 offset;
-       u64 disk_bytenr;
 
        WARN_ON(bio->bi_vcnt <= 0);
        sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
        if (!sums)
                return -ENOMEM;
 
-       sector_sum = sums->sums;
-       disk_bytenr = (u64)bio->bi_sector << 9;
        sums->len = bio->bi_size;
        INIT_LIST_HEAD(&sums->list);
 
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 
        ordered = btrfs_lookup_ordered_extent(inode, offset);
        BUG_ON(!ordered); /* Logic error */
-       sums->bytenr = ordered->start;
+       sums->bytenr = (u64)bio->bi_sector << 9;
+       index = 0;
 
        while (bio_index < bio->bi_vcnt) {
                if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                        sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
                                       GFP_NOFS);
                        BUG_ON(!sums); /* -ENOMEM */
-                       sector_sum = sums->sums;
                        sums->len = bytes_left;
                        ordered = btrfs_lookup_ordered_extent(inode, offset);
                        BUG_ON(!ordered); /* Logic error */
-                       sums->bytenr = ordered->start;
+                       sums->bytenr = ((u64)bio->bi_sector << 9) +
+                                      total_bytes;
+                       index = 0;
                }
 
                data = kmap_atomic(bvec->bv_page);
-               sector_sum->sum = ~(u32)0;
-               sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset,
-                                                 sector_sum->sum,
-                                                 bvec->bv_len);
+               sums->sums[index] = ~(u32)0;
+               sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+                                                   sums->sums[index],
+                                                   bvec->bv_len);
                kunmap_atomic(data);
-               btrfs_csum_final(sector_sum->sum,
-                                (char *)&sector_sum->sum);
-               sector_sum->bytenr = disk_bytenr;
+               btrfs_csum_final(sums->sums[index],
+                                (char *)(sums->sums + index));
 
-               sector_sum++;
                bio_index++;
+               index++;
                total_bytes += bvec->bv_len;
                this_sum_bytes += bvec->bv_len;
-               disk_bytenr += bvec->bv_len;
                offset += bvec->bv_len;
                bvec++;
        }
@@ -672,62 +661,46 @@ out:
        return ret;
 }
 
-static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
-                                struct btrfs_sector_sum *sector_sum,
-                                u64 total_bytes, u64 sectorsize)
-{
-       u64 tmp = sectorsize;
-       u64 next_sector = sector_sum->bytenr;
-       struct btrfs_sector_sum *next = sector_sum + 1;
-
-       while ((tmp + total_bytes) < sums->len) {
-               if (next_sector + sectorsize != next->bytenr)
-                       break;
-               tmp += sectorsize;
-               next_sector = next->bytenr;
-               next++;
-       }
-       return tmp;
-}
-
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_ordered_sum *sums)
 {
-       u64 bytenr;
-       int ret;
        struct btrfs_key file_key;
        struct btrfs_key found_key;
-       u64 next_offset;
-       u64 total_bytes = 0;
-       int found_next;
        struct btrfs_path *path;
        struct btrfs_csum_item *item;
        struct btrfs_csum_item *item_end;
        struct extent_buffer *leaf = NULL;
+       u64 next_offset;
+       u64 total_bytes = 0;
        u64 csum_offset;
-       struct btrfs_sector_sum *sector_sum;
+       u64 bytenr;
        u32 nritems;
        u32 ins_size;
+       int index = 0;
+       int found_next;
+       int ret;
        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-
-       sector_sum = sums->sums;
 again:
        next_offset = (u64)-1;
        found_next = 0;
+       bytenr = sums->bytenr + total_bytes;
        file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
-       file_key.offset = sector_sum->bytenr;
-       bytenr = sector_sum->bytenr;
+       file_key.offset = bytenr;
        btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 
-       item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+       item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
        if (!IS_ERR(item)) {
-               leaf = path->nodes[0];
                ret = 0;
+               leaf = path->nodes[0];
+               item_end = btrfs_item_ptr(leaf, path->slots[0],
+                                         struct btrfs_csum_item);
+               item_end = (struct btrfs_csum_item *)((char *)item_end +
+                          btrfs_item_size_nr(leaf, path->slots[0]));
                goto found;
        }
        ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
 
                free_space = btrfs_leaf_free_space(root, leaf) -
                                         sizeof(struct btrfs_item) - csum_size;
-               tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
-                                           root->sectorsize);
+               tmp = sums->len - total_bytes;
                tmp >>= root->fs_info->sb->s_blocksize_bits;
                WARN_ON(tmp < 1);
 
@@ -822,6 +794,7 @@ again:
                diff *= csum_size;
 
                btrfs_extend_item(root, path, diff);
+               ret = 0;
                goto csum;
        }
 
@@ -831,8 +804,7 @@ insert:
        if (found_next) {
                u64 tmp;
 
-               tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes,
-                                           root->sectorsize);
+               tmp = sums->len - total_bytes;
                tmp >>= root->fs_info->sb->s_blocksize_bits;
                tmp = min(tmp, (next_offset - file_key.offset) >>
                                         root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
                WARN_ON(1);
                goto fail_unlock;
        }
-csum:
        leaf = path->nodes[0];
+csum:
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
-       ret = 0;
+       item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+                                     btrfs_item_size_nr(leaf, path->slots[0]));
        item = (struct btrfs_csum_item *)((unsigned char *)item +
                                          csum_offset * csum_size);
 found:
-       item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
-       item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
-                                     btrfs_item_size_nr(leaf, path->slots[0]));
-next_sector:
-
-       write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
-
-       total_bytes += root->sectorsize;
-       sector_sum++;
-       if (total_bytes < sums->len) {
-               item = (struct btrfs_csum_item *)((char *)item +
-                                                 csum_size);
-               if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
-                   sector_sum->bytenr) {
-                       bytenr = sector_sum->bytenr;
-                       goto next_sector;
-               }
-       }
+       ins_size = (u32)(sums->len - total_bytes) >>
+                  root->fs_info->sb->s_blocksize_bits;
+       ins_size *= csum_size;
+       ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+                             ins_size);
+       write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+                           ins_size);
+
+       ins_size /= csum_size;
+       total_bytes += ins_size * root->sectorsize;
+       index += ins_size;
 
        btrfs_mark_buffer_dirty(path->nodes[0]);
        if (total_bytes < sums->len) {
index 89da56a58b635c9bf80197c0cf32e2dc3f698442..a005fe2c072ad0751254adba0fa4e04db10cc996 100644 (file)
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
                ret = PTR_ERR(inode_root);
                goto cleanup;
        }
-       if (btrfs_root_refs(&inode_root->root_item) == 0) {
-               ret = -ENOENT;
-               goto cleanup;
-       }
 
        key.objectid = defrag->ino;
        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1317,6 +1313,56 @@ fail:
 
 }
 
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+                                   size_t *write_bytes)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_ordered_extent *ordered;
+       u64 lockstart, lockend;
+       u64 num_bytes;
+       int ret;
+
+       lockstart = round_down(pos, root->sectorsize);
+       lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+
+       while (1) {
+               lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+               ordered = btrfs_lookup_ordered_range(inode, lockstart,
+                                                    lockend - lockstart + 1);
+               if (!ordered) {
+                       break;
+               }
+               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+       }
+
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+               return PTR_ERR(trans);
+       }
+
+       num_bytes = lockend - lockstart + 1;
+       ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
+                              NULL);
+       btrfs_end_transaction(trans, root);
+       if (ret <= 0) {
+               ret = 0;
+       } else {
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                EXTENT_DIRTY | EXTENT_DELALLOC |
+                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+                                NULL, GFP_NOFS);
+               *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+       }
+
+       unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+       return ret;
+}
+
 static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                               struct iov_iter *i,
                                               loff_t pos)
@@ -1324,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page **pages = NULL;
+       u64 release_bytes = 0;
        unsigned long first_index;
        size_t num_written = 0;
        int nrptrs;
        int ret = 0;
+       bool only_release_metadata = false;
        bool force_page_uptodate = false;
 
        nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                         offset);
                size_t num_pages = (write_bytes + offset +
                                    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+               size_t reserve_bytes;
                size_t dirty_pages;
                size_t copied;
 
@@ -1362,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                        break;
                }
 
-               ret = btrfs_delalloc_reserve_space(inode,
-                                       num_pages << PAGE_CACHE_SHIFT);
+               reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+               ret = btrfs_check_data_free_space(inode, reserve_bytes);
+               if (ret == -ENOSPC &&
+                   (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+                                             BTRFS_INODE_PREALLOC))) {
+                       ret = check_can_nocow(inode, pos, &write_bytes);
+                       if (ret > 0) {
+                               only_release_metadata = true;
+                               /*
+                                * our prealloc extent may be smaller than
+                                * write_bytes, so scale down.
+                                */
+                               num_pages = (write_bytes + offset +
+                                            PAGE_CACHE_SIZE - 1) >>
+                                       PAGE_CACHE_SHIFT;
+                               reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+                               ret = 0;
+                       } else {
+                               ret = -ENOSPC;
+                       }
+               }
+
                if (ret)
                        break;
 
+               ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+               if (ret) {
+                       if (!only_release_metadata)
+                               btrfs_free_reserved_data_space(inode,
+                                                              reserve_bytes);
+                       break;
+               }
+
+               release_bytes = reserve_bytes;
+
                /*
                 * This is going to setup the pages array with the number of
                 * pages we want, so we don't really need to worry about the
@@ -1375,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                ret = prepare_pages(root, file, pages, num_pages,
                                    pos, first_index, write_bytes,
                                    force_page_uptodate);
-               if (ret) {
-                       btrfs_delalloc_release_space(inode,
-                                       num_pages << PAGE_CACHE_SHIFT);
+               if (ret)
                        break;
-               }
 
                copied = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, i);
@@ -1409,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                 * managed to copy.
                 */
                if (num_pages > dirty_pages) {
+                       release_bytes = (num_pages - dirty_pages) <<
+                               PAGE_CACHE_SHIFT;
                        if (copied > 0) {
                                spin_lock(&BTRFS_I(inode)->lock);
                                BTRFS_I(inode)->outstanding_extents++;
                                spin_unlock(&BTRFS_I(inode)->lock);
                        }
-                       btrfs_delalloc_release_space(inode,
-                                       (num_pages - dirty_pages) <<
-                                       PAGE_CACHE_SHIFT);
+                       if (only_release_metadata)
+                               btrfs_delalloc_release_metadata(inode,
+                                                               release_bytes);
+                       else
+                               btrfs_delalloc_release_space(inode,
+                                                            release_bytes);
                }
 
+               release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
                if (copied > 0) {
                        ret = btrfs_dirty_pages(root, inode, pages,
                                                dirty_pages, pos, copied,
                                                NULL);
                        if (ret) {
-                               btrfs_delalloc_release_space(inode,
-                                       dirty_pages << PAGE_CACHE_SHIFT);
                                btrfs_drop_pages(pages, num_pages);
                                break;
                        }
                }
 
+               release_bytes = 0;
                btrfs_drop_pages(pages, num_pages);
 
+               if (only_release_metadata && copied > 0) {
+                       u64 lockstart = round_down(pos, root->sectorsize);
+                       u64 lockend = lockstart +
+                               (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+                       set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                      lockend, EXTENT_NORESERVE, NULL,
+                                      NULL, GFP_NOFS);
+                       only_release_metadata = false;
+               }
+
                cond_resched();
 
                balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
        kfree(pages);
 
+       if (release_bytes) {
+               if (only_release_metadata)
+                       btrfs_delalloc_release_metadata(inode, release_bytes);
+               else
+                       btrfs_delalloc_release_space(inode, release_bytes);
+       }
+
        return num_written ? num_written : ret;
 }
 
@@ -2175,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode,
                        goto out_reserve_fail;
        }
 
-       /*
-        * wait for ordered IO before we have any locks.  We'll loop again
-        * below with the locks held.
-        */
-       btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
        mutex_lock(&inode->i_mutex);
        ret = inode_newsize_ok(inode, alloc_end);
        if (ret)
@@ -2191,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
                                        alloc_start);
                if (ret)
                        goto out;
+       } else {
+               /*
+                * If we are fallocating from the end of the file onward we
+                * need to zero out the end of the page if i_size lands in the
+                * middle of a page.
+                */
+               ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+               if (ret)
+                       goto out;
        }
 
+       /*
+        * wait for ordered IO before we have any locks.  We'll loop again
+        * below with the locks held.
+        */
+       btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
        locked_end = alloc_end - 1;
        while (1) {
                struct btrfs_ordered_extent *ordered;
index 2750b50235269d2304ce45a906b2c5b43dae8b5c..b21a3cd667d8cc656878b8d462aa7cd45ebc8435 100644 (file)
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
        else
                ret = 0;
        spin_unlock(&rsv->lock);
-       return 0;
+       return ret;
 }
 
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
        return 0;
 }
 
+#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
+
 /*
  * This test just does basic sanity checking, making sure we can add an exten
  * entry and remove space from either end and the middle, and make sure we can
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
 {
        int ret = 0;
 
-       printk(KERN_ERR "Running extent only tests\n");
+       test_msg("Running extent only tests\n");
 
        /* First just make sure we can remove an entire entry */
        ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error adding initial extents %d\n", ret);
+               test_msg("Error adding initial extents %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing extent %d\n", ret);
+               test_msg("Error removing extent %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-               printk(KERN_ERR "Full remove left some lingering space\n");
+               test_msg("Full remove left some lingering space\n");
                return -1;
        }
 
        /* Ok edge and middle cases now */
        ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error adding half extent %d\n", ret);
+               test_msg("Error adding half extent %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing tail end %d\n", ret);
+               test_msg("Error removing tail end %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing front end %d\n", ret);
+               test_msg("Error removing front end %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
        if (ret) {
-               printk(KERN_ERR "Error removing middle piece %d\n", ret);
+               test_msg("Error removing middle piece %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-               printk(KERN_ERR "Still have space at the front\n");
+               test_msg("Still have space at the front\n");
                return -1;
        }
 
        if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
-               printk(KERN_ERR "Still have space in the middle\n");
+               test_msg("Still have space in the middle\n");
                return -1;
        }
 
        if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
-               printk(KERN_ERR "Still have space at the end\n");
+               test_msg("Still have space at the end\n");
                return -1;
        }
 
@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
        u64 next_bitmap_offset;
        int ret;
 
-       printk(KERN_ERR "Running bitmap only tests\n");
+       test_msg("Running bitmap only tests\n");
 
        ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret);
+               test_msg("Couldn't create a bitmap entry %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing bitmap full range %d\n", ret);
+               test_msg("Error removing bitmap full range %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-               printk(KERN_ERR "Left some space in bitmap\n");
+               test_msg("Left some space in bitmap\n");
                return -1;
        }
 
        ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret);
+               test_msg("Couldn't add to our bitmap entry %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret);
+               test_msg("Couldn't remove middle chunk %d\n", ret);
                return ret;
        }
 
@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
        ret = add_free_space_entry(cache, next_bitmap_offset -
                                   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add space that straddles two bitmaps"
-                      " %d\n", ret);
+               test_msg("Couldn't add space that straddles two bitmaps %d\n",
+                               ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, next_bitmap_offset -
                                      (1 * 1024 * 1024), 2 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+               test_msg("Couldn't remove overlapping space %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
                         2 * 1024 * 1024)) {
-               printk(KERN_ERR "Left some space when removing overlapping\n");
+               test_msg("Left some space when removing overlapping\n");
                return -1;
        }
 
@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
        u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
        int ret;
 
-       printk(KERN_ERR "Running bitmap and extent tests\n");
+       test_msg("Running bitmap and extent tests\n");
 
        /*
         * First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
         */
        ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret);
+               test_msg("Couldn't create bitmap entry %d\n", ret);
                return ret;
        }
 
        ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+               test_msg("Couldn't add extent entry %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove extent entry %d\n", ret);
+               test_msg("Couldn't remove extent entry %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-               printk(KERN_ERR "Left remnants after our remove\n");
+               test_msg("Left remnants after our remove\n");
                return -1;
        }
 
        /* Now to add back the extent entry and remove from the bitmap */
        ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret);
+               test_msg("Couldn't re-add extent entry %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret);
+               test_msg("Couldn't remove from bitmap %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
-               printk(KERN_ERR "Left remnants in the bitmap\n");
+               test_msg("Left remnants in the bitmap\n");
                return -1;
        }
 
@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
         */
        ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret);
+               test_msg("Couldn't add to a bitmap %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+               test_msg("Couldn't remove overlapping space %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
-               printk(KERN_ERR "Left over peices after removing "
-                      "overlapping\n");
+               test_msg("Left over peices after removing overlapping\n");
                return -1;
        }
 
@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
        /* Now with the extent entry offset into the bitmap */
        ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret);
+               test_msg("Couldn't add space to the bitmap %d\n", ret);
                return ret;
        }
 
        ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret);
+               test_msg("Couldn't add extent to the cache %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Problem removing overlapping space %d\n", ret);
+               test_msg("Problem removing overlapping space %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
-               printk(KERN_ERR "Left something behind when removing space");
+               test_msg("Left something behind when removing space");
                return -1;
        }
 
@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
        ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
                                   4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add bitmap %d\n", ret);
+               test_msg("Couldn't add bitmap %d\n", ret);
                return ret;
        }
 
        ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
                                   5 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+               test_msg("Couldn't add extent entry %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
                                      5 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Failed to free our space %d\n", ret);
+               test_msg("Failed to free our space %d\n", ret);
                return ret;
        }
 
        if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
                         5 * 1024 * 1024)) {
-               printk(KERN_ERR "Left stuff over\n");
+               test_msg("Left stuff over\n");
                return -1;
        }
 
@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
         */
        ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret);
+               test_msg("Couldn't add bitmap entry %d\n", ret);
                return ret;
        }
 
        ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+               test_msg("Couldn't add extent entry %d\n", ret);
                return ret;
        }
 
        ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing bitmap and extent "
-                      "overlapping %d\n", ret);
+               test_msg("Error removing bitmap and extent overlapping %d\n", ret);
                return ret;
        }
 
@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
 {
        struct btrfs_block_group_cache *cache;
 
-       printk(KERN_ERR "Running btrfs free space cache tests\n");
+       test_msg("Running btrfs free space cache tests\n");
 
        cache = init_test_block_group();
        if (!cache) {
-               printk(KERN_ERR "Couldn't run the tests\n");
+               test_msg("Couldn't run the tests\n");
                return;
        }
 
@@ -3487,6 +3487,9 @@ out:
        __btrfs_remove_free_space_cache(cache->free_space_ctl);
        kfree(cache->free_space_ctl);
        kfree(cache);
-       printk(KERN_ERR "Free space cache tests finished\n");
+       test_msg("Free space cache tests finished\n");
 }
-#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+#undef test_msg
+#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+void btrfs_test_free_space_cache(void) {}
+#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
index 8b7f19f4496153886975547923cdcdc815940e96..894116b71304c391aca53d31e4f063913952a6e6 100644 (file)
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
                           u64 *trimmed, u64 start, u64 end, u64 minlen);
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_free_space_cache(void);
-#endif
 
 #endif
index 4f9d16b70d3d87da9dd6e3cae926dbaaf4fa3345..6d1b93c8aafb8a4d7b832cab8585ebf1ac1ced42 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/mount.h>
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -57,6 +58,7 @@
 #include "free-space-cache.h"
 #include "inode-map.h"
 #include "backref.h"
+#include "hash.h"
 
 struct btrfs_iget_args {
        u64 ino;
@@ -701,8 +703,12 @@ retry:
                        async_extent->nr_pages = 0;
                        async_extent->pages = NULL;
 
-                       if (ret == -ENOSPC)
+                       if (ret == -ENOSPC) {
+                               unlock_extent(io_tree, async_extent->start,
+                                             async_extent->start +
+                                             async_extent->ram_size - 1);
                                goto retry;
+                       }
                        goto out_free;
                }
 
@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
        spin_unlock(&BTRFS_I(inode)->lock);
 }
 
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+                                     struct inode *inode)
+{
+       spin_lock(&root->delalloc_lock);
+       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+                             &root->delalloc_inodes);
+               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                       &BTRFS_I(inode)->runtime_flags);
+               root->nr_delalloc_inodes++;
+               if (root->nr_delalloc_inodes == 1) {
+                       spin_lock(&root->fs_info->delalloc_root_lock);
+                       BUG_ON(!list_empty(&root->delalloc_root));
+                       list_add_tail(&root->delalloc_root,
+                                     &root->fs_info->delalloc_roots);
+                       spin_unlock(&root->fs_info->delalloc_root_lock);
+               }
+       }
+       spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+                                    struct inode *inode)
+{
+       spin_lock(&root->delalloc_lock);
+       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                         &BTRFS_I(inode)->runtime_flags);
+               root->nr_delalloc_inodes--;
+               if (!root->nr_delalloc_inodes) {
+                       spin_lock(&root->fs_info->delalloc_root_lock);
+                       BUG_ON(list_empty(&root->delalloc_root));
+                       list_del_init(&root->delalloc_root);
+                       spin_unlock(&root->fs_info->delalloc_root_lock);
+               }
+       }
+       spin_unlock(&root->delalloc_lock);
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                        &BTRFS_I(inode)->runtime_flags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-                                             &root->fs_info->delalloc_inodes);
-                               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                       &BTRFS_I(inode)->runtime_flags);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-               }
+                                        &BTRFS_I(inode)->runtime_flags))
+                       btrfs_add_delalloc_inodes(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                        btrfs_delalloc_release_metadata(inode, len);
 
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-                   && do_list)
+                   && do_list && !(state->state & EXTENT_NORESERVE))
                        btrfs_free_reserved_data_space(inode, len);
 
                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                BTRFS_I(inode)->delalloc_bytes -= len;
                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
                    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                            &BTRFS_I(inode)->runtime_flags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
-                               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                         &BTRFS_I(inode)->runtime_flags);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-               }
+                            &BTRFS_I(inode)->runtime_flags))
+                       btrfs_del_delalloc_inode(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
@@ -2263,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
                        return 0;
                return PTR_ERR(root);
        }
-       if (btrfs_root_refs(&root->root_item) == 0) {
-               srcu_read_unlock(&fs_info->subvol_srcu, index);
-               /* parse ENOENT to 0 */
-               return 0;
-       }
 
        /* step 2: get inode */
        key.objectid = backref->inum;
@@ -3215,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                        /* 1 for the orphan item deletion. */
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
+                               iput(inode);
                                ret = PTR_ERR(trans);
                                goto out;
                        }
                        ret = btrfs_orphan_add(trans, inode);
                        btrfs_end_transaction(trans, root);
-                       if (ret)
+                       if (ret) {
+                               iput(inode);
                                goto out;
+                       }
 
                        ret = btrfs_truncate(inode);
                        if (ret)
@@ -3274,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 {
        u32 nritems = btrfs_header_nritems(leaf);
        struct btrfs_key found_key;
+       static u64 xattr_access = 0;
+       static u64 xattr_default = 0;
        int scanned = 0;
 
+       if (!xattr_access) {
+               xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+                                       strlen(POSIX_ACL_XATTR_ACCESS));
+               xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+                                       strlen(POSIX_ACL_XATTR_DEFAULT));
+       }
+
        slot++;
        while (slot < nritems) {
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
                        return 0;
 
                /* we found an xattr, assume we've got an acl */
-               if (found_key.type == BTRFS_XATTR_ITEM_KEY)
-                       return 1;
+               if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+                       if (found_key.offset == xattr_access ||
+                           found_key.offset == xattr_default)
+                               return 1;
+               }
 
                /*
                 * we found a key greater than an xattr key, there can't
@@ -3660,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        }
        return ret;
 }
-               
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
-                            struct btrfs_path *path)
-{
-       struct extent_buffer *eb;
-       int level;
-       u64 refs = 1;
-
-       for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-               int ret;
-
-               if (!path->nodes[level])
-                       break;
-               eb = path->nodes[level];
-               if (!btrfs_block_can_be_shared(root, eb))
-                       continue;
-               ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
-                                              &refs, NULL);
-               if (refs > 1)
-                       return 1;
-       }
-       return 0;
-}
 
 /*
  * helper to start transaction for unlink and rmdir.
  *
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
  */
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
-                                                      struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
-       struct btrfs_path *path;
-       struct btrfs_dir_item *di;
-       struct inode *inode = dentry->d_inode;
-       u64 index;
-       int check_link = 1;
-       int err = -ENOSPC;
        int ret;
-       u64 ino = btrfs_ino(inode);
-       u64 dir_ino = btrfs_ino(dir);
 
        /*
         * 1 for the possible orphan item
@@ -3719,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
                return trans;
 
-       if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
-               return ERR_PTR(-ENOSPC);
-
-       /* check if there is someone else holds reference */
-       if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
-               return ERR_PTR(-ENOSPC);
-
-       if (atomic_read(&inode->i_count) > 2)
-               return ERR_PTR(-ENOSPC);
-
-       if (xchg(&root->fs_info->enospc_unlink, 1))
-               return ERR_PTR(-ENOSPC);
-
-       path = btrfs_alloc_path();
-       if (!path) {
-               root->fs_info->enospc_unlink = 0;
-               return ERR_PTR(-ENOMEM);
-       }
+       if (PTR_ERR(trans) == -ENOSPC) {
+               u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
 
-       /* 1 for the orphan item */
-       trans = btrfs_start_transaction(root, 1);
-       if (IS_ERR(trans)) {
-               btrfs_free_path(path);
-               root->fs_info->enospc_unlink = 0;
-               return trans;
-       }
-
-       path->skip_locking = 1;
-       path->search_commit_root = 1;
-
-       ret = btrfs_lookup_inode(trans, root, path,
-                               &BTRFS_I(dir)->location, 0);
-       if (ret < 0) {
-               err = ret;
-               goto out;
-       }
-       if (ret == 0) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               check_link = 0;
-       }
-       btrfs_release_path(path);
-
-       ret = btrfs_lookup_inode(trans, root, path,
-                               &BTRFS_I(inode)->location, 0);
-       if (ret < 0) {
-               err = ret;
-               goto out;
-       }
-       if (ret == 0) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               check_link = 0;
-       }
-       btrfs_release_path(path);
-
-       if (ret == 0 && S_ISREG(inode->i_mode)) {
-               ret = btrfs_lookup_file_extent(trans, root, path,
-                                              ino, (u64)-1, 0);
-               if (ret < 0) {
-                       err = ret;
-                       goto out;
+               trans = btrfs_start_transaction(root, 0);
+               if (IS_ERR(trans))
+                       return trans;
+               ret = btrfs_cond_migrate_bytes(root->fs_info,
+                                              &root->fs_info->trans_block_rsv,
+                                              num_bytes, 5);
+               if (ret) {
+                       btrfs_end_transaction(trans, root);
+                       return ERR_PTR(ret);
                }
-               BUG_ON(ret == 0); /* Corruption */
-               if (check_path_shared(root, path))
-                       goto out;
-               btrfs_release_path(path);
-       }
-
-       if (!check_link) {
-               err = 0;
-               goto out;
-       }
-
-       di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
-                               dentry->d_name.name, dentry->d_name.len, 0);
-       if (IS_ERR(di)) {
-               err = PTR_ERR(di);
-               goto out;
-       }
-       if (di) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               err = 0;
-               goto out;
-       }
-       btrfs_release_path(path);
-
-       ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
-                                       dentry->d_name.len, ino, dir_ino, 0,
-                                       &index);
-       if (ret) {
-               err = ret;
-               goto out;
-       }
-
-       if (check_path_shared(root, path))
-               goto out;
-
-       btrfs_release_path(path);
-
-       /*
-        * This is a commit root search, if we can lookup inode item and other
-        * relative items in the commit root, it means the transaction of
-        * dir/file creation has been committed, and the dir index item that we
-        * delay to insert has also been inserted into the commit root. So
-        * we needn't worry about the delayed insertion of the dir index item
-        * here.
-        */
-       di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
-                               dentry->d_name.name, dentry->d_name.len, 0);
-       if (IS_ERR(di)) {
-               err = PTR_ERR(di);
-               goto out;
-       }
-       BUG_ON(ret == -ENOENT);
-       if (check_path_shared(root, path))
-               goto out;
-
-       err = 0;
-out:
-       btrfs_free_path(path);
-       /* Migrate the orphan reservation over */
-       if (!err)
-               err = btrfs_block_rsv_migrate(trans->block_rsv,
-                               &root->fs_info->global_block_rsv,
-                               trans->bytes_reserved);
-
-       if (err) {
-               btrfs_end_transaction(trans, root);
-               root->fs_info->enospc_unlink = 0;
-               return ERR_PTR(err);
-       }
-
-       trans->block_rsv = &root->fs_info->global_block_rsv;
-       return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root)
-{
-       if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
-               btrfs_block_rsv_release(root, trans->block_rsv,
-                                       trans->bytes_reserved);
                trans->block_rsv = &root->fs_info->trans_block_rsv;
-               BUG_ON(!root->fs_info->enospc_unlink);
-               root->fs_info->enospc_unlink = 0;
+               trans->bytes_reserved = num_bytes;
        }
-       btrfs_end_transaction(trans, root);
+       return trans;
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        struct inode *inode = dentry->d_inode;
        int ret;
 
-       trans = __unlink_start_trans(dir, dentry);
+       trans = __unlink_start_trans(dir);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
@@ -3898,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        }
 
 out:
-       __unlink_end_trans(trans, root);
+       btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
        return ret;
 }
@@ -3995,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
                return -EPERM;
 
-       trans = __unlink_start_trans(dir, dentry);
+       trans = __unlink_start_trans(dir);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
@@ -4017,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (!err)
                btrfs_i_size_write(inode, 0);
 out:
-       __unlink_end_trans(trans, root);
+       btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
 
        return err;
@@ -4395,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
        u64 hole_size;
        int err = 0;
 
+       /*
+        * If our size started in the middle of a page we need to zero out the
+        * rest of the page before we expand the i_size, otherwise we could
+        * expose stale data.
+        */
+       err = btrfs_truncate_page(inode, oldsize, 0, 0);
+       if (err)
+               return err;
+
        if (size <= hole_start)
                return 0;
 
@@ -4822,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
                goto out;
        }
 
-       if (btrfs_root_refs(&new_root->root_item) == 0) {
-               err = -ENOENT;
-               goto out;
-       }
-
        *sub_root = new_root;
        location->objectid = btrfs_root_dirid(&new_root->root_item);
        location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                if (!(inode->i_sb->s_flags & MS_RDONLY))
                        ret = btrfs_orphan_cleanup(sub_root);
                up_read(&root->fs_info->cleanup_work_sem);
-               if (ret)
+               if (ret) {
+                       iput(inode);
                        inode = ERR_PTR(ret);
+               }
        }
 
        return inode;
@@ -6501,10 +6380,10 @@ out:
  * returns 1 when the nocow is safe, < 1 on error, 0 if the
  * block must be cow'd
  */
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
-                                     struct inode *inode, u64 offset, u64 *len,
-                                     u64 *orig_start, u64 *orig_block_len,
-                                     u64 *ram_bytes)
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+                             struct inode *inode, u64 offset, u64 *len,
+                             u64 *orig_start, u64 *orig_block_len,
+                             u64 *ram_bytes)
 {
        struct btrfs_path *path;
        int ret;
@@ -6518,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
        u64 num_bytes;
        int slot;
        int found_type;
-
+       bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -6558,18 +6437,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
                /* not a regular extent, must cow */
                goto out;
        }
+
+       if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+               goto out;
+
        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+       if (disk_bytenr == 0)
+               goto out;
+
+       if (btrfs_file_extent_compression(leaf, fi) ||
+           btrfs_file_extent_encryption(leaf, fi) ||
+           btrfs_file_extent_other_encoding(leaf, fi))
+               goto out;
+
        backref_offset = btrfs_file_extent_offset(leaf, fi);
 
-       *orig_start = key.offset - backref_offset;
-       *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
-       *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+       if (orig_start) {
+               *orig_start = key.offset - backref_offset;
+               *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+               *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+       }
 
        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-       if (extent_end < offset + *len) {
-               /* extent doesn't include our full range, must cow */
-               goto out;
-       }
 
        if (btrfs_extent_readonly(root, disk_bytenr))
                goto out;
@@ -6813,8 +6702,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                if (IS_ERR(trans))
                        goto must_cow;
 
-               if (can_nocow_odirect(trans, inode, start, &len, &orig_start,
-                                     &orig_block_len, &ram_bytes) == 1) {
+               if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+                                    &orig_block_len, &ram_bytes) == 1) {
                        if (type == BTRFS_ORDERED_PREALLOC) {
                                free_extent_map(em);
                                em = create_pinned_em(inode, start, len,
@@ -7243,7 +7132,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_dio_private *dip;
-       struct bio_vec *bvec = dio_bio->bi_io_vec;
        struct bio *io_bio;
        int skip_sum;
        int write = rw & REQ_WRITE;
@@ -7265,16 +7153,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
        }
 
        dip->private = dio_bio->bi_private;
-       io_bio->bi_private = dio_bio->bi_private;
        dip->inode = inode;
        dip->logical_offset = file_offset;
-
-       dip->bytes = 0;
-       do {
-               dip->bytes += bvec->bv_len;
-               bvec++;
-       } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
-
+       dip->bytes = dio_bio->bi_size;
        dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
        io_bio->bi_private = dip;
        dip->errors = 0;
@@ -7373,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
        atomic_inc(&inode->i_dio_count);
        smp_mb__after_atomic_inc();
 
+       /*
+        * The generic stuff only does filemap_write_and_wait_range, which isn't
+        * enough if we've written compressed pages to this area, so we need to
+        * call btrfs_wait_ordered_range to make absolutely sure that any
+        * outstanding dirty pages are on disk.
+        */
+       count = iov_length(iov, nr_segs);
+       btrfs_wait_ordered_range(inode, offset, count);
+
        if (rw & WRITE) {
-               count = iov_length(iov, nr_segs);
                /*
                 * If the write DIO is beyond the EOF, we need update
                 * the isize, but it is protected by i_mutex. So we can
@@ -7694,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_block_rsv *rsv;
-       int ret;
+       int ret = 0;
        int err = 0;
        struct btrfs_trans_handle *trans;
        u64 mask = root->sectorsize - 1;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-       ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
-       if (ret)
-               return ret;
-
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
@@ -7961,9 +7846,9 @@ void btrfs_destroy_inode(struct inode *inode)
         */
        smp_mb();
        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-               spin_lock(&root->fs_info->ordered_extent_lock);
+               spin_lock(&root->fs_info->ordered_root_lock);
                list_del_init(&BTRFS_I(inode)->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_extent_lock);
+               spin_unlock(&root->fs_info->ordered_root_lock);
        }
 
        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8333,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
        struct btrfs_inode *binode;
        struct inode *inode;
@@ -8342,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        struct list_head splice;
        int ret = 0;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
-
        INIT_LIST_HEAD(&works);
        INIT_LIST_HEAD(&splice);
 
-       spin_lock(&root->fs_info->delalloc_lock);
-       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       spin_lock(&root->delalloc_lock);
+       list_splice_init(&root->delalloc_inodes, &splice);
        while (!list_empty(&splice)) {
                binode = list_entry(splice.next, struct btrfs_inode,
                                    delalloc_inodes);
 
-               list_del_init(&binode->delalloc_inodes);
-
+               list_move_tail(&binode->delalloc_inodes,
+                              &root->delalloc_inodes);
                inode = igrab(&binode->vfs_inode);
                if (!inode) {
-                       clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                 &binode->runtime_flags);
+                       cond_resched_lock(&root->delalloc_lock);
                        continue;
                }
-
-               list_add_tail(&binode->delalloc_inodes,
-                             &root->fs_info->delalloc_inodes);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&root->delalloc_lock);
 
                work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
                if (unlikely(!work)) {
@@ -8377,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                                   &work->work);
 
                cond_resched();
-               spin_lock(&root->fs_info->delalloc_lock);
+               spin_lock(&root->delalloc_lock);
        }
-       spin_unlock(&root->fs_info->delalloc_lock);
+       spin_unlock(&root->delalloc_lock);
 
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                btrfs_wait_and_free_delalloc_work(work);
        }
+       return 0;
+out:
+       list_for_each_entry_safe(work, next, &works, list) {
+               list_del_init(&work->list);
+               btrfs_wait_and_free_delalloc_work(work);
+       }
+
+       if (!list_empty_careful(&splice)) {
+               spin_lock(&root->delalloc_lock);
+               list_splice_tail(&splice, &root->delalloc_inodes);
+               spin_unlock(&root->delalloc_lock);
+       }
+       return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+       int ret;
 
-       /* the filemap_flush will queue IO into the worker threads, but
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       ret = __start_delalloc_inodes(root, delay_iput);
+       /*
+        * the filemap_flush will queue IO into the worker threads, but
         * we have to make sure the IO is actually started and that
         * ordered extents get created before we return
         */
@@ -8398,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
        }
        atomic_dec(&root->fs_info->async_submit_draining);
-       return 0;
-out:
-       list_for_each_entry_safe(work, next, &works, list) {
-               list_del_init(&work->list);
-               btrfs_wait_and_free_delalloc_work(work);
+       return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+                                   int delay_iput)
+{
+       struct btrfs_root *root;
+       struct list_head splice;
+       int ret;
+
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&fs_info->delalloc_root_lock);
+       list_splice_init(&fs_info->delalloc_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                       delalloc_root);
+               root = btrfs_grab_fs_root(root);
+               BUG_ON(!root);
+               list_move_tail(&root->delalloc_root,
+                              &fs_info->delalloc_roots);
+               spin_unlock(&fs_info->delalloc_root_lock);
+
+               ret = __start_delalloc_inodes(root, delay_iput);
+               btrfs_put_fs_root(root);
+               if (ret)
+                       goto out;
+
+               spin_lock(&fs_info->delalloc_root_lock);
        }
+       spin_unlock(&fs_info->delalloc_root_lock);
 
+       atomic_inc(&fs_info->async_submit_draining);
+       while (atomic_read(&fs_info->nr_async_submits) ||
+             atomic_read(&fs_info->async_delalloc_pages)) {
+               wait_event(fs_info->async_submit_wait,
+                  (atomic_read(&fs_info->nr_async_submits) == 0 &&
+                   atomic_read(&fs_info->async_delalloc_pages) == 0));
+       }
+       atomic_dec(&fs_info->async_submit_draining);
+       return 0;
+out:
        if (!list_empty_careful(&splice)) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_lock(&fs_info->delalloc_root_lock);
+               list_splice_tail(&splice, &fs_info->delalloc_roots);
+               spin_unlock(&fs_info->delalloc_root_lock);
        }
        return ret;
 }
index cd7e96c73cb71df0589f1866346ab5ff2714eb96..238a05545ee2230629fc850191f348b94cadd8cf 100644 (file)
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
        if (!root->ref_cows)
                return -EINVAL;
 
+       ret = btrfs_start_delalloc_inodes(root, 0);
+       if (ret)
+               return ret;
+
+       btrfs_wait_ordered_extents(root, 0);
+
        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
        if (!pending_snapshot)
                return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
        if (ret)
                return ret;
 
-       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
-                       1)) {
-               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-               mnt_drop_write_file(file);
-               return -EINVAL;
-       }
-
-       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
        }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-       ret = btrfs_rm_device(root, vol_args->name);
 
-       kfree(vol_args);
-out:
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+               goto out;
+       }
+
+       mutex_lock(&root->fs_info->volume_mutex);
+       ret = btrfs_rm_device(root, vol_args->name);
        mutex_unlock(&root->fs_info->volume_mutex);
        atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+out:
+       kfree(vol_args);
        mnt_drop_write_file(file);
        return ret;
 }
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        int ret;
        u64 len = olen;
        u64 bs = root->fs_info->sb->s_blocksize;
+       int same_inode = 0;
 
        /*
         * TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
        ret = -EINVAL;
        if (src == inode)
-               goto out_fput;
+               same_inode = 1;
 
        /* the src must be open for reading */
        if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        }
        path->reada = 2;
 
-       if (inode < src) {
-               mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-               mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+       if (!same_inode) {
+               if (inode < src) {
+                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+                       mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+               } else {
+                       mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+               }
        } else {
-               mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-               mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+               mutex_lock(&src->i_mutex);
        }
 
        /* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
            !IS_ALIGNED(destoff, bs))
                goto out_unlock;
 
+       /* verify if ranges are overlapped within the same file */
+       if (same_inode) {
+               if (destoff + len > off && destoff < off + len)
+                       goto out_unlock;
+       }
+
        if (destoff > inode->i_size) {
                ret = btrfs_cont_expand(inode, inode->i_size, destoff);
                if (ret)
@@ -2846,7 +2863,8 @@ out:
        unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 out_unlock:
        mutex_unlock(&src->i_mutex);
-       mutex_unlock(&inode->i_mutex);
+       if (!same_inode)
+               mutex_unlock(&inode->i_mutex);
        vfree(buf);
        btrfs_free_path(path);
 out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
                goto out;
        }
 
-       if (btrfs_root_refs(&new_root->root_item) == 0) {
-               ret = -ENOENT;
-               goto out;
-       }
-
        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
                break;
        }
 
-       if (copy_to_user(arg, sa, sizeof(*sa)))
-               ret = -EFAULT;
-
        err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
        if (err && !ret)
                ret = err;
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
        return ret;
 }
 
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
 static long btrfs_ioctl_set_received_subvol(struct file *file,
                                            void __user *arg)
 {
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_quota_rescan(file, argp);
        case BTRFS_IOC_QUOTA_RESCAN_STATUS:
                return btrfs_ioctl_quota_rescan_status(file, argp);
+       case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+               return btrfs_ioctl_quota_rescan_wait(file, argp);
        case BTRFS_IOC_DEV_REPLACE:
                return btrfs_ioctl_dev_replace(root, argp);
        case BTRFS_IOC_GET_FSLABEL:
index 743b86fa4fcb326adea754c65e39f88fb114a518..f93151a98886d460bdf2c357c00a8c7613f2d08e 100644 (file)
@@ -31,8 +31,8 @@
 
 struct workspace {
        void *mem;
-       void *buf;      /* where compressed data goes */
-       void *cbuf;     /* where decompressed data goes */
+       void *buf;      /* where decompressed data goes */
+       void *cbuf;     /* where compressed data goes */
        struct list_head list;
 };
 
index 1ddd728541eea20d747c002f2890cfacd2eb19b4..81369827e5146552edd7428911faef52ca3e1de6 100644 (file)
@@ -24,6 +24,7 @@
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "extent_io.h"
+#include "disk-io.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                                      u64 start, u64 len, u64 disk_len,
                                      int type, int dio, int compress_type)
 {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                ordered_data_tree_panic(inode, -EEXIST, file_offset);
        spin_unlock_irq(&tree->lock);
 
-       spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+       spin_lock(&root->ordered_extent_lock);
        list_add_tail(&entry->root_extent_list,
-                     &BTRFS_I(inode)->root->fs_info->ordered_extents);
-       spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+                     &root->ordered_extents);
+       root->nr_ordered_extents++;
+       if (root->nr_ordered_extents == 1) {
+               spin_lock(&root->fs_info->ordered_root_lock);
+               BUG_ON(!list_empty(&root->ordered_root));
+               list_add_tail(&root->ordered_root,
+                             &root->fs_info->ordered_roots);
+               spin_unlock(&root->fs_info->ordered_root_lock);
+       }
+       spin_unlock(&root->ordered_extent_lock);
 
        return 0;
 }
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
        spin_unlock_irq(&tree->lock);
 
-       spin_lock(&root->fs_info->ordered_extent_lock);
+       spin_lock(&root->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
+       root->nr_ordered_extents--;
 
        trace_btrfs_ordered_extent_remove(inode, entry);
 
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
            !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
                list_del_init(&BTRFS_I(inode)->ordered_operations);
        }
-       spin_unlock(&root->fs_info->ordered_extent_lock);
+
+       if (!root->nr_ordered_extents) {
+               spin_lock(&root->fs_info->ordered_root_lock);
+               BUG_ON(list_empty(&root->ordered_root));
+               list_del_init(&root->ordered_root);
+               spin_unlock(&root->fs_info->ordered_root_lock);
+       }
+       spin_unlock(&root->ordered_extent_lock);
        wake_up(&entry->wait);
 }
 
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 {
        struct list_head splice, works;
-       struct list_head *cur;
        struct btrfs_ordered_extent *ordered, *next;
        struct inode *inode;
 
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
        INIT_LIST_HEAD(&works);
 
        mutex_lock(&root->fs_info->ordered_operations_mutex);
-       spin_lock(&root->fs_info->ordered_extent_lock);
-       list_splice_init(&root->fs_info->ordered_extents, &splice);
+       spin_lock(&root->ordered_extent_lock);
+       list_splice_init(&root->ordered_extents, &splice);
        while (!list_empty(&splice)) {
-               cur = splice.next;
-               ordered = list_entry(cur, struct btrfs_ordered_extent,
-                                    root_extent_list);
-               list_del_init(&ordered->root_extent_list);
-               atomic_inc(&ordered->refs);
-
+               ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+                                          root_extent_list);
+               list_move_tail(&ordered->root_extent_list,
+                              &root->ordered_extents);
                /*
                 * the inode may be getting freed (in sys_unlink path).
                 */
                inode = igrab(ordered->inode);
+               if (!inode) {
+                       cond_resched_lock(&root->ordered_extent_lock);
+                       continue;
+               }
 
-               spin_unlock(&root->fs_info->ordered_extent_lock);
+               atomic_inc(&ordered->refs);
+               spin_unlock(&root->ordered_extent_lock);
 
-               if (inode) {
-                       ordered->flush_work.func = btrfs_run_ordered_extent_work;
-                       list_add_tail(&ordered->work_list, &works);
-                       btrfs_queue_worker(&root->fs_info->flush_workers,
-                                          &ordered->flush_work);
-               } else {
-                       btrfs_put_ordered_extent(ordered);
-               }
+               ordered->flush_work.func = btrfs_run_ordered_extent_work;
+               list_add_tail(&ordered->work_list, &works);
+               btrfs_queue_worker(&root->fs_info->flush_workers,
+                                  &ordered->flush_work);
 
                cond_resched();
-               spin_lock(&root->fs_info->ordered_extent_lock);
+               spin_lock(&root->ordered_extent_lock);
        }
-       spin_unlock(&root->fs_info->ordered_extent_lock);
+       spin_unlock(&root->ordered_extent_lock);
 
        list_for_each_entry_safe(ordered, next, &works, work_list) {
                list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
        mutex_unlock(&root->fs_info->ordered_operations_mutex);
 }
 
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+                                   int delay_iput)
+{
+       struct btrfs_root *root;
+       struct list_head splice;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&fs_info->ordered_root_lock);
+       list_splice_init(&fs_info->ordered_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                       ordered_root);
+               root = btrfs_grab_fs_root(root);
+               BUG_ON(!root);
+               list_move_tail(&root->ordered_root,
+                              &fs_info->ordered_roots);
+               spin_unlock(&fs_info->ordered_root_lock);
+
+               btrfs_wait_ordered_extents(root, delay_iput);
+               btrfs_put_fs_root(root);
+
+               spin_lock(&fs_info->ordered_root_lock);
+       }
+       spin_unlock(&fs_info->ordered_root_lock);
+}
+
 /*
  * this is used during transaction commit to write all the inodes
  * added to the ordered operation list.  These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
        INIT_LIST_HEAD(&works);
 
        mutex_lock(&root->fs_info->ordered_operations_mutex);
-       spin_lock(&root->fs_info->ordered_extent_lock);
+       spin_lock(&root->fs_info->ordered_root_lock);
        list_splice_init(&cur_trans->ordered_operations, &splice);
        while (!list_empty(&splice)) {
                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
                if (!wait)
                        list_add_tail(&BTRFS_I(inode)->ordered_operations,
                                      &cur_trans->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_extent_lock);
+               spin_unlock(&root->fs_info->ordered_root_lock);
 
                work = btrfs_alloc_delalloc_work(inode, wait, 1);
                if (!work) {
-                       spin_lock(&root->fs_info->ordered_extent_lock);
+                       spin_lock(&root->fs_info->ordered_root_lock);
                        if (list_empty(&BTRFS_I(inode)->ordered_operations))
                                list_add_tail(&btrfs_inode->ordered_operations,
                                              &splice);
                        list_splice_tail(&splice,
                                         &cur_trans->ordered_operations);
-                       spin_unlock(&root->fs_info->ordered_extent_lock);
+                       spin_unlock(&root->fs_info->ordered_root_lock);
                        ret = -ENOMEM;
                        goto out;
                }
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
                                   &work->work);
 
                cond_resched();
-               spin_lock(&root->fs_info->ordered_extent_lock);
+               spin_lock(&root->fs_info->ordered_root_lock);
        }
-       spin_unlock(&root->fs_info->ordered_extent_lock);
+       spin_unlock(&root->fs_info->ordered_root_lock);
 out:
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                           u32 *sum, int len)
 {
        struct btrfs_ordered_sum *ordered_sum;
-       struct btrfs_sector_sum *sector_sums;
        struct btrfs_ordered_extent *ordered;
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
        unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
                        i = (disk_bytenr - ordered_sum->bytenr) >>
                            inode->i_sb->s_blocksize_bits;
-                       sector_sums = ordered_sum->sums + i;
                        num_sectors = ordered_sum->len >>
                                      inode->i_sb->s_blocksize_bits;
-                       for (; i < num_sectors; i++) {
-                               if (sector_sums[i].bytenr == disk_bytenr) {
-                                       sum[index] = sector_sums[i].sum;
-                                       index++;
-                                       if (index == len)
-                                               goto out;
-                                       disk_bytenr += sectorsize;
-                               }
-                       }
+                       num_sectors = min_t(int, len - index, num_sectors - i);
+                       memcpy(sum + index, ordered_sum->sums + i,
+                              num_sectors);
+
+                       index += (int)num_sectors;
+                       if (index == len)
+                               goto out;
+                       disk_bytenr += num_sectors * sectorsize;
                }
        }
 out:
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
        if (last_mod < root->fs_info->last_trans_committed)
                return;
 
-       spin_lock(&root->fs_info->ordered_extent_lock);
+       spin_lock(&root->fs_info->ordered_root_lock);
        if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
                list_add_tail(&BTRFS_I(inode)->ordered_operations,
                              &cur_trans->ordered_operations);
        }
-       spin_unlock(&root->fs_info->ordered_extent_lock);
+       spin_unlock(&root->fs_info->ordered_root_lock);
 }
 
 int __init ordered_data_init(void)
index 58b0e3b0ebadb633b22916f3a50015c64c24a109..68844d59ee6f10e05403102fa1f53e87c270d030 100644 (file)
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
        struct rb_node *last;
 };
 
-/*
- * these are used to collect checksums done just before bios submission.
- * They are attached via a list into the ordered extent, and
- * checksum items are inserted into the tree after all the blocks in
- * the ordered extent are on disk
- */
-struct btrfs_sector_sum {
-       /* bytenr on disk */
-       u64 bytenr;
-       u32 sum;
-};
-
 struct btrfs_ordered_sum {
        /* bytenr is the start of this extent on disk */
        u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
        /*
         * this is the length in bytes covered by the sums array below.
         */
-       unsigned long len;
+       int len;
        struct list_head list;
-       /* last field is a variable length array of btrfs_sector_sums */
-       struct btrfs_sector_sum sums[];
+       /* last field is a variable length array of csums */
+       u32 sums[];
 };
 
 /*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
 static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
                                         unsigned long bytes)
 {
-       unsigned long num_sectors = (bytes + root->sectorsize - 1) /
-               root->sectorsize;
-       num_sectors++;
-       return sizeof(struct btrfs_ordered_sum) +
-               num_sectors * sizeof(struct btrfs_sector_sum);
+       int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+       return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
 }
 
 static inline void
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode);
 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+                                   int delay_iput);
 void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
 void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
index 9d49c586995a18e2aaca7aed94c84b6abd735a2c..1280eff8af56989dcc4a00e822527bd31ce2b3f9 100644 (file)
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
        struct btrfs_qgroup *member;
 };
 
-struct qgroup_rescan {
-       struct btrfs_work       work;
-       struct btrfs_fs_info    *fs_info;
-};
-
-static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
-                               struct qgroup_rescan *qscan);
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+                  int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
 
 /* must be called with qgroup_ioctl_lock held */
 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
        int slot;
        int ret = 0;
        u64 flags = 0;
+       u64 rescan_progress = 0;
 
        if (!fs_info->quota_enabled)
                return 0;
 
+       fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+       if (!fs_info->qgroup_ulist) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
                        }
                        fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
                                                                          ptr);
-                       fs_info->qgroup_rescan_progress.objectid =
-                                       btrfs_qgroup_status_rescan(l, ptr);
-                       if (fs_info->qgroup_flags &
-                           BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-                               struct qgroup_rescan *qscan =
-                                       kmalloc(sizeof(*qscan), GFP_NOFS);
-                               if (!qscan) {
-                                       ret = -ENOMEM;
-                                       goto out;
-                               }
-                               fs_info->qgroup_rescan_progress.type = 0;
-                               fs_info->qgroup_rescan_progress.offset = 0;
-                               qgroup_rescan_start(fs_info, qscan);
-                       }
+                       rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
                        goto next1;
                }
 
@@ -421,9 +412,18 @@ out:
        if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
                fs_info->quota_enabled = 0;
                fs_info->pending_quota_state = 0;
+       } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+                  ret >= 0) {
+               ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
        }
        btrfs_free_path(path);
 
+       if (ret < 0) {
+               ulist_free(fs_info->qgroup_ulist);
+               fs_info->qgroup_ulist = NULL;
+               fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+       }
+
        return ret < 0 ? ret : 0;
 }
 
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
                }
                kfree(qgroup);
        }
+       ulist_free(fs_info->qgroup_ulist);
 }
 
 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
                goto out;
        }
 
+       fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+       if (!fs_info->qgroup_ulist) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
        /*
         * initially create the quota tree
         */
@@ -916,6 +923,10 @@ out_free_root:
                kfree(quota_root);
        }
 out:
+       if (ret) {
+               ulist_free(fs_info->qgroup_ulist);
+               fs_info->qgroup_ulist = NULL;
+       }
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
 }
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
        u64 ref_root;
        struct btrfs_qgroup *qgroup;
        struct ulist *roots = NULL;
-       struct ulist *tmp = NULL;
        u64 seq;
        int ret = 0;
        int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
        if (ret < 0)
                return ret;
 
-       mutex_lock(&fs_info->qgroup_rescan_lock);
        spin_lock(&fs_info->qgroup_lock);
-       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-               if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
-                       ret = 0;
-                       goto unlock;
-               }
-       }
 
        quota_root = fs_info->quota_root;
        if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
        /*
         * step 1: for each old ref, visit all nodes once and inc refcnt
         */
-       tmp = ulist_alloc(GFP_ATOMIC);
-       if (!tmp) {
-               ret = -ENOMEM;
-               goto unlock;
-       }
+       ulist_reinit(fs_info->qgroup_ulist);
        seq = fs_info->qgroup_seq;
        fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
 
-       ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
+       ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
+                                      seq);
        if (ret)
                goto unlock;
 
        /*
         * step 2: walk from the new root
         */
-       ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn,
-                                      node->num_bytes, qgroup);
+       ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
+                                      seq, sgn, node->num_bytes, qgroup);
        if (ret)
                goto unlock;
 
        /*
         * step 3: walk again from old refs
         */
-       ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn,
-                                      node->num_bytes);
+       ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
+                                      seq, sgn, node->num_bytes);
        if (ret)
                goto unlock;
 
 unlock:
        spin_unlock(&fs_info->qgroup_lock);
-       mutex_unlock(&fs_info->qgroup_rescan_lock);
        ulist_free(roots);
-       ulist_free(tmp);
 
        return ret;
 }
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 
        if (!ret && start_rescan_worker) {
-               ret = btrfs_qgroup_rescan(fs_info);
-               if (ret)
-                       pr_err("btrfs: start rescan quota failed: %d\n", ret);
+               ret = qgroup_rescan_init(fs_info, 0, 1);
+               if (!ret) {
+                       qgroup_rescan_zero_tracking(fs_info);
+                       btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+                                          &fs_info->qgroup_rescan_work);
+               }
                ret = 0;
        }
 
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 ref_root = root->root_key.objectid;
        int ret = 0;
-       struct ulist *ulist = NULL;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
 
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
         * in a first step, we check all affected qgroups if any limits would
         * be exceeded
         */
-       ulist = ulist_alloc(GFP_ATOMIC);
-       if (!ulist) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       ret = ulist_add(ulist, qgroup->qgroupid,
+       ulist_reinit(fs_info->qgroup_ulist);
+       ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
                        (uintptr_t)qgroup, GFP_ATOMIC);
        if (ret < 0)
                goto out;
        ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(ulist, &uiter))) {
+       while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
                }
 
                list_for_each_entry(glist, &qg->groups, next_group) {
-                       ret = ulist_add(ulist, glist->group->qgroupid,
+                       ret = ulist_add(fs_info->qgroup_ulist,
+                                       glist->group->qgroupid,
                                        (uintptr_t)glist->group, GFP_ATOMIC);
                        if (ret < 0)
                                goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
         * no limits exceeded, now record the reservation into all qgroups
         */
        ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(ulist, &uiter))) {
+       while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
                struct btrfs_qgroup *qg;
 
                qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
 out:
        spin_unlock(&fs_info->qgroup_lock);
-       ulist_free(ulist);
-
        return ret;
 }
 
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
        struct btrfs_fs_info *fs_info = root->fs_info;
-       struct ulist *ulist = NULL;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
        if (!qgroup)
                goto out;
 
-       ulist = ulist_alloc(GFP_ATOMIC);
-       if (!ulist) {
-               btrfs_std_error(fs_info, -ENOMEM);
-               goto out;
-       }
-       ret = ulist_add(ulist, qgroup->qgroupid,
+       ulist_reinit(fs_info->qgroup_ulist);
+       ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
                        (uintptr_t)qgroup, GFP_ATOMIC);
        if (ret < 0)
                goto out;
        ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(ulist, &uiter))) {
+       while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
                qg->reserved -= num_bytes;
 
                list_for_each_entry(glist, &qg->groups, next_group) {
-                       ret = ulist_add(ulist, glist->group->qgroupid,
+                       ret = ulist_add(fs_info->qgroup_ulist,
+                                       glist->group->qgroupid,
                                        (uintptr_t)glist->group, GFP_ATOMIC);
                        if (ret < 0)
                                goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 
 out:
        spin_unlock(&fs_info->qgroup_lock);
-       ulist_free(ulist);
 }
 
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
  * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
  */
 static int
-qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path,
+qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                   struct btrfs_trans_handle *trans, struct ulist *tmp,
                   struct extent_buffer *scratch_leaf)
 {
        struct btrfs_key found;
-       struct btrfs_fs_info *fs_info = qscan->fs_info;
        struct ulist *roots = NULL;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
 
 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 {
-       struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan,
-                                                  work);
+       struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+                                                    qgroup_rescan_work);
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans = NULL;
-       struct btrfs_fs_info *fs_info = qscan->fs_info;
        struct ulist *tmp = NULL;
        struct extent_buffer *scratch_leaf = NULL;
        int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
                if (!fs_info->quota_enabled) {
                        err = -EINTR;
                } else {
-                       err = qgroup_rescan_leaf(qscan, path, trans,
+                       err = qgroup_rescan_leaf(fs_info, path, trans,
                                                 tmp, scratch_leaf);
                }
                if (err > 0)
@@ -2049,7 +2037,6 @@ out:
        kfree(scratch_leaf);
        ulist_free(tmp);
        btrfs_free_path(path);
-       kfree(qscan);
 
        mutex_lock(&fs_info->qgroup_rescan_lock);
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
        } else {
                pr_err("btrfs: qgroup scan failed with %d\n", err);
        }
-}
 
-static void
-qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
-{
-       memset(&qscan->work, 0, sizeof(qscan->work));
-       qscan->work.func = btrfs_qgroup_rescan_worker;
-       qscan->fs_info = fs_info;
-
-       pr_info("btrfs: qgroup scan started\n");
-       btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
+       complete_all(&fs_info->qgroup_rescan_completion);
 }
 
-int
-btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+/*
+ * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
+ * memory required for the rescan context.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+                  int init_flags)
 {
        int ret = 0;
-       struct rb_node *n;
-       struct btrfs_qgroup *qgroup;
-       struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
 
-       if (!qscan)
-               return -ENOMEM;
+       if (!init_flags &&
+           (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
+            !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
+               ret = -EINVAL;
+               goto err;
+       }
 
        mutex_lock(&fs_info->qgroup_rescan_lock);
        spin_lock(&fs_info->qgroup_lock);
-       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
-               ret = -EINPROGRESS;
-       else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
-               ret = -EINVAL;
-       if (ret) {
-               spin_unlock(&fs_info->qgroup_lock);
-               mutex_unlock(&fs_info->qgroup_rescan_lock);
-               kfree(qscan);
-               return ret;
+
+       if (init_flags) {
+               if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+                       ret = -EINPROGRESS;
+               else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+                       ret = -EINVAL;
+
+               if (ret) {
+                       spin_unlock(&fs_info->qgroup_lock);
+                       mutex_unlock(&fs_info->qgroup_rescan_lock);
+                       goto err;
+               }
+
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
        }
 
-       fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
        memset(&fs_info->qgroup_rescan_progress, 0,
                sizeof(fs_info->qgroup_rescan_progress));
+       fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+
+       spin_unlock(&fs_info->qgroup_lock);
+       mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+       init_completion(&fs_info->qgroup_rescan_completion);
+
+       memset(&fs_info->qgroup_rescan_work, 0,
+              sizeof(fs_info->qgroup_rescan_work));
+       fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+
+       if (ret) {
+err:
+               pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+       struct rb_node *n;
+       struct btrfs_qgroup *qgroup;
 
+       spin_lock(&fs_info->qgroup_lock);
        /* clear all current qgroup tracking information */
        for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
                qgroup->excl_cmpr = 0;
        }
        spin_unlock(&fs_info->qgroup_lock);
-       mutex_unlock(&fs_info->qgroup_rescan_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+       int ret = 0;
+       struct btrfs_trans_handle *trans;
 
-       qgroup_rescan_start(fs_info, qscan);
+       ret = qgroup_rescan_init(fs_info, 0, 1);
+       if (ret)
+               return ret;
+
+       /*
+        * We have set the rescan_progress to 0, which means no more
+        * delayed refs will be accounted by btrfs_qgroup_account_ref.
+        * However, btrfs_qgroup_account_ref may be right after its call
+        * to btrfs_find_all_roots, in which case it would still do the
+        * accounting.
+        * To solve this, we're committing the transaction, which will
+        * ensure we run all delayed refs and only after that, we are
+        * going to clear all tracking information for a clean start.
+        */
+
+       trans = btrfs_join_transaction(fs_info->fs_root);
+       if (IS_ERR(trans)) {
+               fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+               return PTR_ERR(trans);
+       }
+       ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+       if (ret) {
+               fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+               return ret;
+       }
+
+       qgroup_rescan_zero_tracking(fs_info);
+
+       btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+                          &fs_info->qgroup_rescan_work);
 
        return 0;
 }
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+       int running;
+       int ret = 0;
+
+       mutex_lock(&fs_info->qgroup_rescan_lock);
+       spin_lock(&fs_info->qgroup_lock);
+       running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+       spin_unlock(&fs_info->qgroup_lock);
+       mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+       if (running)
+               ret = wait_for_completion_interruptible(
+                                       &fs_info->qgroup_rescan_completion);
+
+       return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+               btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+                                  &fs_info->qgroup_rescan_work);
+}
index 4febca4fc2de7fe79fb9239a5e44f9ce3ad4eba0..12096496cc99eb24e6412ebc3ed5780f0e2b2430 100644 (file)
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
        struct extent_buffer *eb;
        struct btrfs_root_item *root_item;
        struct btrfs_key root_key;
+       u64 last_snap = 0;
        int ret;
 
        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
                                      BTRFS_TREE_RELOC_OBJECTID);
                BUG_ON(ret);
 
+               last_snap = btrfs_root_last_snapshot(&root->root_item);
                btrfs_set_root_last_snapshot(&root->root_item,
                                             trans->transid - 1);
        } else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
                memset(&root_item->drop_progress, 0,
                       sizeof(struct btrfs_disk_key));
                root_item->drop_level = 0;
+               /*
+                * abuse rtransid, it is safe because it is impossible to
+                * receive data into a relocation tree.
+                */
+               btrfs_set_root_rtransid(root_item, last_snap);
+               btrfs_set_root_otransid(root_item, trans->transid);
        }
 
        btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        kfree(root_item);
 
-       reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
-                                                &root_key);
+       reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
        BUG_ON(IS_ERR(reloc_root));
        reloc_root->last_trans = trans->transid;
        return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
 static noinline_for_stack
 int merge_reloc_roots(struct reloc_control *rc)
 {
+       struct btrfs_trans_handle *trans;
        struct btrfs_root *root;
        struct btrfs_root *reloc_root;
+       u64 last_snap;
+       u64 otransid;
+       u64 objectid;
        LIST_HEAD(reloc_roots);
        int found = 0;
        int ret = 0;
@@ -2308,12 +2319,44 @@ again:
                } else {
                        list_del_init(&reloc_root->root_list);
                }
+
+               /*
+                * we keep the old last snapshot transid in rtransid when we
+                * created the relocation tree.
+                */
+               last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+               otransid = btrfs_root_otransid(&reloc_root->root_item);
+               objectid = reloc_root->root_key.offset;
+
                ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
                if (ret < 0) {
                        if (list_empty(&reloc_root->root_list))
                                list_add_tail(&reloc_root->root_list,
                                              &reloc_roots);
                        goto out;
+               } else if (!ret) {
+                       /*
+                        * recover the last snapshot tranid to avoid
+                        * the space balance break NOCOW.
+                        */
+                       root = read_fs_root(rc->extent_root->fs_info,
+                                           objectid);
+                       if (IS_ERR(root))
+                               continue;
+
+                       if (btrfs_root_refs(&root->root_item) == 0)
+                               continue;
+
+                       trans = btrfs_join_transaction(root);
+                       BUG_ON(IS_ERR(trans));
+
+                       /* Check if the fs/file tree was snapshotted or not. */
+                       if (btrfs_root_last_snapshot(&root->root_item) ==
+                           otransid - 1)
+                               btrfs_set_root_last_snapshot(&root->root_item,
+                                                            last_snap);
+                               
+                       btrfs_end_transaction(trans, root);
                }
        }
 
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;
+       bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+                                       SKINNY_METADATA);
 
        if (tree_block_processed(bytenr, blocksize, rc))
                return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-
+again:
        key.objectid = bytenr;
-       key.type = BTRFS_EXTENT_ITEM_KEY;
-       key.offset = blocksize;
+       if (skinny) {
+               key.type = BTRFS_METADATA_ITEM_KEY;
+               key.offset = (u64)-1;
+       } else {
+               key.type = BTRFS_EXTENT_ITEM_KEY;
+               key.offset = blocksize;
+       }
 
        path->search_commit_root = 1;
        path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
        if (ret < 0)
                goto out;
 
-       btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-       if (ret > 0) {
-               if (key.objectid == bytenr &&
-                   key.type == BTRFS_METADATA_ITEM_KEY)
-                       ret = 0;
+       if (ret > 0 && skinny) {
+               if (path->slots[0]) {
+                       path->slots[0]--;
+                       btrfs_item_key_to_cpu(path->nodes[0], &key,
+                                             path->slots[0]);
+                       if (key.objectid == bytenr &&
+                           (key.type == BTRFS_METADATA_ITEM_KEY ||
+                            (key.type == BTRFS_EXTENT_ITEM_KEY &&
+                             key.offset == blocksize)))
+                               ret = 0;
+               }
+
+               if (ret) {
+                       skinny = false;
+                       btrfs_release_path(path);
+                       goto again;
+               }
        }
        BUG_ON(ret);
 
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
               (unsigned long long)rc->block_group->key.objectid,
               (unsigned long long)rc->block_group->flags);
 
-       ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+       ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
        if (ret < 0) {
                err = ret;
                goto out;
        }
-       btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+       btrfs_wait_all_ordered_extents(fs_info, 0);
 
        while (1) {
                mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
                    key.type != BTRFS_ROOT_ITEM_KEY)
                        break;
 
-               reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+               reloc_root = btrfs_read_fs_root(root, &key);
                if (IS_ERR(reloc_root)) {
                        err = PTR_ERR(reloc_root);
                        goto out;
@@ -4396,10 +4458,8 @@ out:
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 {
        struct btrfs_ordered_sum *sums;
-       struct btrfs_sector_sum *sector_sum;
        struct btrfs_ordered_extent *ordered;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       size_t offset;
        int ret;
        u64 disk_bytenr;
        LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
        if (ret)
                goto out;
 
+       disk_bytenr = ordered->start;
        while (!list_empty(&list)) {
                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
                list_del_init(&sums->list);
 
-               sector_sum = sums->sums;
-               sums->bytenr = ordered->start;
-
-               offset = 0;
-               while (offset < sums->len) {
-                       sector_sum->bytenr += ordered->start - disk_bytenr;
-                       sector_sum++;
-                       offset += root->sectorsize;
-               }
+               sums->bytenr = disk_bytenr;
+               disk_bytenr += sums->len;
 
                btrfs_add_ordered_sum(inode, ordered, sums);
        }
index 5bf1ed57f178ea60b7ca8e98aceb6710fc9f55c4..ffb1036ef10db97f31bfa4c00ca7c36a45d09e4f 100644 (file)
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 }
 
 /*
- * lookup the root with the highest offset for a given objectid.  The key we do
- * find is copied into 'key'.  If we find something return 0, otherwise 1, < 0
- * on error.
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the real key of the tree we look for
+ *
+ * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
  */
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
-                       struct btrfs_root_item *item, struct btrfs_key *key)
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+                   struct btrfs_path *path, struct btrfs_root_item *root_item,
+                   struct btrfs_key *root_key)
 {
-       struct btrfs_path *path;
-       struct btrfs_key search_key;
        struct btrfs_key found_key;
        struct extent_buffer *l;
        int ret;
        int slot;
 
-       search_key.objectid = objectid;
-       search_key.type = BTRFS_ROOT_ITEM_KEY;
-       search_key.offset = (u64)-1;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+       ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
        if (ret < 0)
-               goto out;
+               return ret;
 
-       BUG_ON(ret == 0);
-       if (path->slots[0] == 0) {
-               ret = 1;
-               goto out;
+       if (search_key->offset != -1ULL) {      /* the search key is exact */
+               if (ret > 0)
+                       goto out;
+       } else {
+               BUG_ON(ret == 0);               /* Logical error */
+               if (path->slots[0] == 0)
+                       goto out;
+               path->slots[0]--;
+               ret = 0;
        }
+
        l = path->nodes[0];
-       slot = path->slots[0] - 1;
+       slot = path->slots[0];
+
        btrfs_item_key_to_cpu(l, &found_key, slot);
-       if (found_key.objectid != objectid ||
+       if (found_key.objectid != search_key->objectid ||
            found_key.type != BTRFS_ROOT_ITEM_KEY) {
                ret = 1;
                goto out;
        }
-       if (item)
-               btrfs_read_root_item(l, slot, item);
-       if (key)
-               memcpy(key, &found_key, sizeof(found_key));
 
-       ret = 0;
+       if (root_item)
+               btrfs_read_root_item(l, slot, root_item);
+       if (root_key)
+               memcpy(root_key, &found_key, sizeof(found_key));
 out:
-       btrfs_free_path(path);
+       btrfs_release_path(path);
        return ret;
 }
 
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        return btrfs_insert_item(trans, root, key, item, sizeof(*item));
 }
 
-/*
- * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed.  This is any root item with an
- * offset lower than the latest root.  They need to be queued for deletion to
- * finish what was happening when we crashed.
- */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
-{
-       struct btrfs_root *dead_root;
-       struct btrfs_root_item *ri;
-       struct btrfs_key key;
-       struct btrfs_key found_key;
-       struct btrfs_path *path;
-       int ret;
-       u32 nritems;
-       struct extent_buffer *leaf;
-       int slot;
-
-       key.objectid = objectid;
-       btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-       key.offset = 0;
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-again:
-       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-       if (ret < 0)
-               goto err;
-       while (1) {
-               leaf = path->nodes[0];
-               nritems = btrfs_header_nritems(leaf);
-               slot = path->slots[0];
-               if (slot >= nritems) {
-                       ret = btrfs_next_leaf(root, path);
-                       if (ret)
-                               break;
-                       leaf = path->nodes[0];
-                       nritems = btrfs_header_nritems(leaf);
-                       slot = path->slots[0];
-               }
-               btrfs_item_key_to_cpu(leaf, &key, slot);
-               if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
-                       goto next;
-
-               if (key.objectid < objectid)
-                       goto next;
-
-               if (key.objectid > objectid)
-                       break;
-
-               ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
-               if (btrfs_disk_root_refs(leaf, ri) != 0)
-                       goto next;
-
-               memcpy(&found_key, &key, sizeof(key));
-               key.offset++;
-               btrfs_release_path(path);
-               dead_root =
-                       btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
-                                                   &found_key);
-               if (IS_ERR(dead_root)) {
-                       ret = PTR_ERR(dead_root);
-                       goto err;
-               }
-
-               ret = btrfs_add_dead_root(dead_root);
-               if (ret)
-                       goto err;
-               goto again;
-next:
-               slot++;
-               path->slots[0]++;
-       }
-       ret = 0;
-err:
-       btrfs_free_path(path);
-       return ret;
-}
-
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 {
        struct extent_buffer *leaf;
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
        struct btrfs_root *root;
        int err = 0;
        int ret;
+       bool can_recover = true;
+
+       if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+               can_recover = false;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                root_key.objectid = key.offset;
                key.offset++;
 
-               root = btrfs_read_fs_root_no_name(tree_root->fs_info,
-                                                 &root_key);
-               if (!IS_ERR(root))
+               root = btrfs_read_fs_root(tree_root, &root_key);
+               err = PTR_RET(root);
+               if (err && err != -ENOENT) {
+                       break;
+               } else if (err == -ENOENT) {
+                       struct btrfs_trans_handle *trans;
+
+                       btrfs_release_path(path);
+
+                       trans = btrfs_join_transaction(tree_root);
+                       if (IS_ERR(trans)) {
+                               err = PTR_ERR(trans);
+                               btrfs_error(tree_root->fs_info, err,
+                                           "Failed to start trans to delete "
+                                           "orphan item");
+                               break;
+                       }
+                       err = btrfs_del_orphan_item(trans, tree_root,
+                                                   root_key.objectid);
+                       btrfs_end_transaction(trans, tree_root);
+                       if (err) {
+                               btrfs_error(tree_root->fs_info, err,
+                                           "Failed to delete root orphan "
+                                           "item");
+                               break;
+                       }
                        continue;
+               }
 
-               ret = PTR_ERR(root);
-               if (ret != -ENOENT) {
-                       err = ret;
+               if (btrfs_root_refs(&root->root_item) == 0) {
+                       btrfs_add_dead_root(root);
+                       continue;
+               }
+
+               err = btrfs_init_fs_root(root);
+               if (err) {
+                       btrfs_free_fs_root(root);
                        break;
                }
 
-               ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
-               if (ret) {
-                       err = ret;
+               root->orphan_item_inserted = 1;
+
+               err = btrfs_insert_fs_root(root->fs_info, root);
+               if (err) {
+                       BUG_ON(err == -EEXIST);
+                       btrfs_free_fs_root(root);
                        break;
                }
        }
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
        struct btrfs_path *path;
        int ret;
-       struct btrfs_root_item *ri;
-       struct extent_buffer *leaf;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                goto out;
 
        BUG_ON(ret != 0);
-       leaf = path->nodes[0];
-       ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
 
        ret = btrfs_del_item(trans, root, path);
 out:
index 79bd479317cb53cbea30a7021d81a891ed9ae20c..4ba2a69a60ad4ffc327697ec4a8db3d04159d0a8 100644 (file)
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
                           u8 *csum)
 {
        struct btrfs_ordered_sum *sum = NULL;
-       int ret = 0;
-       unsigned long i;
+       unsigned long index;
        unsigned long num_sectors;
 
        while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
        if (!sum)
                return 0;
 
+       index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
        num_sectors = sum->len / sctx->sectorsize;
-       for (i = 0; i < num_sectors; ++i) {
-               if (sum->sums[i].bytenr == logical) {
-                       memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
-                       ret = 1;
-                       break;
-               }
-       }
-       if (ret && i == num_sectors - 1) {
+       memcpy(csum, sum->sums + index, sctx->csum_size);
+       if (index == num_sectors - 1) {
                list_del(&sum->list);
                kfree(sum);
        }
-       return ret;
+       return 1;
 }
 
 /* scrub extent tries to collect up to 64 kB for each bio */
@@ -2505,6 +2499,7 @@ again:
                        if (ret)
                                goto out;
 
+                       scrub_free_csums(sctx);
                        if (extent_logical + extent_len <
                            key.objectid + bytes) {
                                logical += increment;
@@ -3204,16 +3199,18 @@ out:
 
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
 {
-       unsigned long index;
        struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
-       int ret = 0;
+       struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
        struct btrfs_key key;
-       struct inode *inode = NULL;
+       struct inode *inode;
+       struct page *page;
        struct btrfs_root *local_root;
        u64 physical_for_dev_replace;
        u64 len;
-       struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+       unsigned long index;
        int srcu_index;
+       int ret;
+       int err;
 
        key.objectid = root;
        key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
                return PTR_ERR(local_root);
        }
 
+       if (btrfs_root_refs(&local_root->root_item) == 0) {
+               srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+               return -ENOENT;
+       }
+
        key.type = BTRFS_INODE_ITEM_KEY;
        key.objectid = inum;
        key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
        if (IS_ERR(inode))
                return PTR_ERR(inode);
 
+       /* Avoid truncate/dio/punch hole.. */
+       mutex_lock(&inode->i_mutex);
+       inode_dio_wait(inode);
+
+       ret = 0;
        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        len = nocow_ctx->len;
        while (len >= PAGE_CACHE_SIZE) {
-               struct page *page = NULL;
-               int ret_sub;
-
                index = offset >> PAGE_CACHE_SHIFT;
-
+again:
                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
                if (!page) {
                        pr_err("find_or_create_page() failed\n");
                        ret = -ENOMEM;
-                       goto next_page;
+                       goto out;
                }
 
                if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
                                goto next_page;
                } else {
                        ClearPageError(page);
-                       ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+                       err = extent_read_full_page(&BTRFS_I(inode)->
                                                         io_tree,
                                                        page, btrfs_get_extent,
                                                        nocow_ctx->mirror_num);
-                       if (ret_sub) {
-                               ret = ret_sub;
+                       if (err) {
+                               ret = err;
                                goto next_page;
                        }
-                       wait_on_page_locked(page);
+
+                       lock_page(page);
+                       /*
+                        * If the page has been remove from the page cache,
+                        * the data on it is meaningless, because it may be
+                        * old one, the new data may be written into the new
+                        * page in the page cache.
+                        */
+                       if (page->mapping != inode->i_mapping) {
+                               page_cache_release(page);
+                               goto again;
+                       }
                        if (!PageUptodate(page)) {
                                ret = -EIO;
                                goto next_page;
                        }
                }
-               ret_sub = write_page_nocow(nocow_ctx->sctx,
-                                          physical_for_dev_replace, page);
-               if (ret_sub) {
-                       ret = ret_sub;
-                       goto next_page;
-               }
-
+               err = write_page_nocow(nocow_ctx->sctx,
+                                      physical_for_dev_replace, page);
+               if (err)
+                       ret = err;
 next_page:
-               if (page) {
-                       unlock_page(page);
-                       put_page(page);
-               }
+               unlock_page(page);
+               page_cache_release(page);
+
+               if (ret)
+                       break;
+
                offset += PAGE_CACHE_SIZE;
                physical_for_dev_replace += PAGE_CACHE_SIZE;
                len -= PAGE_CACHE_SIZE;
        }
-
-       if (inode)
-               iput(inode);
+out:
+       mutex_unlock(&inode->i_mutex);
+       iput(inode);
        return ret;
 }
 
index ff40f1c00ce315d59eaf4da6d20145351a000e51..d3f3b43cae0bdef23c889f6c5939c49aea13eda6 100644 (file)
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
        }
 }
 
-static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc(void)
 {
        struct fs_path *p;
 
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
        return p;
 }
 
-static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc_reversed(void)
 {
        struct fs_path *p;
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return NULL;
        p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
        return p;
 }
 
-static void fs_path_free(struct send_ctx *sctx, struct fs_path *p)
+static void fs_path_free(struct fs_path *p)
 {
        if (!p)
                return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
  *
  * path must point to the INODE_REF or INODE_EXTREF when called.
  */
-static int iterate_inode_ref(struct send_ctx *sctx,
-                            struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
                             struct btrfs_key *found_key, int resolve,
                             iterate_inode_ref_t iterate, void *ctx)
 {
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
        unsigned long elem_size;
        unsigned long ptr;
 
-       p = fs_path_alloc_reversed(sctx);
+       p = fs_path_alloc_reversed();
        if (!p)
                return -ENOMEM;
 
        tmp_path = alloc_path_for_send();
        if (!tmp_path) {
-               fs_path_free(sctx, p);
+               fs_path_free(p);
                return -ENOMEM;
        }
 
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
 
 out:
        btrfs_free_path(tmp_path);
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
  *
  * path must point to the dir item when called.
  */
-static int iterate_dir_item(struct send_ctx *sctx,
-                           struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
                            struct btrfs_key *found_key,
                            iterate_dir_item_t iterate, void *ctx)
 {
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
  * Retrieve the first path of an inode. If an inode has more then one
  * ref/hardlink, this is ignored.
  */
-static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
+static int get_inode_path(struct btrfs_root *root,
                          u64 ino, struct fs_path *path)
 {
        int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
                goto out;
        }
 
-       ret = iterate_inode_ref(sctx, root, p, &found_key, 1,
-                       __copy_first_ref, path);
+       ret = iterate_inode_ref(root, p, &found_key, 1,
+                               __copy_first_ref, path);
        if (ret < 0)
                goto out;
        ret = 0;
@@ -1314,8 +1312,7 @@ out:
        return ret;
 }
 
-static int read_symlink(struct send_ctx *sctx,
-                       struct btrfs_root *root,
+static int read_symlink(struct btrfs_root *root,
                        u64 ino,
                        struct fs_path *dest)
 {
@@ -1562,8 +1559,7 @@ out:
  * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
  * generation of the parent dir and the name of the dir entry.
  */
-static int get_first_ref(struct send_ctx *sctx,
-                        struct btrfs_root *root, u64 ino,
+static int get_first_ref(struct btrfs_root *root, u64 ino,
                         u64 *dir, u64 *dir_gen, struct fs_path *name)
 {
        int ret;
@@ -1628,8 +1624,7 @@ out:
        return ret;
 }
 
-static int is_first_ref(struct send_ctx *sctx,
-                       struct btrfs_root *root,
+static int is_first_ref(struct btrfs_root *root,
                        u64 ino, u64 dir,
                        const char *name, int name_len)
 {
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
        u64 tmp_dir;
        u64 tmp_dir_gen;
 
-       tmp_name = fs_path_alloc(sctx);
+       tmp_name = fs_path_alloc();
        if (!tmp_name)
                return -ENOMEM;
 
-       ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+       ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
        if (ret < 0)
                goto out;
 
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
        ret = !memcmp(tmp_name->start, name, name_len);
 
 out:
-       fs_path_free(sctx, tmp_name);
+       fs_path_free(tmp_name);
        return ret;
 }
 
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
        if (!sctx->parent_root)
                goto out;
 
-       name = fs_path_alloc(sctx);
+       name = fs_path_alloc();
        if (!name)
                return -ENOMEM;
 
-       ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
+       ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
        if (ret < 0)
                goto out;
 
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
                        name->start, fs_path_len(name));
 
 out:
-       fs_path_free(sctx, name);
+       fs_path_free(name);
        return ret;
 }
 
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
         * send_root or parent_root for ref lookup.
         */
        if (ino < sctx->send_progress)
-               ret = get_first_ref(sctx, sctx->send_root, ino,
-                               parent_ino, parent_gen, dest);
+               ret = get_first_ref(sctx->send_root, ino,
+                                   parent_ino, parent_gen, dest);
        else
-               ret = get_first_ref(sctx, sctx->parent_root, ino,
-                               parent_ino, parent_gen, dest);
+               ret = get_first_ref(sctx->parent_root, ino,
+                                   parent_ino, parent_gen, dest);
        if (ret < 0)
                goto out;
 
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
        u64 parent_gen = 0;
        int stop = 0;
 
-       name = fs_path_alloc(sctx);
+       name = fs_path_alloc();
        if (!name) {
                ret = -ENOMEM;
                goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
        }
 
 out:
-       fs_path_free(sctx, name);
+       fs_path_free(name);
        if (!ret)
                fs_path_unreverse(dest);
        return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
 
 verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
 
 tlv_put_failure:
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
 
 verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
 
 tlv_put_failure:
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
 
 verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
 
 tlv_put_failure:
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
 
 verbose_printk("btrfs: send_utimes %llu\n", ino);
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
 
 tlv_put_failure:
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        btrfs_free_path(path);
        return ret;
 }
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
 
 verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
        if (S_ISLNK(mode)) {
                fs_path_reset(p);
-               ret = read_symlink(sctx, sctx->send_root, ino, p);
+               ret = read_symlink(sctx->send_root, ino, p);
                if (ret < 0)
                        goto out;
                TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
 tlv_put_failure:
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
        return 0;
 }
 
-static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
+static void __free_recorded_refs(struct list_head *head)
 {
        struct recorded_ref *cur;
 
        while (!list_empty(head)) {
                cur = list_entry(head->next, struct recorded_ref, list);
-               fs_path_free(sctx, cur->full_path);
+               fs_path_free(cur->full_path);
                list_del(&cur->list);
                kfree(cur);
        }
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
 
 static void free_recorded_refs(struct send_ctx *sctx)
 {
-       __free_recorded_refs(sctx, &sctx->new_refs);
-       __free_recorded_refs(sctx, &sctx->deleted_refs);
+       __free_recorded_refs(&sctx->new_refs);
+       __free_recorded_refs(&sctx->deleted_refs);
 }
 
 /*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
        int ret;
        struct fs_path *orphan;
 
-       orphan = fs_path_alloc(sctx);
+       orphan = fs_path_alloc();
        if (!orphan)
                return -ENOMEM;
 
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
        ret = send_rename(sctx, path, orphan);
 
 out:
-       fs_path_free(sctx, orphan);
+       fs_path_free(orphan);
        return ret;
 }
 
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
         */
        BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
 
-       valid_path = fs_path_alloc(sctx);
+       valid_path = fs_path_alloc();
        if (!valid_path) {
                ret = -ENOMEM;
                goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
                if (ret < 0)
                        goto out;
                if (ret) {
-                       ret = is_first_ref(sctx, sctx->parent_root,
-                                       ow_inode, cur->dir, cur->name,
-                                       cur->name_len);
+                       ret = is_first_ref(sctx->parent_root,
+                                          ow_inode, cur->dir, cur->name,
+                                          cur->name_len);
                        if (ret < 0)
                                goto out;
                        if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 out:
        free_recorded_refs(sctx);
        ulist_free(check_dirs);
-       fs_path_free(sctx, valid_path);
+       fs_path_free(valid_path);
        return ret;
 }
 
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
        struct fs_path *p;
        u64 gen;
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
 
 out:
        if (ret)
-               fs_path_free(sctx, p);
+               fs_path_free(p);
        return ret;
 }
 
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
        struct fs_path *p;
        u64 gen;
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
 
 out:
        if (ret)
-               fs_path_free(sctx, p);
+               fs_path_free(p);
        return ret;
 }
 
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
 {
        int ret;
 
-       ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
-                       sctx->cmp_key, 0, __record_new_ref, sctx);
+       ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+                               sctx->cmp_key, 0, __record_new_ref, sctx);
        if (ret < 0)
                goto out;
        ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
 {
        int ret;
 
-       ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
-                       sctx->cmp_key, 0, __record_deleted_ref, sctx);
+       ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+                               sctx->cmp_key, 0, __record_deleted_ref, sctx);
        if (ret < 0)
                goto out;
        ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
        return 0;
 }
 
-static int find_iref(struct send_ctx *sctx,
-                    struct btrfs_root *root,
+static int find_iref(struct btrfs_root *root,
                     struct btrfs_path *path,
                     struct btrfs_key *key,
                     u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
        ctx.name = name;
        ctx.found_idx = -1;
 
-       ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
+       ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
        if (ret < 0)
                return ret;
 
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
        int ret;
        struct send_ctx *sctx = ctx;
 
-       ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
+       ret = find_iref(sctx->parent_root, sctx->right_path,
                        sctx->cmp_key, dir, name);
        if (ret == -ENOENT)
                ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
        int ret;
        struct send_ctx *sctx = ctx;
 
-       ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
+       ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
                        dir, name);
        if (ret == -ENOENT)
                ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
 {
        int ret = 0;
 
-       ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
+       ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
                        sctx->cmp_key, 0, __record_changed_new_ref, sctx);
        if (ret < 0)
                goto out;
-       ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
+       ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
                        sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
        if (ret < 0)
                goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
                     found_key.type != BTRFS_INODE_EXTREF_KEY))
                        break;
 
-               ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
-                               sctx);
+               ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
                btrfs_release_path(path);
                if (ret < 0)
                        goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
        struct fs_path *p;
        posix_acl_xattr_header dummy_acl;
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
        ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
 
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
        struct send_ctx *sctx = ctx;
        struct fs_path *p;
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
        ret = send_remove_xattr(sctx, p, name, name_len);
 
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
 {
        int ret = 0;
 
-       ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
-                       sctx->cmp_key, __process_new_xattr, sctx);
+       ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+                              sctx->cmp_key, __process_new_xattr, sctx);
 
        return ret;
 }
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
 {
        int ret;
 
-       ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
-                       sctx->cmp_key, __process_deleted_xattr, sctx);
+       ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+                              sctx->cmp_key, __process_deleted_xattr, sctx);
 
        return ret;
 }
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
            strncmp(name, ctx->name, name_len) == 0) {
                ctx->found_idx = num;
                ctx->found_data_len = data_len;
-               ctx->found_data = kmalloc(data_len, GFP_NOFS);
+               ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
                if (!ctx->found_data)
                        return -ENOMEM;
-               memcpy(ctx->found_data, data, data_len);
                return 1;
        }
        return 0;
 }
 
-static int find_xattr(struct send_ctx *sctx,
-                     struct btrfs_root *root,
+static int find_xattr(struct btrfs_root *root,
                      struct btrfs_path *path,
                      struct btrfs_key *key,
                      const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
        ctx.found_data = NULL;
        ctx.found_data_len = 0;
 
-       ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
+       ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
        if (ret < 0)
                return ret;
 
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
        char *found_data = NULL;
        int found_data_len  = 0;
 
-       ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
-                       sctx->cmp_key, name, name_len, &found_data,
-                       &found_data_len);
+       ret = find_xattr(sctx->parent_root, sctx->right_path,
+                        sctx->cmp_key, name, name_len, &found_data,
+                        &found_data_len);
        if (ret == -ENOENT) {
                ret = __process_new_xattr(num, di_key, name, name_len, data,
                                data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
        int ret;
        struct send_ctx *sctx = ctx;
 
-       ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
-                       name, name_len, NULL, NULL);
+       ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+                        name, name_len, NULL, NULL);
        if (ret == -ENOENT)
                ret = __process_deleted_xattr(num, di_key, name, name_len, data,
                                data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
 {
        int ret = 0;
 
-       ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
+       ret = iterate_dir_item(sctx->send_root, sctx->left_path,
                        sctx->cmp_key, __process_changed_new_xattr, sctx);
        if (ret < 0)
                goto out;
-       ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
+       ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
                        sctx->cmp_key, __process_changed_deleted_xattr, sctx);
 
 out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
                        goto out;
                }
 
-               ret = iterate_dir_item(sctx, root, path, &found_key,
-                               __process_new_xattr, sctx);
+               ret = iterate_dir_item(root, path, &found_key,
+                                      __process_new_xattr, sctx);
                if (ret < 0)
                        goto out;
 
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
        int num_read = 0;
        mm_segment_t old_fs;
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
 tlv_put_failure:
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        set_fs(old_fs);
        if (ret < 0)
                return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
                clone_root->root->objectid, clone_root->ino,
                clone_root->offset);
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
                        goto out;
                ret = get_cur_path(sctx, clone_root->ino, gen, p);
        } else {
-               ret = get_inode_path(sctx, clone_root->root,
-                               clone_root->ino, p);
+               ret = get_inode_path(clone_root->root, clone_root->ino, p);
        }
        if (ret < 0)
                goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 
 tlv_put_failure:
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
        int ret = 0;
        struct fs_path *p;
 
-       p = fs_path_alloc(sctx);
+       p = fs_path_alloc();
        if (!p)
                return -ENOMEM;
 
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
 
 tlv_put_failure:
 out:
-       fs_path_free(sctx, p);
+       fs_path_free(p);
        return ret;
 }
 
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
        send_root = BTRFS_I(file_inode(mnt_file))->root;
        fs_info = send_root->fs_info;
 
+       /*
+        * This is done when we lookup the root, it should already be complete
+        * by the time we get here.
+        */
+       WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
+
+       /*
+        * If we just created this root we need to make sure that the orphan
+        * cleanup has been done and committed since we search the commit root,
+        * so check its commit root transid with our otransid and if they match
+        * commit the transaction to make sure everything is updated.
+        */
+       down_read(&send_root->fs_info->extent_commit_sem);
+       if (btrfs_header_generation(send_root->commit_root) ==
+           btrfs_root_otransid(&send_root->root_item)) {
+               struct btrfs_trans_handle *trans;
+
+               up_read(&send_root->fs_info->extent_commit_sem);
+
+               trans = btrfs_attach_transaction_barrier(send_root);
+               if (IS_ERR(trans)) {
+                       if (PTR_ERR(trans) != -ENOENT) {
+                               ret = PTR_ERR(trans);
+                               goto out;
+                       }
+                       /* ENOENT means theres no transaction */
+               } else {
+                       ret = btrfs_commit_transaction(trans, send_root);
+                       if (ret)
+                               goto out;
+               }
+       } else {
+               up_read(&send_root->fs_info->extent_commit_sem);
+       }
+
        arg = memdup_user(arg_, sizeof(*arg));
        if (IS_ERR(arg)) {
                ret = PTR_ERR(arg);
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                        key.type = BTRFS_ROOT_ITEM_KEY;
                        key.offset = (u64)-1;
                        clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
-                       if (!clone_root) {
-                               ret = -EINVAL;
-                               goto out;
-                       }
                        if (IS_ERR(clone_root)) {
                                ret = PTR_ERR(clone_root);
                                goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                key.type = BTRFS_ROOT_ITEM_KEY;
                key.offset = (u64)-1;
                sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
-               if (!sctx->parent_root) {
-                       ret = -EINVAL;
+               if (IS_ERR(sctx->parent_root)) {
+                       ret = PTR_ERR(sctx->parent_root);
                        goto out;
                }
        }
index f0857e092a3cb1af485604850052440579a1bbf1..8eb6191d86da8c3f3bb31b838c97a755f5c96853 100644 (file)
@@ -51,7 +51,6 @@
 #include "print-tree.h"
 #include "xattr.h"
 #include "volumes.h"
-#include "version.h"
 #include "export.h"
 #include "compression.h"
 #include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                return;
        }
        ACCESS_ONCE(trans->transaction->aborted) = errno;
+       /* Wake up anybody who may be waiting on this transaction */
+       wake_up(&root->fs_info->transaction_wait);
+       wake_up(&root->fs_info->transaction_blocked_wait);
        __btrfs_std_error(root->fs_info, function, line, errno, NULL);
 }
 /*
@@ -776,9 +778,6 @@ find_root:
        if (IS_ERR(new_root))
                return ERR_CAST(new_root);
 
-       if (btrfs_root_refs(&new_root->root_item) == 0)
-               return ERR_PTR(-ENOENT);
-
        dir_id = btrfs_root_dirid(&new_root->root_item);
 setup_root:
        location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
                return 0;
        }
 
-       btrfs_wait_ordered_extents(root, 1);
+       btrfs_wait_all_ordered_extents(fs_info, 1);
 
        trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
                printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
 }
 
+static void btrfs_print_info(void)
+{
+       printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+                       ", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+                       ", integrity-checker=on"
+#endif
+                       "\n");
+}
+
 static int __init init_btrfs_fs(void)
 {
        int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
 
        btrfs_init_lockdep();
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+       btrfs_print_info();
        btrfs_test_free_space_cache();
-#endif
 
-       printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
        return 0;
 
 unregister_ioctl:
index 0544587d74f4be48ece72380ee0f4335ad636f97..d58cce77fc6c581625c3dd0449c83b6836bf4894 100644 (file)
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
+static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+       [TRANS_STATE_RUNNING]           = 0U,
+       [TRANS_STATE_BLOCKED]           = (__TRANS_USERSPACE |
+                                          __TRANS_START),
+       [TRANS_STATE_COMMIT_START]      = (__TRANS_USERSPACE |
+                                          __TRANS_START |
+                                          __TRANS_ATTACH),
+       [TRANS_STATE_COMMIT_DOING]      = (__TRANS_USERSPACE |
+                                          __TRANS_START |
+                                          __TRANS_ATTACH |
+                                          __TRANS_JOIN),
+       [TRANS_STATE_UNBLOCKED]         = (__TRANS_USERSPACE |
+                                          __TRANS_START |
+                                          __TRANS_ATTACH |
+                                          __TRANS_JOIN |
+                                          __TRANS_JOIN_NOLOCK),
+       [TRANS_STATE_COMPLETED]         = (__TRANS_USERSPACE |
+                                          __TRANS_START |
+                                          __TRANS_ATTACH |
+                                          __TRANS_JOIN |
+                                          __TRANS_JOIN_NOLOCK),
+};
+
 static void put_transaction(struct btrfs_transaction *transaction)
 {
        WARN_ON(atomic_read(&transaction->use_count) == 0);
        if (atomic_dec_and_test(&transaction->use_count)) {
                BUG_ON(!list_empty(&transaction->list));
                WARN_ON(transaction->delayed_refs.root.rb_node);
+               while (!list_empty(&transaction->pending_chunks)) {
+                       struct extent_map *em;
+
+                       em = list_first_entry(&transaction->pending_chunks,
+                                             struct extent_map, list);
+                       list_del_init(&em->list);
+                       free_extent_map(em);
+               }
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
 }
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
        root->commit_root = btrfs_root_node(root);
 }
 
-static inline int can_join_transaction(struct btrfs_transaction *trans,
-                                      int type)
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+                                        unsigned int type)
+{
+       if (type & TRANS_EXTWRITERS)
+               atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+                                        unsigned int type)
+{
+       if (type & TRANS_EXTWRITERS)
+               atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+                                         unsigned int type)
+{
+       atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
 {
-       return !(trans->in_commit &&
-                type != TRANS_JOIN &&
-                type != TRANS_JOIN_NOLOCK);
+       return atomic_read(&trans->num_extwriters);
 }
 
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root, int type)
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 {
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -74,32 +122,19 @@ loop:
                return -EROFS;
        }
 
-       if (fs_info->trans_no_join) {
-               /* 
-                * If we are JOIN_NOLOCK we're already committing a current
-                * transaction, we just need a handle to deal with something
-                * when committing the transaction, such as inode cache and
-                * space cache. It is a special case.
-                */
-               if (type != TRANS_JOIN_NOLOCK) {
-                       spin_unlock(&fs_info->trans_lock);
-                       return -EBUSY;
-               }
-       }
-
        cur_trans = fs_info->running_transaction;
        if (cur_trans) {
                if (cur_trans->aborted) {
                        spin_unlock(&fs_info->trans_lock);
                        return cur_trans->aborted;
                }
-               if (!can_join_transaction(cur_trans, type)) {
+               if (btrfs_blocked_trans_types[cur_trans->state] & type) {
                        spin_unlock(&fs_info->trans_lock);
                        return -EBUSY;
                }
                atomic_inc(&cur_trans->use_count);
                atomic_inc(&cur_trans->num_writers);
-               cur_trans->num_joined++;
+               extwriter_counter_inc(cur_trans, type);
                spin_unlock(&fs_info->trans_lock);
                return 0;
        }
@@ -112,6 +147,12 @@ loop:
        if (type == TRANS_ATTACH)
                return -ENOENT;
 
+       /*
+        * JOIN_NOLOCK only happens during the transaction commit, so
+        * it is impossible that ->running_transaction is NULL
+        */
+       BUG_ON(type == TRANS_JOIN_NOLOCK);
+
        cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
        if (!cur_trans)
                return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
        if (fs_info->running_transaction) {
                /*
                 * someone started a transaction after we unlocked.  Make sure
-                * to redo the trans_no_join checks above
+                * to redo the checks above
                 */
                kmem_cache_free(btrfs_transaction_cachep, cur_trans);
                goto loop;
@@ -131,17 +172,15 @@ loop:
        }
 
        atomic_set(&cur_trans->num_writers, 1);
-       cur_trans->num_joined = 0;
+       extwriter_counter_init(cur_trans, type);
        init_waitqueue_head(&cur_trans->writer_wait);
        init_waitqueue_head(&cur_trans->commit_wait);
-       cur_trans->in_commit = 0;
-       cur_trans->blocked = 0;
+       cur_trans->state = TRANS_STATE_RUNNING;
        /*
         * One for this trans handle, one so it will live on until we
         * commit the transaction.
         */
        atomic_set(&cur_trans->use_count, 2);
-       cur_trans->commit_done = 0;
        cur_trans->start_time = get_seconds();
 
        cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
                        "creating a fresh transaction\n");
        atomic64_set(&fs_info->tree_mod_seq, 0);
 
-       spin_lock_init(&cur_trans->commit_lock);
        spin_lock_init(&cur_trans->delayed_refs.lock);
        atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
        atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
 
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        INIT_LIST_HEAD(&cur_trans->ordered_operations);
+       INIT_LIST_HEAD(&cur_trans->pending_chunks);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+       return (trans->state >= TRANS_STATE_BLOCKED &&
+               trans->state < TRANS_STATE_UNBLOCKED &&
+               !trans->aborted);
+}
+
 /* wait for commit against the current transaction to become unblocked
  * when this is done, it is safe to start a new transaction, but the current
  * transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
 
        spin_lock(&root->fs_info->trans_lock);
        cur_trans = root->fs_info->running_transaction;
-       if (cur_trans && cur_trans->blocked) {
+       if (cur_trans && is_transaction_blocked(cur_trans)) {
                atomic_inc(&cur_trans->use_count);
                spin_unlock(&root->fs_info->trans_lock);
 
                wait_event(root->fs_info->transaction_wait,
-                          !cur_trans->blocked);
+                          cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+                          cur_trans->aborted);
                put_transaction(cur_trans);
        } else {
                spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 }
 
 static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, int type,
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
                  enum btrfs_reserve_flush_enum flush)
 {
        struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
                return ERR_PTR(-EROFS);
 
        if (current->journal_info) {
-               WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+               WARN_ON(type & TRANS_EXTWRITERS);
                h = current->journal_info;
                h->use_count++;
                WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
         * If we are ATTACH, it means we just want to catch the current
         * transaction and commit it, so we needn't do sb_start_intwrite(). 
         */
-       if (type < TRANS_JOIN_NOLOCK)
+       if (type & __TRANS_FREEZABLE)
                sb_start_intwrite(root->fs_info->sb);
 
        if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
        INIT_LIST_HEAD(&h->new_bgs);
 
        smp_mb();
-       if (cur_trans->blocked && may_wait_transaction(root, type)) {
+       if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+           may_wait_transaction(root, type)) {
                btrfs_commit_transaction(h, root);
                goto again;
        }
@@ -429,7 +477,7 @@ got_it:
        return h;
 
 join_fail:
-       if (type < TRANS_JOIN_NOLOCK)
+       if (type & __TRANS_FREEZABLE)
                sb_end_intwrite(root->fs_info->sb);
        kmem_cache_free(btrfs_trans_handle_cachep, h);
 alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 }
 
 /*
- * btrfs_attach_transaction() - catch the running transaction
+ * btrfs_attach_transaction_barrier() - catch the running transaction
  *
  * It is similar to the above function, the differentia is this one
  * will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
 static noinline void wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
 {
-       wait_event(commit->commit_wait, commit->commit_done);
+       wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
 }
 
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
                spin_lock(&root->fs_info->trans_lock);
                list_for_each_entry_reverse(t, &root->fs_info->trans_list,
                                            list) {
-                       if (t->in_commit) {
-                               if (t->commit_done)
+                       if (t->state >= TRANS_STATE_COMMIT_START) {
+                               if (t->state == TRANS_STATE_COMPLETED)
                                        break;
                                cur_trans = t;
                                atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
 static int should_end_transaction(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
 {
-       int ret;
+       if (root->fs_info->global_block_rsv.space_info->full &&
+           btrfs_should_throttle_delayed_refs(trans, root))
+               return 1;
 
-       ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
-       return ret ? 1 : 0;
+       return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
 }
 
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
        int err;
 
        smp_mb();
-       if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+       if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+           cur_trans->delayed_refs.flushing)
                return 1;
 
        updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *info = root->fs_info;
-       int count = 0;
+       unsigned long cur = trans->delayed_ref_updates;
        int lock = (trans->type != TRANS_JOIN_NOLOCK);
        int err = 0;
 
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
-       while (count < 1) {
-               unsigned long cur = trans->delayed_ref_updates;
+       trans->delayed_ref_updates = 0;
+       if (btrfs_should_throttle_delayed_refs(trans, root)) {
+               cur = max_t(unsigned long, cur, 1);
                trans->delayed_ref_updates = 0;
-               if (cur &&
-                   trans->transaction->delayed_refs.num_heads_ready > 64) {
-                       trans->delayed_ref_updates = 0;
-                       btrfs_run_delayed_refs(trans, root, cur);
-               } else {
-                       break;
-               }
-               count++;
+               btrfs_run_delayed_refs(trans, root, cur);
        }
 
        btrfs_trans_release_metadata(trans, root);
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                btrfs_create_pending_block_groups(trans, root);
 
        if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
-           should_end_transaction(trans, root)) {
-               trans->transaction->blocked = 1;
-               smp_wmb();
+           should_end_transaction(trans, root) &&
+           ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+               spin_lock(&info->trans_lock);
+               if (cur_trans->state == TRANS_STATE_RUNNING)
+                       cur_trans->state = TRANS_STATE_BLOCKED;
+               spin_unlock(&info->trans_lock);
        }
 
-       if (lock && cur_trans->blocked && !cur_trans->in_commit) {
+       if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
                if (throttle) {
                        /*
                         * We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                }
        }
 
-       if (trans->type < TRANS_JOIN_NOLOCK)
+       if (trans->type & __TRANS_FREEZABLE)
                sb_end_intwrite(root->fs_info->sb);
 
        WARN_ON(cur_trans != info->running_transaction);
        WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
        atomic_dec(&cur_trans->num_writers);
+       extwriter_counter_dec(cur_trans, trans->type);
 
        smp_mb();
        if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
        struct extent_state *cached_state = NULL;
        u64 start = 0;
        u64 end;
-       struct blk_plug plug;
 
-       blk_start_plug(&plug);
        while (!find_first_extent_bit(dirty_pages, start, &start, &end,
                                      mark, &cached_state)) {
                convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
        }
        if (err)
                werr = err;
-       blk_finish_plug(&plug);
        return werr;
 }
 
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 {
        int ret;
        int ret2;
+       struct blk_plug plug;
 
+       blk_start_plug(&plug);
        ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+       blk_finish_plug(&plug);
        ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
 
        if (ret)
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
 {
+       struct btrfs_transaction *trans;
        int ret = 0;
+
        spin_lock(&info->trans_lock);
-       if (info->running_transaction)
-               ret = info->running_transaction->in_commit;
+       trans = info->running_transaction;
+       if (trans)
+               ret = (trans->state >= TRANS_STATE_COMMIT_START);
        spin_unlock(&info->trans_lock);
        return ret;
 }
 
 int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 {
+       struct btrfs_transaction *trans;
        int ret = 0;
+
        spin_lock(&info->trans_lock);
-       if (info->running_transaction)
-               ret = info->running_transaction->blocked;
+       trans = info->running_transaction;
+       if (trans)
+               ret = is_transaction_blocked(trans);
        spin_unlock(&info->trans_lock);
        return ret;
 }
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 static void wait_current_trans_commit_start(struct btrfs_root *root,
                                            struct btrfs_transaction *trans)
 {
-       wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
+       wait_event(root->fs_info->transaction_blocked_wait,
+                  trans->state >= TRANS_STATE_COMMIT_START ||
+                  trans->aborted);
 }
 
 /*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
                                         struct btrfs_transaction *trans)
 {
        wait_event(root->fs_info->transaction_wait,
-                  trans->commit_done || (trans->in_commit && !trans->blocked));
+                  trans->state >= TRANS_STATE_UNBLOCKED ||
+                  trans->aborted);
 }
 
 /*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 
        spin_lock(&root->fs_info->trans_lock);
 
-       if (list_empty(&cur_trans->list)) {
-               spin_unlock(&root->fs_info->trans_lock);
-               btrfs_end_transaction(trans, root);
-               return;
-       }
+       /*
+        * If the transaction is removed from the list, it means this
+        * transaction has been committed successfully, so it is impossible
+        * to call the cleanup function.
+        */
+       BUG_ON(list_empty(&cur_trans->list));
 
        list_del_init(&cur_trans->list);
        if (cur_trans == root->fs_info->running_transaction) {
-               root->fs_info->trans_no_join = 1;
+               cur_trans->state = TRANS_STATE_COMMIT_DOING;
                spin_unlock(&root->fs_info->trans_lock);
                wait_event(cur_trans->writer_wait,
                           atomic_read(&cur_trans->num_writers) == 1);
 
                spin_lock(&root->fs_info->trans_lock);
-               root->fs_info->running_transaction = NULL;
        }
        spin_unlock(&root->fs_info->trans_lock);
 
        btrfs_cleanup_one_transaction(trans->transaction, root);
 
+       spin_lock(&root->fs_info->trans_lock);
+       if (cur_trans == root->fs_info->running_transaction)
+               root->fs_info->running_transaction = NULL;
+       spin_unlock(&root->fs_info->trans_lock);
+
        put_transaction(cur_trans);
        put_transaction(cur_trans);
 
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
                current->journal_info = NULL;
 
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
-
-       spin_lock(&root->fs_info->trans_lock);
-       root->fs_info->trans_no_join = 0;
-       spin_unlock(&root->fs_info->trans_lock);
 }
 
 static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root)
 {
-       int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
-       int snap_pending = 0;
        int ret;
 
-       if (!flush_on_commit) {
-               spin_lock(&root->fs_info->trans_lock);
-               if (!list_empty(&trans->transaction->pending_snapshots))
-                       snap_pending = 1;
-               spin_unlock(&root->fs_info->trans_lock);
-       }
-
-       if (flush_on_commit || snap_pending) {
-               ret = btrfs_start_delalloc_inodes(root, 1);
-               if (ret)
-                       return ret;
-               btrfs_wait_ordered_extents(root, 1);
-       }
-
        ret = btrfs_run_delayed_items(trans, root);
        if (ret)
                return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-/*
- * btrfs_transaction state sequence:
- *    in_commit = 0, blocked = 0  (initial)
- *    in_commit = 1, blocked = 1
- *    blocked = 0
- *    commit_done = 1
- */
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+       if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+               return btrfs_start_all_delalloc_inodes(fs_info, 1);
+       return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+       if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+               btrfs_wait_all_ordered_extents(fs_info, 1);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
-       unsigned long joined = 0;
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_transaction *prev_trans = NULL;
-       DEFINE_WAIT(wait);
        int ret;
-       int should_grow = 0;
-       unsigned long now = get_seconds();
 
        ret = btrfs_run_ordered_operations(trans, root, 0);
        if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         * start sending their work down.
         */
        cur_trans->delayed_refs.flushing = 1;
+       smp_wmb();
 
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       spin_lock(&cur_trans->commit_lock);
-       if (cur_trans->in_commit) {
-               spin_unlock(&cur_trans->commit_lock);
+       spin_lock(&root->fs_info->trans_lock);
+       if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+               spin_unlock(&root->fs_info->trans_lock);
                atomic_inc(&cur_trans->use_count);
                ret = btrfs_end_transaction(trans, root);
 
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       trans->transaction->in_commit = 1;
-       trans->transaction->blocked = 1;
-       spin_unlock(&cur_trans->commit_lock);
+       cur_trans->state = TRANS_STATE_COMMIT_START;
        wake_up(&root->fs_info->transaction_blocked_wait);
 
-       spin_lock(&root->fs_info->trans_lock);
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
-               if (!prev_trans->commit_done) {
+               if (prev_trans->state != TRANS_STATE_COMPLETED) {
                        atomic_inc(&prev_trans->use_count);
                        spin_unlock(&root->fs_info->trans_lock);
 
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                spin_unlock(&root->fs_info->trans_lock);
        }
 
-       if (!btrfs_test_opt(root, SSD) &&
-           (now < cur_trans->start_time || now - cur_trans->start_time < 1))
-               should_grow = 1;
-
-       do {
-               joined = cur_trans->num_joined;
-
-               WARN_ON(cur_trans != trans->transaction);
-
-               ret = btrfs_flush_all_pending_stuffs(trans, root);
-               if (ret)
-                       goto cleanup_transaction;
+       extwriter_counter_dec(cur_trans, trans->type);
 
-               prepare_to_wait(&cur_trans->writer_wait, &wait,
-                               TASK_UNINTERRUPTIBLE);
+       ret = btrfs_start_delalloc_flush(root->fs_info);
+       if (ret)
+               goto cleanup_transaction;
 
-               if (atomic_read(&cur_trans->num_writers) > 1)
-                       schedule_timeout(MAX_SCHEDULE_TIMEOUT);
-               else if (should_grow)
-                       schedule_timeout(1);
+       ret = btrfs_flush_all_pending_stuffs(trans, root);
+       if (ret)
+               goto cleanup_transaction;
 
-               finish_wait(&cur_trans->writer_wait, &wait);
-       } while (atomic_read(&cur_trans->num_writers) > 1 ||
-                (should_grow && cur_trans->num_joined != joined));
+       wait_event(cur_trans->writer_wait,
+                  extwriter_counter_read(cur_trans) == 0);
 
+       /* some pending stuffs might be added after the previous flush. */
        ret = btrfs_flush_all_pending_stuffs(trans, root);
        if (ret)
                goto cleanup_transaction;
 
+       btrfs_wait_delalloc_flush(root->fs_info);
        /*
         * Ok now we need to make sure to block out any other joins while we
         * commit the transaction.  We could have started a join before setting
-        * no_join so make sure to wait for num_writers to == 1 again.
+        * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
         */
        spin_lock(&root->fs_info->trans_lock);
-       root->fs_info->trans_no_join = 1;
+       cur_trans->state = TRANS_STATE_COMMIT_DOING;
        spin_unlock(&root->fs_info->trans_lock);
        wait_event(cur_trans->writer_wait,
                   atomic_read(&cur_trans->num_writers) == 1);
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
               sizeof(*root->fs_info->super_copy));
 
-       trans->transaction->blocked = 0;
        spin_lock(&root->fs_info->trans_lock);
+       cur_trans->state = TRANS_STATE_UNBLOCKED;
        root->fs_info->running_transaction = NULL;
-       root->fs_info->trans_no_join = 0;
        spin_unlock(&root->fs_info->trans_lock);
        mutex_unlock(&root->fs_info->reloc_mutex);
 
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_finish_extent_commit(trans, root);
 
-       cur_trans->commit_done = 1;
-
        root->fs_info->last_trans_committed = cur_trans->transid;
-
+       /*
+        * We needn't acquire the lock here because there is no other task
+        * which can change it.
+        */
+       cur_trans->state = TRANS_STATE_COMPLETED;
        wake_up(&cur_trans->commit_wait);
 
        spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        put_transaction(cur_trans);
        put_transaction(cur_trans);
 
-       if (trans->type < TRANS_JOIN_NOLOCK)
+       if (trans->type & __TRANS_FREEZABLE)
                sb_end_intwrite(root->fs_info->sb);
 
        trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       if (fs_info->sb->s_flags & MS_RDONLY) {
-               pr_debug("btrfs: cleaner called for RO fs!\n");
-               return 0;
-       }
-
        spin_lock(&fs_info->trans_lock);
        if (list_empty(&fs_info->dead_roots)) {
                spin_unlock(&fs_info->trans_lock);
index 24c97335a59ffe061b0df684baeb69837aa75c77..005b0375d18cfc6131a4480a9e67e0cbe44ed46a 100644 (file)
 #include "delayed-ref.h"
 #include "ctree.h"
 
+enum btrfs_trans_state {
+       TRANS_STATE_RUNNING             = 0,
+       TRANS_STATE_BLOCKED             = 1,
+       TRANS_STATE_COMMIT_START        = 2,
+       TRANS_STATE_COMMIT_DOING        = 3,
+       TRANS_STATE_UNBLOCKED           = 4,
+       TRANS_STATE_COMPLETED           = 5,
+       TRANS_STATE_MAX                 = 6,
+};
+
 struct btrfs_transaction {
        u64 transid;
+       /*
+        * total external writers(USERSPACE/START/ATTACH) in this
+        * transaction, it must be zero before the transaction is
+        * being committed
+        */
+       atomic_t num_extwriters;
        /*
         * total writers in this transaction, it must be zero before the
         * transaction can end
@@ -31,12 +47,8 @@ struct btrfs_transaction {
        atomic_t num_writers;
        atomic_t use_count;
 
-       unsigned long num_joined;
-
-       spinlock_t commit_lock;
-       int in_commit;
-       int commit_done;
-       int blocked;
+       /* Be protected by fs_info->trans_lock when we want to change it. */
+       enum btrfs_trans_state state;
        struct list_head list;
        struct extent_io_tree dirty_pages;
        unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
        wait_queue_head_t commit_wait;
        struct list_head pending_snapshots;
        struct list_head ordered_operations;
+       struct list_head pending_chunks;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
 };
 
-enum btrfs_trans_type {
-       TRANS_START,
-       TRANS_JOIN,
-       TRANS_USERSPACE,
-       TRANS_JOIN_NOLOCK,
-       TRANS_ATTACH,
-};
+#define __TRANS_FREEZABLE      (1U << 0)
+
+#define __TRANS_USERSPACE      (1U << 8)
+#define __TRANS_START          (1U << 9)
+#define __TRANS_ATTACH         (1U << 10)
+#define __TRANS_JOIN           (1U << 11)
+#define __TRANS_JOIN_NOLOCK    (1U << 12)
+
+#define TRANS_USERSPACE                (__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START            (__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH           (__TRANS_ATTACH)
+#define TRANS_JOIN             (__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK      (__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS       (__TRANS_USERSPACE | __TRANS_START |    \
+                                __TRANS_ATTACH)
 
 struct btrfs_trans_handle {
        u64 transid;
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
        short aborted;
        short adding_csums;
        bool allocating_chunk;
-       enum btrfs_trans_type type;
+       unsigned int type;
        /*
         * this root is only needed to validate that the root passed to
         * start_transaction is the same as the one passed to end_transaction.
index c276ac9a0ec338c86973a752d74b9e93d22cc9a8..2c6791493637250423ac2c9bc21d9ca3f752ed11 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <linux/list_sort.h>
 #include "ctree.h"
 #include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
 {
        int ret = 0;
 
+       /*
+        * If this fs is mixed then we need to be able to process the leaves to
+        * pin down any logged extents, so we have to read the block.
+        */
+       if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+               ret = btrfs_read_buffer(eb, gen);
+               if (ret)
+                       return ret;
+       }
+
        if (wc->pin)
                ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
                                                      eb->start, eb->len);
 
        if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+               if (wc->pin && btrfs_header_level(eb) == 0)
+                       ret = btrfs_exclude_logged_extents(log, eb);
                if (wc->write)
                        btrfs_write_tree_block(eb);
                if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                                             eb, i, &key);
                        if (ret)
                                break;
-               } else if (key.type == BTRFS_INODE_REF_KEY) {
-                       ret = add_inode_ref(wc->trans, root, log, path,
-                                           eb, i, &key);
-                       if (ret && ret != -ENOENT)
-                               break;
-                       ret = 0;
-               } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+               } else if (key.type == BTRFS_INODE_REF_KEY ||
+                          key.type == BTRFS_INODE_EXTREF_KEY) {
                        ret = add_inode_ref(wc->trans, root, log, path,
                                            eb, i, &key);
                        if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        struct btrfs_root *log = root->log_root;
        struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
        unsigned long log_transid = 0;
+       struct blk_plug plug;
 
        mutex_lock(&root->log_mutex);
        log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        /* we start IO on  all the marked extents here, but we don't actually
         * wait for them until later.
         */
+       blk_start_plug(&plug);
        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
        if (ret) {
+               blk_finish_plug(&plug);
                btrfs_abort_transaction(trans, root, ret);
                btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        }
 
        if (ret) {
+               blk_finish_plug(&plug);
                if (ret != -ENOSPC) {
                        btrfs_abort_transaction(trans, root, ret);
                        mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        index2 = log_root_tree->log_transid % 2;
        if (atomic_read(&log_root_tree->log_commit[index2])) {
+               blk_finish_plug(&plug);
                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
                wait_log_commit(trans, log_root_tree,
                                log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * check the full commit flag again
         */
        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+               blk_finish_plug(&plug);
                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
                btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                goto out_wake_log_root;
        }
 
-       ret = btrfs_write_and_wait_marked_extents(log_root_tree,
-                               &log_root_tree->dirty_log_pages,
-                               EXTENT_DIRTY | EXTENT_NEW);
+       ret = btrfs_write_marked_extents(log_root_tree,
+                                        &log_root_tree->dirty_log_pages,
+                                        EXTENT_DIRTY | EXTENT_NEW);
+       blk_finish_plug(&plug);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                goto out_wake_log_root;
        }
        btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+       btrfs_wait_marked_extents(log_root_tree,
+                                 &log_root_tree->dirty_log_pages,
+                                 EXTENT_NEW | EXTENT_DIRTY);
        btrfs_wait_logged_extents(log, log_transid);
 
        btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -4016,8 +4034,7 @@ again:
                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                        break;
 
-               log = btrfs_read_fs_root_no_radix(log_root_tree,
-                                                 &found_key);
+               log = btrfs_read_fs_root(log_root_tree, &found_key);
                if (IS_ERR(log)) {
                        ret = PTR_ERR(log);
                        btrfs_error(fs_info, ret,
index 7b417e20efe26da50a83a1a695e62491d3077f85..b0a523b2c60ee8e73cd5165382892918ddad269e 100644 (file)
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
                u64 new_alloced = ulist->nodes_alloced + 128;
                struct ulist_node *new_nodes;
                void *old = NULL;
+               int i;
+
+               for (i = 0; i < ulist->nnodes; i++)
+                       rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
 
                /*
                 * if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 
                ulist->nodes = new_nodes;
                ulist->nodes_alloced = new_alloced;
+
+               /*
+                * krealloc actually uses memcpy, which does not copy rb_node
+                * pointers, so we have to do it ourselves.  Otherwise we may
+                * be bitten by crashes.
+                */
+               for (i = 0; i < ulist->nnodes; i++) {
+                       ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
+                       if (ret < 0)
+                               return ret;
+               }
        }
        ulist->nodes[ulist->nnodes].val = val;
        ulist->nodes[ulist->nnodes].aux = aux;
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644 (file)
index 9bf3946..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __BTRFS_VERSION_H
-#define __BTRFS_VERSION_H
-#define BTRFS_BUILD_VERSION "Btrfs"
-#endif
index 8bffb9174afba04d8375b96f754256b68ff9b4ef..78b871753cb61e099abdfca27a0e316c37c329ee 100644 (file)
@@ -982,6 +982,35 @@ out:
        return ret;
 }
 
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+                                  struct btrfs_device *device,
+                                  u64 *start, u64 len)
+{
+       struct extent_map *em;
+       int ret = 0;
+
+       list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+               struct map_lookup *map;
+               int i;
+
+               map = (struct map_lookup *)em->bdev;
+               for (i = 0; i < map->num_stripes; i++) {
+                       if (map->stripes[i].dev != device)
+                               continue;
+                       if (map->stripes[i].physical >= *start + len ||
+                           map->stripes[i].physical + em->orig_block_len <=
+                           *start)
+                               continue;
+                       *start = map->stripes[i].physical +
+                               em->orig_block_len;
+                       ret = 1;
+               }
+       }
+
+       return ret;
+}
+
+
 /*
  * find_free_dev_extent - find free space in the specified device
  * @device:    the device which we search the free space in
@@ -1002,7 +1031,8 @@ out:
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
  */
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
 {
        struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
 
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+again:
        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;
 
        if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
                ret = -ENOSPC;
-               goto error;
+               goto out;
        }
 
-       path = btrfs_alloc_path();
-       if (!path) {
-               ret = -ENOMEM;
-               goto error;
-       }
        path->reada = 2;
+       path->search_commit_root = 1;
+       path->skip_locking = 1;
 
        key.objectid = device->devid;
        key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;
 
+                       /*
+                        * Have to check before we set max_hole_start, otherwise
+                        * we could end up sending back this offset anyway.
+                        */
+                       if (contains_pending_extent(trans, device,
+                                                   &search_start,
+                                                   hole_size))
+                               hole_size = 0;
+
                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
                max_hole_size = hole_size;
        }
 
+       if (contains_pending_extent(trans, device, &search_start, hole_size)) {
+               btrfs_release_path(path);
+               goto again;
+       }
+
        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
 
 out:
        btrfs_free_path(path);
-error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
@@ -1244,47 +1288,22 @@ out:
        return ret;
 }
 
-static noinline int find_next_chunk(struct btrfs_root *root,
-                                   u64 objectid, u64 *offset)
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
 {
-       struct btrfs_path *path;
-       int ret;
-       struct btrfs_key key;
-       struct btrfs_chunk *chunk;
-       struct btrfs_key found_key;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       key.objectid = objectid;
-       key.offset = (u64)-1;
-       key.type = BTRFS_CHUNK_ITEM_KEY;
-
-       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-       if (ret < 0)
-               goto error;
-
-       BUG_ON(ret == 0); /* Corruption */
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+       struct rb_node *n;
+       u64 ret = 0;
 
-       ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
-       if (ret) {
-               *offset = 0;
-       } else {
-               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                     path->slots[0]);
-               if (found_key.objectid != objectid)
-                       *offset = 0;
-               else {
-                       chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
-                                              struct btrfs_chunk);
-                       *offset = found_key.offset +
-                               btrfs_chunk_length(path->nodes[0], chunk);
-               }
+       em_tree = &fs_info->mapping_tree.map_tree;
+       read_lock(&em_tree->lock);
+       n = rb_last(&em_tree->map);
+       if (n) {
+               em = rb_entry(n, struct extent_map, rb_node);
+               ret = em->start + em->len;
        }
-       ret = 0;
-error:
-       btrfs_free_path(path);
+       read_unlock(&em_tree->lock);
+
        return ret;
 }
 
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
 
        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
-               printk(KERN_ERR "btrfs: unable to go below four devices "
-                      "on raid10\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
                goto out;
        }
 
        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
-               printk(KERN_ERR "btrfs: unable to go below two "
-                      "devices on raid1\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
                goto out;
        }
 
        if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
            root->fs_info->fs_devices->rw_devices <= 2) {
-               printk(KERN_ERR "btrfs: unable to go below two "
-                      "devices on raid5\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
                goto out;
        }
        if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
            root->fs_info->fs_devices->rw_devices <= 3) {
-               printk(KERN_ERR "btrfs: unable to go below three "
-                      "devices on raid6\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
                goto out;
        }
 
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                bh = NULL;
                disk_super = NULL;
                if (!device) {
-                       printk(KERN_ERR "btrfs: no missing devices found to "
-                              "remove\n");
+                       ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
                        goto out;
                }
        } else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        }
 
        if (device->is_tgtdev_for_dev_replace) {
-               pr_err("btrfs: unable to remove the dev_replace target dev\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_TGT_REPLACE;
                goto error_brelse;
        }
 
        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-               printk(KERN_ERR "btrfs: unable to remove the only writeable "
-                      "device\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
                goto error_brelse;
        }
 
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
        }
 
        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
-       if (IS_ERR(tsk))
-               return PTR_ERR(tsk);
-
-       return 0;
+       return PTR_RET(tsk);
 }
 
 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 }
 
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *extent_root,
-                              struct map_lookup **map_ret,
-                              u64 *num_bytes_out, u64 *stripe_size_out,
-                              u64 start, u64 type)
+                              struct btrfs_root *extent_root, u64 start,
+                              u64 type)
 {
        struct btrfs_fs_info *info = extent_root->fs_info;
        struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                if (total_avail == 0)
                        continue;
 
-               ret = find_free_dev_extent(device,
+               ret = find_free_dev_extent(trans, device,
                                           max_stripe_size * dev_stripes,
                                           &dev_offset, &max_avail);
                if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        map->type = type;
        map->sub_stripes = sub_stripes;
 
-       *map_ret = map;
        num_bytes = stripe_size * data_stripes;
 
-       *stripe_size_out = stripe_size;
-       *num_bytes_out = num_bytes;
-
        trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
 
        em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        em->len = num_bytes;
        em->block_start = 0;
        em->block_len = em->len;
+       em->orig_block_len = stripe_size;
 
        em_tree = &extent_root->fs_info->mapping_tree.map_tree;
        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em, 0);
+       if (!ret) {
+               list_add_tail(&em->list, &trans->transaction->pending_chunks);
+               atomic_inc(&em->refs);
+       }
        write_unlock(&em_tree->lock);
        if (ret) {
                free_extent_map(em);
                goto error;
        }
 
-       for (i = 0; i < map->num_stripes; ++i) {
-               struct btrfs_device *device;
-               u64 dev_offset;
-
-               device = map->stripes[i].dev;
-               dev_offset = map->stripes[i].physical;
-
-               ret = btrfs_alloc_dev_extent(trans, device,
-                               info->chunk_root->root_key.objectid,
-                               BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                               start, dev_offset, stripe_size);
-               if (ret)
-                       goto error_dev_extent;
-       }
-
        ret = btrfs_make_block_group(trans, extent_root, 0, type,
                                     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
                                     start, num_bytes);
-       if (ret) {
-               i = map->num_stripes - 1;
-               goto error_dev_extent;
-       }
+       if (ret)
+               goto error_del_extent;
 
        free_extent_map(em);
        check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        kfree(devices_info);
        return 0;
 
-error_dev_extent:
-       for (; i >= 0; i--) {
-               struct btrfs_device *device;
-               int err;
-
-               device = map->stripes[i].dev;
-               err = btrfs_free_dev_extent(trans, device, start);
-               if (err) {
-                       btrfs_abort_transaction(trans, extent_root, err);
-                       break;
-               }
-       }
+error_del_extent:
        write_lock(&em_tree->lock);
        remove_extent_mapping(em_tree, em);
        write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
        return ret;
 }
 
-static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
                                struct btrfs_root *extent_root,
-                               struct map_lookup *map, u64 chunk_offset,
-                               u64 chunk_size, u64 stripe_size)
+                               u64 chunk_offset, u64 chunk_size)
 {
-       u64 dev_offset;
        struct btrfs_key key;
        struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
        struct btrfs_device *device;
        struct btrfs_chunk *chunk;
        struct btrfs_stripe *stripe;
-       size_t item_size = btrfs_chunk_item_size(map->num_stripes);
-       int index = 0;
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+       struct map_lookup *map;
+       size_t item_size;
+       u64 dev_offset;
+       u64 stripe_size;
+       int i = 0;
        int ret;
 
+       em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+       read_unlock(&em_tree->lock);
+
+       if (!em) {
+               btrfs_crit(extent_root->fs_info, "unable to find logical "
+                          "%Lu len %Lu", chunk_offset, chunk_size);
+               return -EINVAL;
+       }
+
+       if (em->start != chunk_offset || em->len != chunk_size) {
+               btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+                         " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+                         chunk_size, em->start, em->len);
+               free_extent_map(em);
+               return -EINVAL;
+       }
+
+       map = (struct map_lookup *)em->bdev;
+       item_size = btrfs_chunk_item_size(map->num_stripes);
+       stripe_size = em->orig_block_len;
+
        chunk = kzalloc(item_size, GFP_NOFS);
-       if (!chunk)
-               return -ENOMEM;
+       if (!chunk) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < map->num_stripes; i++) {
+               device = map->stripes[i].dev;
+               dev_offset = map->stripes[i].physical;
 
-       index = 0;
-       while (index < map->num_stripes) {
-               device = map->stripes[index].dev;
                device->bytes_used += stripe_size;
                ret = btrfs_update_device(trans, device);
                if (ret)
-                       goto out_free;
-               index++;
+                       goto out;
+               ret = btrfs_alloc_dev_extent(trans, device,
+                                            chunk_root->root_key.objectid,
+                                            BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+                                            chunk_offset, dev_offset,
+                                            stripe_size);
+               if (ret)
+                       goto out;
        }
 
        spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
                                                   map->num_stripes);
        spin_unlock(&extent_root->fs_info->free_chunk_lock);
 
-       index = 0;
        stripe = &chunk->stripe;
-       while (index < map->num_stripes) {
-               device = map->stripes[index].dev;
-               dev_offset = map->stripes[index].physical;
+       for (i = 0; i < map->num_stripes; i++) {
+               device = map->stripes[i].dev;
+               dev_offset = map->stripes[i].physical;
 
                btrfs_set_stack_stripe_devid(stripe, device->devid);
                btrfs_set_stack_stripe_offset(stripe, dev_offset);
                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
                stripe++;
-               index++;
        }
 
        btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
        key.offset = chunk_offset;
 
        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-
        if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                /*
                 * TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
                                             item_size);
        }
 
-out_free:
+out:
        kfree(chunk);
+       free_extent_map(em);
        return ret;
 }
 
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                      struct btrfs_root *extent_root, u64 type)
 {
        u64 chunk_offset;
-       u64 chunk_size;
-       u64 stripe_size;
-       struct map_lookup *map;
-       struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
-       int ret;
-
-       ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                             &chunk_offset);
-       if (ret)
-               return ret;
 
-       ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
-                                 &stripe_size, chunk_offset, type);
-       if (ret)
-               return ret;
-
-       ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
-                                  chunk_size, stripe_size);
-       if (ret)
-               return ret;
-       return 0;
+       chunk_offset = find_next_chunk(extent_root->fs_info);
+       return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
 }
 
 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 {
        u64 chunk_offset;
        u64 sys_chunk_offset;
-       u64 chunk_size;
-       u64 sys_chunk_size;
-       u64 stripe_size;
-       u64 sys_stripe_size;
        u64 alloc_profile;
-       struct map_lookup *map;
-       struct map_lookup *sys_map;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *extent_root = fs_info->extent_root;
        int ret;
 
-       ret = find_next_chunk(fs_info->chunk_root,
-                             BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
-       if (ret)
-               return ret;
-
+       chunk_offset = find_next_chunk(fs_info);
        alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
-       ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
-                                 &stripe_size, chunk_offset, alloc_profile);
+       ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+                                 alloc_profile);
        if (ret)
                return ret;
 
-       sys_chunk_offset = chunk_offset + chunk_size;
-
+       sys_chunk_offset = find_next_chunk(root->fs_info);
        alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
-       ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
-                                 &sys_chunk_size, &sys_stripe_size,
-                                 sys_chunk_offset, alloc_profile);
+       ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+                                 alloc_profile);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto out;
        }
 
        ret = btrfs_add_device(trans, fs_info->chunk_root, device);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto out;
-       }
-
-       /*
-        * Modifying chunk tree needs allocating new blocks from both
-        * system block group and metadata block group. So we only can
-        * do operations require modifying the chunk tree after both
-        * block groups were created.
-        */
-       ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
-                                  chunk_size, stripe_size);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto out;
-       }
-
-       ret = __finish_chunk_alloc(trans, extent_root, sys_map,
-                                  sys_chunk_offset, sys_chunk_size,
-                                  sys_stripe_size);
        if (ret)
                btrfs_abort_transaction(trans, root, ret);
-
 out:
-
        return ret;
 }
 
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        map = (struct map_lookup *)em->bdev;
        offset = logical - em->start;
 
-       if (mirror_num > map->num_stripes)
-               mirror_num = 0;
-
        stripe_len = map->stripe_len;
        stripe_nr = offset;
        /*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
                return NULL;
        list_add(&device->dev_list,
                 &fs_devices->devices);
-       device->dev_root = root->fs_info->dev_root;
        device->devid = devid;
        device->work.func = pending_bios_fn;
        device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
        }
 
        fill_device_from_item(leaf, dev_item, device);
-       device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
        if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
        return ret;
 }
 
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list)
+               device->dev_root = fs_info->dev_root;
+       mutex_unlock(&fs_devices->device_list_mutex);
+}
+
 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
 {
        int i;
index f6247e2a47f7b643d8c88f2ee4c5b53f5c79f043..86705583480d61c9f88c46df734bb4b057abd2d8 100644 (file)
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
 int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *max_avail);
 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
 int btrfs_get_dev_stats(struct btrfs_root *root,
                        struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
 unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
                                    struct btrfs_mapping_tree *map_tree,
                                    u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *extent_root,
+                               u64 chunk_offset, u64 chunk_size);
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
                                      int index)
 {
index 38b5c1bc6776ceeeab66b77bb683c3acb23716e0..5318a3b704f6d6f908520a9c1fc18b4dadc9a509 100644 (file)
@@ -439,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        struct ceph_inode_info *ci;
        struct ceph_fs_client *fsc;
        struct ceph_osd_client *osdc;
-       loff_t page_off = page_offset(page);
-       int len = PAGE_CACHE_SIZE;
-       loff_t i_size;
-       int err = 0;
        struct ceph_snap_context *snapc, *oldest;
-       u64 snap_size = 0;
+       loff_t page_off = page_offset(page);
        long writeback_stat;
+       u64 truncate_size, snap_size = 0;
+       u32 truncate_seq;
+       int err = 0, len = PAGE_CACHE_SIZE;
 
        dout("writepage %p idx %lu\n", page, page->index);
 
@@ -475,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        }
        ceph_put_snap_context(oldest);
 
+       spin_lock(&ci->i_ceph_lock);
+       truncate_seq = ci->i_truncate_seq;
+       truncate_size = ci->i_truncate_size;
+       if (!snap_size)
+               snap_size = i_size_read(inode);
+       spin_unlock(&ci->i_ceph_lock);
+
        /* is this a partial page at end of file? */
-       if (snap_size)
-               i_size = snap_size;
-       else
-               i_size = i_size_read(inode);
-       if (i_size < page_off + len)
-               len = i_size - page_off;
+       if (page_off >= snap_size) {
+               dout("%p page eof %llu\n", page, snap_size);
+               goto out;
+       }
+       if (snap_size < page_off + len)
+               len = snap_size - page_off;
 
        dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
             inode, page, page->index, page_off, len, snapc);
@@ -495,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        err = ceph_osdc_writepages(osdc, ceph_vino(inode),
                                   &ci->i_layout, snapc,
                                   page_off, len,
-                                  ci->i_truncate_seq, ci->i_truncate_size,
+                                  truncate_seq, truncate_size,
                                   &inode->i_mtime, &page, 1);
        if (err < 0) {
                dout("writepage setting page/mapping error %d %p\n", err, page);
@@ -632,25 +638,6 @@ static void writepages_finish(struct ceph_osd_request *req,
        ceph_osdc_put_request(req);
 }
 
-static struct ceph_osd_request *
-ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
-                               struct ceph_snap_context *snapc, int num_ops)
-{
-       struct ceph_fs_client *fsc;
-       struct ceph_inode_info *ci;
-       struct ceph_vino vino;
-
-       fsc = ceph_inode_to_client(inode);
-       ci = ceph_inode(inode);
-       vino = ceph_vino(inode);
-       /* BUG_ON(vino.snap != CEPH_NOSNAP); */
-
-       return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-                       vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
-                       CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
-                       snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
-}
-
 /*
  * initiate async writeback
  */
@@ -659,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 {
        struct inode *inode = mapping->host;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_fs_client *fsc;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+       struct ceph_vino vino = ceph_vino(inode);
        pgoff_t index, start, end;
        int range_whole = 0;
        int should_loop = 1;
@@ -671,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
        unsigned wsize = 1 << inode->i_blkbits;
        struct ceph_osd_request *req = NULL;
        int do_sync;
-       u64 snap_size;
+       u64 truncate_size, snap_size;
+       u32 truncate_seq;
 
        /*
         * Include a 'sync' in the OSD request if this is a data
         * integrity write (e.g., O_SYNC write or fsync()), or if our
         * cap is being revoked.
         */
-       do_sync = wbc->sync_mode == WB_SYNC_ALL;
-       if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+       if ((wbc->sync_mode == WB_SYNC_ALL) ||
+               ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
                do_sync = 1;
        dout("writepages_start %p dosync=%d (mode=%s)\n",
             inode, do_sync,
             wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-       fsc = ceph_inode_to_client(inode);
        if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
                pr_warning("writepage_start %p on forced umount\n", inode);
                return -EIO; /* we're in a forced umount, don't write! */
@@ -729,6 +717,14 @@ retry:
                snap_size = i_size_read(inode);
        dout(" oldest snapc is %p seq %lld (%d snaps)\n",
             snapc, snapc->seq, snapc->num_snaps);
+
+       spin_lock(&ci->i_ceph_lock);
+       truncate_seq = ci->i_truncate_seq;
+       truncate_size = ci->i_truncate_size;
+       if (!snap_size)
+               snap_size = i_size_read(inode);
+       spin_unlock(&ci->i_ceph_lock);
+
        if (last_snapc && snapc != last_snapc) {
                /* if we switched to a newer snapc, restart our scan at the
                 * start of the original file range. */
@@ -740,7 +736,6 @@ retry:
 
        while (!done && index <= end) {
                int num_ops = do_sync ? 2 : 1;
-               struct ceph_vino vino;
                unsigned i;
                int first;
                pgoff_t next;
@@ -834,17 +829,18 @@ get_more_pages:
                         * that it will use.
                         */
                        if (locked_pages == 0) {
-                               size_t size;
-
                                BUG_ON(pages);
-
                                /* prepare async write request */
                                offset = (u64)page_offset(page);
                                len = wsize;
-                               req = ceph_writepages_osd_request(inode,
-                                                       offset, &len, snapc,
-                                                       num_ops);
-
+                               req = ceph_osdc_new_request(&fsc->client->osdc,
+                                                       &ci->i_layout, vino,
+                                                       offset, &len, num_ops,
+                                                       CEPH_OSD_OP_WRITE,
+                                                       CEPH_OSD_FLAG_WRITE |
+                                                       CEPH_OSD_FLAG_ONDISK,
+                                                       snapc, truncate_seq,
+                                                       truncate_size, true);
                                if (IS_ERR(req)) {
                                        rc = PTR_ERR(req);
                                        unlock_page(page);
@@ -855,8 +851,8 @@ get_more_pages:
                                req->r_inode = inode;
 
                                max_pages = calc_pages_for(0, (u64)len);
-                               size = max_pages * sizeof (*pages);
-                               pages = kmalloc(size, GFP_NOFS);
+                               pages = kmalloc(max_pages * sizeof (*pages),
+                                               GFP_NOFS);
                                if (!pages) {
                                        pool = fsc->wb_pagevec_pool;
                                        pages = mempool_alloc(pool, GFP_NOFS);
index da0f9b8a3bcb7c906d1beaca275921196860f325..25442b40c25a71761596e071612140f01279fb69 100644 (file)
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
        spin_unlock(&mdsc->caps_list_lock);
 }
 
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
                      struct ceph_cap_reservation *ctx, int need)
 {
        int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
        int have;
        int alloc = 0;
        LIST_HEAD(newcaps);
-       int ret = 0;
 
        dout("reserve caps ctx=%p need=%d\n", ctx, need);
 
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 
        for (i = have; i < need; i++) {
                cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
-               if (!cap) {
-                       ret = -ENOMEM;
-                       goto out_alloc_count;
-               }
+               if (!cap)
+                       break;
                list_add(&cap->caps_item, &newcaps);
                alloc++;
        }
-       BUG_ON(have + alloc != need);
+       /* we didn't manage to reserve as much as we needed */
+       if (have + alloc != need)
+               pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+                       ctx, need, have + alloc);
 
        spin_lock(&mdsc->caps_list_lock);
        mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
        dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
             ctx, mdsc->caps_total_count, mdsc->caps_use_count,
             mdsc->caps_reserve_count, mdsc->caps_avail_count);
-       return 0;
-
-out_alloc_count:
-       /* we didn't manage to reserve as much as we needed */
-       pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
-                  ctx, need, have);
-       return ret;
 }
 
 int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -612,9 +605,11 @@ retry:
                __cap_delay_requeue(mdsc, ci);
        }
 
-       if (flags & CEPH_CAP_FLAG_AUTH)
-               ci->i_auth_cap = cap;
-       else if (ci->i_auth_cap == cap) {
+       if (flags & CEPH_CAP_FLAG_AUTH) {
+               if (ci->i_auth_cap == NULL ||
+                   ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+                       ci->i_auth_cap = cap;
+       } else if (ci->i_auth_cap == cap) {
                ci->i_auth_cap = NULL;
                spin_lock(&mdsc->cap_dirty_lock);
                if (!list_empty(&ci->i_dirty_item)) {
@@ -695,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
                if (implemented)
                        *implemented |= cap->implemented;
        }
+       /*
+        * exclude caps issued by non-auth MDS, but are been revoking
+        * by the auth MDS. The non-auth MDS should be revoking/exporting
+        * these caps, but the message is delayed.
+        */
+       if (ci->i_auth_cap) {
+               cap = ci->i_auth_cap;
+               have &= ~cap->implemented | cap->issued;
+       }
        return have;
 }
 
@@ -802,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 /*
  * Return true if mask caps are currently being revoked by an MDS.
  */
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+                              struct ceph_cap *ocap, int mask)
 {
-       struct inode *inode = &ci->vfs_inode;
        struct ceph_cap *cap;
        struct rb_node *p;
-       int ret = 0;
 
-       spin_lock(&ci->i_ceph_lock);
        for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
                cap = rb_entry(p, struct ceph_cap, ci_node);
-               if (__cap_is_valid(cap) &&
-                   (cap->implemented & ~cap->issued & mask)) {
-                       ret = 1;
-                       break;
-               }
+               if (cap != ocap && __cap_is_valid(cap) &&
+                   (cap->implemented & ~cap->issued & mask))
+                       return 1;
        }
+       return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+       struct inode *inode = &ci->vfs_inode;
+       int ret;
+
+       spin_lock(&ci->i_ceph_lock);
+       ret = __ceph_caps_revoking_other(ci, NULL, mask);
        spin_unlock(&ci->i_ceph_lock);
        dout("ceph_caps_revoking %p %s = %d\n", inode,
             ceph_cap_string(mask), ret);
@@ -1980,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
        cap = ci->i_auth_cap;
        dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
             ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
        __ceph_flush_snaps(ci, &session, 1);
+
        if (ci->i_flushing_caps) {
+               spin_lock(&mdsc->cap_dirty_lock);
+               list_move_tail(&ci->i_flushing_item,
+                              &cap->session->s_cap_flushing);
+               spin_unlock(&mdsc->cap_dirty_lock);
+
                delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
                                     __ceph_caps_used(ci),
                                     __ceph_caps_wanted(ci),
@@ -2055,7 +2072,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
        /* finish pending truncate */
        while (ci->i_truncate_pending) {
                spin_unlock(&ci->i_ceph_lock);
-               __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR));
+               if (!(need & CEPH_CAP_FILE_WR))
+                       mutex_lock(&inode->i_mutex);
+               __ceph_do_pending_vmtruncate(inode);
+               if (!(need & CEPH_CAP_FILE_WR))
+                       mutex_unlock(&inode->i_mutex);
                spin_lock(&ci->i_ceph_lock);
        }
 
@@ -2473,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        } else {
                dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
                     ceph_cap_string(newcaps));
+               /* non-auth MDS is revoking the newly grant caps ? */
+               if (cap == ci->i_auth_cap &&
+                   __ceph_caps_revoking_other(ci, cap, newcaps))
+                   check_caps = 2;
+
                cap->issued = newcaps;
                cap->implemented |= newcaps; /* add bits only, to
                                              * avoid stepping on a
@@ -3042,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
                     (cap->issued & unless) == 0)) {
                        if ((cap->issued & drop) &&
                            (cap->issued & unless) == 0) {
-                               dout("encode_inode_release %p cap %p %s -> "
-                                    "%s\n", inode, cap,
+                               int wanted = __ceph_caps_wanted(ci);
+                               if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+                                       wanted |= cap->mds_wanted;
+                               dout("encode_inode_release %p cap %p "
+                                    "%s -> %s, wanted %s -> %s\n", inode, cap,
                                     ceph_cap_string(cap->issued),
-                                    ceph_cap_string(cap->issued & ~drop));
+                                    ceph_cap_string(cap->issued & ~drop),
+                                    ceph_cap_string(cap->mds_wanted),
+                                    ceph_cap_string(wanted));
+
                                cap->issued &= ~drop;
                                cap->implemented &= ~drop;
-                               if (ci->i_ceph_flags & CEPH_I_NODELAY) {
-                                       int wanted = __ceph_caps_wanted(ci);
-                                       dout("  wanted %s -> %s (act %s)\n",
-                                            ceph_cap_string(cap->mds_wanted),
-                                            ceph_cap_string(cap->mds_wanted &
-                                                            ~wanted),
-                                            ceph_cap_string(wanted));
-                                       cap->mds_wanted &= wanted;
-                               }
+                               cap->mds_wanted = wanted;
                        } else {
                                dout("encode_inode_release %p cap %p %s"
                                     " (force)\n", inode, cap,
index 16c989d3e23c762e52904419a81beae943332ea0..2ddf061c1c4af730885365b07dcb9388d7af98f9 100644 (file)
@@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;
 
-       sb_start_write(inode->i_sb);
        mutex_lock(&inode->i_mutex);
        hold_mutex = true;
 
@@ -809,7 +808,6 @@ retry_snap:
 out:
        if (hold_mutex)
                mutex_unlock(&inode->i_mutex);
-       sb_end_write(inode->i_sb);
        current->backing_dev_info = NULL;
 
        return written ? written : err;
@@ -824,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
        int ret;
 
        mutex_lock(&inode->i_mutex);
-       __ceph_do_pending_vmtruncate(inode, false);
+       __ceph_do_pending_vmtruncate(inode);
 
        if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
                ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
index be0f7e20d62ed230186019d65297e8f45f2f4ab2..f3a2abf28a77df362faf5c38dc471a64dcbfdffc 100644 (file)
@@ -903,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
        } else if (realdn) {
                dout("dn %p (%d) spliced with %p (%d) "
                     "inode %p ino %llx.%llx\n",
-                    dn, dn->d_count,
-                    realdn, realdn->d_count,
+                    dn, d_count(dn),
+                    realdn, d_count(realdn),
                     realdn->d_inode, ceph_vinop(realdn->d_inode));
                dput(dn);
                dn = realdn;
@@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work)
        struct inode *inode = &ci->vfs_inode;
 
        dout("vmtruncate_work %p\n", inode);
-       __ceph_do_pending_vmtruncate(inode, true);
+       mutex_lock(&inode->i_mutex);
+       __ceph_do_pending_vmtruncate(inode);
+       mutex_unlock(&inode->i_mutex);
        iput(inode);
 }
 
@@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
  * Make sure any pending truncation is applied before doing anything
  * that may depend on it.
  */
-void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock)
+void __ceph_do_pending_vmtruncate(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        u64 to;
@@ -1525,11 +1527,7 @@ retry:
             ci->i_truncate_pending, to);
        spin_unlock(&ci->i_ceph_lock);
 
-       if (needlock)
-               mutex_lock(&inode->i_mutex);
        truncate_inode_pages(inode->i_mapping, to);
-       if (needlock)
-               mutex_unlock(&inode->i_mutex);
 
        spin_lock(&ci->i_ceph_lock);
        if (to == ci->i_truncate_size) {
@@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;
 
-       __ceph_do_pending_vmtruncate(inode, false);
+       __ceph_do_pending_vmtruncate(inode);
 
        err = inode_change_ok(inode, attr);
        if (err != 0)
@@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
             ceph_cap_string(dirtied), mask);
 
        ceph_mdsc_put_request(req);
-       __ceph_do_pending_vmtruncate(inode, false);
+       __ceph_do_pending_vmtruncate(inode);
        return err;
 out:
        spin_unlock(&ci->i_ceph_lock);
index 690f73f42425f1021727bad475fef24ae92b5f55..ae6d14e82b0f439153b5e62adfd4d0c2ebed73a1 100644 (file)
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 }
 
 /**
- * Must be called with BKL already held. Fills in the passed
+ * Must be called with lock_flocks() already held. Fills in the passed
  * counter variables, so you can prepare pagelist metadata before calling
  * ceph_encode_locks.
  */
index 74fd2898b2ab43012ab1c3fbf25f619fbd7df3b2..187bf214444da8c8fc9c6a8603b699a258f773f8 100644 (file)
@@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
        num = le32_to_cpu(head->num);
        dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
        head->num = cpu_to_le32(0);
+       msg->front.iov_len = sizeof(*head);
        session->s_num_cap_releases += num;
 
        /* requeue completed messages */
@@ -1553,7 +1554,7 @@ retry:
        *base = ceph_ino(temp->d_inode);
        *plen = len;
        dout("build_path on %p %d built %llx '%.*s'\n",
-            dentry, dentry->d_count, *base, len, path);
+            dentry, d_count(dentry), *base, len, path);
        return path;
 }
 
@@ -2454,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        spin_lock(&ci->i_ceph_lock);
        cap->seq = 0;        /* reset cap seq */
        cap->issue_seq = 0;  /* and issue_seq */
+       cap->mseq = 0;       /* and migrate_seq */
 
        if (recon_state->flock) {
                rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -3040,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        fsc->mdsc = mdsc;
        mutex_init(&mdsc->mutex);
        mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
-       if (mdsc->mdsmap == NULL)
+       if (mdsc->mdsmap == NULL) {
+               kfree(mdsc);
                return -ENOMEM;
+       }
 
        init_completion(&mdsc->safe_umount_waiters);
        init_waitqueue_head(&mdsc->session_close_wq);
index 9278dec9e9400aa222c6e73fcaa144405644c056..132b64eeecd494b01d224d1d58394db498ed70a9 100644 (file)
@@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                u32 num_export_targets;
                void *pexport_targets = NULL;
                struct ceph_timespec laggy_since;
+               struct ceph_mds_info *info;
 
                ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
                global_id = ceph_decode_64(p);
@@ -126,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                     i+1, n, global_id, mds, inc,
                     ceph_pr_addr(&addr.in_addr),
                     ceph_mds_state_name(state));
-               if (mds >= 0 && mds < m->m_max_mds && state > 0) {
-                       m->m_info[mds].global_id = global_id;
-                       m->m_info[mds].state = state;
-                       m->m_info[mds].addr = addr;
-                       m->m_info[mds].laggy =
-                               (laggy_since.tv_sec != 0 ||
-                                laggy_since.tv_nsec != 0);
-                       m->m_info[mds].num_export_targets = num_export_targets;
-                       if (num_export_targets) {
-                               m->m_info[mds].export_targets =
-                                       kcalloc(num_export_targets, sizeof(u32),
-                                               GFP_NOFS);
-                               for (j = 0; j < num_export_targets; j++)
-                                       m->m_info[mds].export_targets[j] =
-                                              ceph_decode_32(&pexport_targets);
-                       } else {
-                               m->m_info[mds].export_targets = NULL;
-                       }
+
+               if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+                       continue;
+
+               info = &m->m_info[mds];
+               info->global_id = global_id;
+               info->state = state;
+               info->addr = addr;
+               info->laggy = (laggy_since.tv_sec != 0 ||
+                              laggy_since.tv_nsec != 0);
+               info->num_export_targets = num_export_targets;
+               if (num_export_targets) {
+                       info->export_targets = kcalloc(num_export_targets,
+                                                      sizeof(u32), GFP_NOFS);
+                       if (info->export_targets == NULL)
+                               goto badmem;
+                       for (j = 0; j < num_export_targets; j++)
+                               info->export_targets[j] =
+                                      ceph_decode_32(&pexport_targets);
+               } else {
+                       info->export_targets = NULL;
                }
        }
 
@@ -170,7 +174,7 @@ bad:
                       DUMP_PREFIX_OFFSET, 16, 1,
                       start, end - start, true);
        ceph_mdsmap_destroy(m);
-       return ERR_PTR(-EINVAL);
+       return ERR_PTR(err);
 }
 
 void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
index 7d377c9a5e35a6f05a9224495fb91574acd1d30f..6627b26a800ca0e74649ecf439076bb9c6a4b095 100644 (file)
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
        }
        err = -EINVAL;
        dev_name_end--;         /* back up to ':' separator */
-       if (*dev_name_end != ':') {
+       if (dev_name_end < dev_name || *dev_name_end != ':') {
                pr_err("device name is missing path (no : separator in %s)\n",
                                dev_name);
                goto out;
index 7ccfdb4aea2e008e63f7ba00ab080c0fda2d0d09..cbded572345e77a107e539aa4e433d6f6f7964c0 100644 (file)
@@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
 extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
                             struct ceph_cap_reservation *ctx, int need);
 extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
                               struct ceph_cap_reservation *ctx);
@@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 extern int ceph_inode_holds_cap(struct inode *inode, int mask);
 
 extern int ceph_inode_set_size(struct inode *inode, loff_t size);
-extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
 extern void ceph_queue_vmtruncate(struct inode *inode);
 
 extern void ceph_queue_invalidate(struct inode *inode);
index 9b6b2b6dd164c5fd047f691a68aac03fd8de69d5..be661d8f532adcea4b44d2b42aae52b788d4753e 100644 (file)
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
        if (!ceph_is_valid_xattr(name))
                return -ENODATA;
 
-       spin_lock(&ci->i_ceph_lock);
-       dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
-            ci->i_xattrs.version, ci->i_xattrs.index_version);
 
        /* let's see if a virtual xattr was requested */
        vxattr = ceph_match_vxattr(inode, name);
        if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
                err = vxattr->getxattr_cb(ci, value, size);
-               goto out;
+               return err;
        }
 
+       spin_lock(&ci->i_ceph_lock);
+       dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+            ci->i_xattrs.version, ci->i_xattrs.index_version);
+
        if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
            (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
                goto get_xattr;
index 14a14808320cf1e555d8ca14fdd604590d8b723f..190effc6a6fae2e09f0826979c201c532003b1b8 100644 (file)
@@ -526,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
        if (cii->c_flags & C_FLUSH) 
                coda_flag_inode_children(inode, C_FLUSH);
 
-       if (de->d_count > 1)
+       if (d_count(de) > 1)
                /* pretend it's valid, but don't change the flags */
                goto out;
 
index 64e5323cbbb014c4c2240c733d667b2e704bcdcc..5e7c60c1cb63ff541b855bfe227dbf9fde5b1708 100644 (file)
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
        if (d->d_inode)
                simple_rmdir(parent->d_inode,d);
 
-       pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
+       pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
 
        dput(parent);
 }
index a2f2bb2c256dd24cf8a9e597cbbe7f97a290f282..67e9b6339691f9cc01a378564fa5aa0527da1977 100644 (file)
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
 
        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
        fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
-       BUG_ON(!lower_dentry->d_count);
+       BUG_ON(!d_count(lower_dentry));
 
        ecryptfs_set_dentry_private(dentry, dentry_info);
        ecryptfs_set_dentry_lower(dentry, lower_dentry);
index b31dbd4c46ad3f1ae9acd41dc274a9aa870656e9..1cb9c7e10c6f22a76b3f781c6beaa5b79acd1c5b 100644 (file)
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
        trace_ext3_sync_file_enter(file, datasync);
 
-       if (inode->i_sb->s_flags & MS_RDONLY)
+       if (inode->i_sb->s_flags & MS_RDONLY) {
+               /* Make sure that we read updated state */
+               smp_rmb();
+               if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+                       return -EROFS;
                return 0;
-
+       }
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret)
                goto out;
index 6356665a74bb006a096023399fe8dca5363f1435..c47f147507227fda66d6b1a59e6922ab60e36ff9 100644 (file)
@@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb)
        if (test_opt (sb, ERRORS_RO)) {
                ext3_msg(sb, KERN_CRIT,
                        "error: remounting filesystem read-only");
+               /*
+                * Make sure updated value of ->s_mount_state will be visible
+                * before ->s_flags update.
+                */
+               smp_wmb();
                sb->s_flags |= MS_RDONLY;
        }
        ext3_commit_super(sb, es, 1);
@@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function,
        ext3_msg(sb, KERN_CRIT,
                "error: remounting filesystem read-only");
        EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
-       sb->s_flags |= MS_RDONLY;
        set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+       /*
+        * Make sure updated value of ->s_mount_state will be visible
+        * before ->s_flags update.
+        */
+       smp_wmb();
+       sb->s_flags |= MS_RDONLY;
+
        if (EXT3_SB(sb)->s_journal)
                journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
index 9d1cd423450d1ba12049a987a0e5478424a33850..62f0d5977c64f3526e0720653fe33039c85355e3 100644 (file)
@@ -610,13 +610,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 {
        struct inode *inode = file_inode(file);
        unsigned long npages = dir_blocks(inode);
-       unsigned int bit_pos = 0, start_bit_pos = 0;
+       unsigned int bit_pos = 0;
        struct f2fs_dentry_block *dentry_blk = NULL;
        struct f2fs_dir_entry *de = NULL;
        struct page *dentry_page = NULL;
        unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
        unsigned char d_type = DT_UNKNOWN;
-       int slots;
 
        bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
 
@@ -625,7 +624,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
                if (IS_ERR(dentry_page))
                        continue;
 
-               start_bit_pos = bit_pos;
                dentry_blk = kmap(dentry_page);
                while (bit_pos < NR_DENTRY_IN_BLOCK) {
                        bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
@@ -634,19 +632,19 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
                        if (bit_pos >= NR_DENTRY_IN_BLOCK)
                                break;
 
-                       ctx->pos += bit_pos - start_bit_pos;
                        de = &dentry_blk->dentry[bit_pos];
                        if (de->file_type < F2FS_FT_MAX)
                                d_type = f2fs_filetype_table[de->file_type];
                        else
                                d_type = DT_UNKNOWN;
                        if (!dir_emit(ctx,
-                                     dentry_blk->filename[bit_pos],
-                                     le16_to_cpu(de->name_len),
-                                     le32_to_cpu(de->ino), d_type))
-                               goto success;
-                       slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
-                       bit_pos += slots;
+                                       dentry_blk->filename[bit_pos],
+                                       le16_to_cpu(de->name_len),
+                                       le32_to_cpu(de->ino), d_type))
+                               goto stop;
+
+                       bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+                       ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
                }
                bit_pos = 0;
                ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
@@ -654,7 +652,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
                f2fs_put_page(dentry_page, 1);
                dentry_page = NULL;
        }
-success:
+stop:
        if (dentry_page && !IS_ERR(dentry_page)) {
                kunmap(dentry_page);
                f2fs_put_page(dentry_page, 1);
index 04e2c1fdb157afeb0bc8fe278a74909cd239126b..b27a3005d78df9fccd74343bd4a9f93f9f6753b1 100644 (file)
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
 #include <linux/hashtable.h>
+#include <linux/percpu.h>
+#include <linux/lglock.h>
 
 #include <asm/uaccess.h>
 
@@ -155,11 +157,13 @@ int lease_break_time = 45;
        for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
 
 /*
- * The global file_lock_list is only used for displaying /proc/locks. Protected
- * by the file_lock_lock.
+ * The global file_lock_list is only used for displaying /proc/locks, so we
+ * keep a list on each CPU, with each list protected by its own spinlock via
+ * the file_lock_lglock. Note that alterations to the list also require that
+ * the relevant i_lock is held.
  */
-static HLIST_HEAD(file_lock_list);
-static DEFINE_SPINLOCK(file_lock_lock);
+DEFINE_STATIC_LGLOCK(file_lock_lglock);
+static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
 
 /*
  * The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -506,20 +510,30 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
        return fl1->fl_owner == fl2->fl_owner;
 }
 
+/* Must be called with the i_lock held! */
 static inline void
 locks_insert_global_locks(struct file_lock *fl)
 {
-       spin_lock(&file_lock_lock);
-       hlist_add_head(&fl->fl_link, &file_lock_list);
-       spin_unlock(&file_lock_lock);
+       lg_local_lock(&file_lock_lglock);
+       fl->fl_link_cpu = smp_processor_id();
+       hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
+       lg_local_unlock(&file_lock_lglock);
 }
 
+/* Must be called with the i_lock held! */
 static inline void
 locks_delete_global_locks(struct file_lock *fl)
 {
-       spin_lock(&file_lock_lock);
+       /*
+        * Avoid taking lock if already unhashed. This is safe since this check
+        * is done while holding the i_lock, and new insertions into the list
+        * also require that it be held.
+        */
+       if (hlist_unhashed(&fl->fl_link))
+               return;
+       lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
        hlist_del_init(&fl->fl_link);
-       spin_unlock(&file_lock_lock);
+       lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
 }
 
 static unsigned long
@@ -1454,7 +1468,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
        if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
                goto out;
        if ((arg == F_WRLCK)
-           && ((dentry->d_count > 1)
+           && ((d_count(dentry) > 1)
                || (atomic_read(&inode->i_count) > 1)))
                goto out;
 
@@ -2243,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
+struct locks_iterator {
+       int     li_cpu;
+       loff_t  li_pos;
+};
+
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
                            loff_t id, char *pfx)
 {
@@ -2316,39 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 
 static int locks_show(struct seq_file *f, void *v)
 {
+       struct locks_iterator *iter = f->private;
        struct file_lock *fl, *bfl;
 
        fl = hlist_entry(v, struct file_lock, fl_link);
 
-       lock_get_status(f, fl, *((loff_t *)f->private), "");
+       lock_get_status(f, fl, iter->li_pos, "");
 
        list_for_each_entry(bfl, &fl->fl_block, fl_block)
-               lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
+               lock_get_status(f, bfl, iter->li_pos, " ->");
 
        return 0;
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-       loff_t *p = f->private;
+       struct locks_iterator *iter = f->private;
 
-       spin_lock(&file_lock_lock);
+       iter->li_pos = *pos + 1;
+       lg_global_lock(&file_lock_lglock);
        spin_lock(&blocked_lock_lock);
-       *p = (*pos + 1);
-       return seq_hlist_start(&file_lock_list, *pos);
+       return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
-       loff_t *p = f->private;
-       ++*p;
-       return seq_hlist_next(v, &file_lock_list, pos);
+       struct locks_iterator *iter = f->private;
+
+       ++iter->li_pos;
+       return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 {
        spin_unlock(&blocked_lock_lock);
-       spin_unlock(&file_lock_lock);
+       lg_global_unlock(&file_lock_lglock);
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2360,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
 
 static int locks_open(struct inode *inode, struct file *filp)
 {
-       return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
+       return seq_open_private(filp, &locks_seq_operations,
+                                       sizeof(struct locks_iterator));
 }
 
 static const struct file_operations proc_locks_operations = {
@@ -2460,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
 
 static int __init filelock_init(void)
 {
+       int i;
+
        filelock_cache = kmem_cache_create("file_lock_cache",
                        sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
+       lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+
+       for_each_possible_cpu(i)
+               INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+
        return 0;
 }
 
index 13ca196385f5faff6c88fd32e00d317cf68dcbd7..b5e80b0af315b065954fe42c2a832e85ff57b23f 100644 (file)
@@ -104,6 +104,15 @@ config NFS_V4_1
 
          If unsure, say N.
 
+config NFS_V4_2
+       bool "NFS client support for NFSv4.2"
+       depends on NFS_V4_1
+       help
+         This option enables support for minor version 2 of the NFSv4 protocol
+         in the kernel's NFS client.
+
+         If unsure, say N.
+
 config PNFS_FILE_LAYOUT
        tristate
        depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
          If the NFS client is unchanged from the upstream kernel, this
          option should be set to the default "kernel.org".
 
+config NFS_V4_SECURITY_LABEL
+       bool
+       depends on NFS_V4_2 && SECURITY
+       default y
+
 config ROOT_NFS
        bool "Root file system on NFS"
        depends on NFS_FS=y && IP_PNP
index cce2c057bd2d61142d63ed6feb6232bbfd29fcab..e0bb048e9576209181fb127d109f9e353e802984 100644 (file)
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y                  := client.o dir.o file.o getroot.o inode.o super.o \
                           direct.o pagelist.o read.o symlink.o unlink.o \
-                          write.o namespace.o mount_clnt.o \
-                          dns_resolve.o cache_lib.o
+                          write.o namespace.o mount_clnt.o
 nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
 nfs-$(CONFIG_SYSCTL)   += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
 obj-$(CONFIG_NFS_V4) += nfsv4.o
 nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
          delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
-         nfs4namespace.o nfs4getroot.o nfs4client.o
+         nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
+nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
 nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
 nfsv4-$(CONFIG_NFS_V4_1)       += nfs4session.o pnfs.o pnfs_dev.o
 
index 434b93ec0970b80ac221fcf78e92087da75c3f42..e242bbf729723d1d45ae0cac7370952167b026cd 100644 (file)
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
        dev->pgbase = 0;
        dev->pglen = PAGE_SIZE * max_pages;
        dev->mincount = 0;
+       dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
 
        dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
-       rc = nfs4_proc_getdeviceinfo(server, dev);
+       rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
        dprintk("%s getdevice info returns %d\n", __func__, rc);
        if (rc) {
                rv = ERR_PTR(rc);
index da6a43d19aa3a04f09eb3b9276bce3decea31ecb..67cd73213168f3f7fd5f30b8f217fd22fcc7d7f2 100644 (file)
@@ -281,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
                        ret = nfs4_callback_up_net(serv, net);
                        break;
                case 1:
+               case 2:
                        ret = nfs41_callback_up_net(serv, net);
                        break;
                default:
index efd54f0a4c468f9c7e7726b751df36c6f94d1d16..84326e9fb47aa476e3f2ad7a072055172b3f6ab0 100644 (file)
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
        OP_CB_WANTS_CANCELLED = 12,
        OP_CB_NOTIFY_LOCK   = 13,
        OP_CB_NOTIFY_DEVICEID = 14,
+/* Callback operations new to NFSv4.2 */
+       OP_CB_OFFLOAD = 15,
        OP_CB_ILLEGAL = 10044,
 };
 
@@ -39,6 +41,7 @@ struct cb_process_state {
        __be32                  drc_status;
        struct nfs_client       *clp;
        u32                     slotid;
+       u32                     minorversion;
        struct net              *net;
 };
 
index 0bc27684ebfa338d0b77c295427c31472ec66049..e6ebc4c38c812c01c79c587862ac9192d721765a 100644 (file)
@@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
        int i;
        __be32 status = htonl(NFS4ERR_BADSESSION);
 
-       clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);
+       clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
+                                        &args->csa_sessionid, cps->minorversion);
        if (clp == NULL)
                goto out;
 
index a35582c9d4440f8fe907192427b5c4f9a9a3b061..f4ccfe6521ec80f80fd4b9096bcc7ed49ab7d795 100644 (file)
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
        if (unlikely(p == NULL))
                return htonl(NFS4ERR_RESOURCE);
        hdr->minorversion = ntohl(*p++);
-       /* Check minor version is zero or one. */
-       if (hdr->minorversion <= 1) {
-               hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
+       /* Check for minor version support */
+       if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
+               hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
        } else {
                pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
                        "illegal minor version %u!\n",
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
 }
 #endif /* CONFIG_NFS_V4_1 */
 
+#ifdef CONFIG_NFS_V4_2
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+       __be32 status = preprocess_nfs41_op(nop, op_nr, op);
+       if (status != htonl(NFS4ERR_OP_ILLEGAL))
+               return status;
+
+       if (op_nr == OP_CB_OFFLOAD)
+               return htonl(NFS4ERR_NOTSUPP);
+       return htonl(NFS4ERR_OP_ILLEGAL);
+}
+#else /* CONFIG_NFS_V4_2 */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+       return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
 static __be32
 preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
 {
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
        return htonl(NFS_OK);
 }
 
-static __be32 process_op(uint32_t minorversion, int nop,
-               struct svc_rqst *rqstp,
+static __be32 process_op(int nop, struct svc_rqst *rqstp,
                struct xdr_stream *xdr_in, void *argp,
                struct xdr_stream *xdr_out, void *resp,
                struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
                return status;
 
        dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
-               __func__, minorversion, nop, op_nr);
+               __func__, cps->minorversion, nop, op_nr);
+
+       switch (cps->minorversion) {
+       case 0:
+               status = preprocess_nfs4_op(op_nr, &op);
+               break;
+       case 1:
+               status = preprocess_nfs41_op(nop, op_nr, &op);
+               break;
+       case 2:
+               status = preprocess_nfs42_op(nop, op_nr, &op);
+               break;
+       default:
+               status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+       }
 
-       status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
-                               preprocess_nfs4_op(op_nr, &op);
        if (status == htonl(NFS4ERR_OP_ILLEGAL))
                op_nr = OP_CB_ILLEGAL;
        if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
                        return rpc_drop_reply;
        }
 
+       cps.minorversion = hdr_arg.minorversion;
        hdr_res.taglen = hdr_arg.taglen;
        hdr_res.tag = hdr_arg.tag;
        if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
                return rpc_system_err;
 
        while (status == 0 && nops != hdr_arg.nops) {
-               status = process_op(hdr_arg.minorversion, nops, rqstp,
-                                   &xdr_in, argp, &xdr_out, resp, &cps);
+               status = process_op(nops, rqstp, &xdr_in,
+                                   argp, &xdr_out, resp, &cps);
                nops++;
        }
 
index c513b0cc835f9f32db99def2a6868fdf7848e5fe..340b1eff02679ad3485f51f6c526ba3363b2ef53 100644 (file)
@@ -753,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server,
                        data->timeo, data->retrans);
        if (data->flags & NFS_MOUNT_NORESVPORT)
                set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
-       if (server->options & NFS_OPTION_MIGRATION)
-               set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
 
        /* Allocate or find a client reference we can use */
        clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1076,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
        }
 
        if (!(fattr->valid & NFS_ATTR_FATTR)) {
-               error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr);
+               error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
                if (error < 0) {
                        dprintk("nfs_create_server: getattr error = %d\n", -error);
                        goto error;
index d7ed697133f0b6c6ac667c4e3e334646a6a67b63..0fac2cb1ea18648d8befee687f3086f771b8b74a 100644 (file)
@@ -437,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
        struct dentry *alias;
        struct inode *dir = parent->d_inode;
        struct inode *inode;
+       int status;
 
        if (filename.name[0] == '.') {
                if (filename.len == 1)
@@ -449,7 +450,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
        dentry = d_lookup(parent, &filename);
        if (dentry != NULL) {
                if (nfs_same_file(dentry, entry)) {
-                       nfs_refresh_inode(dentry->d_inode, entry->fattr);
+                       status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
+                       if (!status)
+                               nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
                        goto out;
                } else {
                        if (d_invalidate(dentry) != 0)
@@ -462,7 +465,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
        if (dentry == NULL)
                return;
 
-       inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+       inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
        if (IS_ERR(inode))
                goto out;
 
@@ -587,10 +590,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
        if (entry.fh == NULL || entry.fattr == NULL)
                goto out;
 
+       entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+       if (IS_ERR(entry.label)) {
+               status = PTR_ERR(entry.label);
+               goto out;
+       }
+
        array = nfs_readdir_get_array(page);
        if (IS_ERR(array)) {
                status = PTR_ERR(array);
-               goto out;
+               goto out_label_free;
        }
        memset(array, 0, sizeof(struct nfs_cache_array));
        array->eof_index = -1;
@@ -616,6 +625,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
        nfs_readdir_free_large_page(pages_ptr, pages, array_size);
 out_release_array:
        nfs_readdir_release_array(page);
+out_label_free:
+       nfs4_label_free(entry.label);
 out:
        nfs_free_fattr(entry.fattr);
        nfs_free_fhandle(entry.fh);
@@ -1040,6 +1051,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
        struct dentry *parent;
        struct nfs_fh *fhandle = NULL;
        struct nfs_fattr *fattr = NULL;
+       struct nfs4_label *label = NULL;
        int error;
 
        if (flags & LOOKUP_RCU)
@@ -1082,7 +1094,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
        if (fhandle == NULL || fattr == NULL)
                goto out_error;
 
-       error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+       label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+       if (IS_ERR(label))
+               goto out_error;
+
+       error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
        if (error)
                goto out_bad;
        if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1090,8 +1106,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
        if ((error = nfs_refresh_inode(inode, fattr)) != 0)
                goto out_bad;
 
+       nfs_setsecurity(inode, fattr, label);
+
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fhandle);
+       nfs4_label_free(label);
+
 out_set_verifier:
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
@@ -1108,6 +1128,7 @@ out_zap_parent:
  out_bad:
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fhandle);
+       nfs4_label_free(label);
        nfs_mark_for_revalidate(dir);
        if (inode && S_ISDIR(inode->i_mode)) {
                /* Purge readdir caches. */
@@ -1128,6 +1149,7 @@ out_zap_parent:
 out_error:
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fhandle);
+       nfs4_label_free(label);
        dput(parent);
        dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
                        __func__, dentry->d_parent->d_name.name,
@@ -1256,6 +1278,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
        struct inode *inode = NULL;
        struct nfs_fh *fhandle = NULL;
        struct nfs_fattr *fattr = NULL;
+       struct nfs4_label *label = NULL;
        int error;
 
        dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1282,17 +1305,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
        if (fhandle == NULL || fattr == NULL)
                goto out;
 
+       label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
+       if (IS_ERR(label))
+               goto out;
+
        parent = dentry->d_parent;
        /* Protect against concurrent sillydeletes */
        nfs_block_sillyrename(parent);
-       error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+       error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
        if (error == -ENOENT)
                goto no_entry;
        if (error < 0) {
                res = ERR_PTR(error);
                goto out_unblock_sillyrename;
        }
-       inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+       inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
        res = ERR_CAST(inode);
        if (IS_ERR(res))
                goto out_unblock_sillyrename;
@@ -1310,6 +1337,7 @@ no_entry:
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_unblock_sillyrename:
        nfs_unblock_sillyrename(parent);
+       nfs4_label_free(label);
 out:
        nfs_free_fattr(fattr);
        nfs_free_fhandle(fhandle);
@@ -1357,18 +1385,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
        int err;
 
-       if (ctx->dentry != dentry) {
-               dput(ctx->dentry);
-               ctx->dentry = dget(dentry);
-       }
-
-       /* If the open_intent is for execute, we have an extra check to make */
-       if (ctx->mode & FMODE_EXEC) {
-               err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
-               if (err < 0)
-                       goto out;
-       }
-
        err = finish_open(file, dentry, do_open, opened);
        if (err)
                goto out;
@@ -1427,13 +1443,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 
        nfs_block_sillyrename(dentry->d_parent);
        inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
-       d_drop(dentry);
+       nfs_unblock_sillyrename(dentry->d_parent);
        if (IS_ERR(inode)) {
-               nfs_unblock_sillyrename(dentry->d_parent);
                put_nfs_open_context(ctx);
                err = PTR_ERR(inode);
                switch (err) {
                case -ENOENT:
+                       d_drop(dentry);
                        d_add(dentry, NULL);
                        break;
                case -EISDIR:
@@ -1449,16 +1465,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
                }
                goto out;
        }
-       res = d_add_unique(dentry, inode);
-       if (res != NULL)
-               dentry = res;
-
-       nfs_unblock_sillyrename(dentry->d_parent);
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-
-       err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
 
-       dput(res);
+       err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
 out:
        return err;
 
@@ -1528,7 +1536,8 @@ no_open:
  * Code common to create, mkdir, and mknod.
  */
 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
-                               struct nfs_fattr *fattr)
+                               struct nfs_fattr *fattr,
+                               struct nfs4_label *label)
 {
        struct dentry *parent = dget_parent(dentry);
        struct inode *dir = parent->d_inode;
@@ -1541,18 +1550,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
        if (dentry->d_inode)
                goto out;
        if (fhandle->size == 0) {
-               error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+               error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
                if (error)
                        goto out_error;
        }
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        if (!(fattr->valid & NFS_ATTR_FATTR)) {
                struct nfs_server *server = NFS_SB(dentry->d_sb);
-               error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
+               error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
                if (error < 0)
                        goto out_error;
        }
-       inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+       inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
        error = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_error;
@@ -1721,7 +1730,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
                dir->i_ino, dentry->d_name.name);
 
        spin_lock(&dentry->d_lock);
-       if (dentry->d_count > 1) {
+       if (d_count(dentry) > 1) {
                spin_unlock(&dentry->d_lock);
                /* Start asynchronous writeout of the inode */
                write_inode_now(dentry->d_inode, 0);
@@ -1866,7 +1875,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
                 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
                 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
-                new_dentry->d_count);
+                d_count(new_dentry));
 
        /*
         * For non-directories, check whether the target is busy and if so,
@@ -1884,7 +1893,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        rehash = new_dentry;
                }
 
-               if (new_dentry->d_count > 2) {
+               if (d_count(new_dentry) > 2) {
                        int err;
 
                        /* copy the target dentry's name */
index 9455270922958f420a7f31101cac1b6ac7b14bfb..fc0f95ec73587f9fbcfa7fca24438da29012b5b9 100644 (file)
@@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
        kfree(ip_addr);
        return ret;
 }
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
 
 #else
 
@@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
                ret = -ESRCH;
        return ret;
 }
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
 
 static struct cache_detail nfs_dns_resolve_template = {
        .owner          = THIS_MODULE,
@@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net)
        cache_destroy_net(nn->nfs_dns_resolve, net);
 }
 
+static int nfs4_dns_net_init(struct net *net)
+{
+       return nfs_dns_resolver_cache_init(net);
+}
+
+static void nfs4_dns_net_exit(struct net *net)
+{
+       nfs_dns_resolver_cache_destroy(net);
+}
+
+static struct pernet_operations nfs4_dns_resolver_ops = {
+       .init = nfs4_dns_net_init,
+       .exit = nfs4_dns_net_exit,
+};
+
 static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
                           void *ptr)
 {
@@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
 
 int nfs_dns_resolver_init(void)
 {
-       return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+       int err;
+
+       err = register_pernet_subsys(&nfs4_dns_resolver_ops);
+       if (err < 0)
+               goto out;
+       err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+       if (err < 0)
+               goto out1;
+       return 0;
+out1:
+       unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+out:
+       return err;
 }
 
 void nfs_dns_resolver_destroy(void)
 {
        rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
+       unregister_pernet_subsys(&nfs4_dns_resolver_ops);
 }
 #endif
index 44efaa8c5f78f38bda40753542740874b50c7155..66984a9aafaad9ad74d81415d5737cbafad81618 100644 (file)
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
                goto out;
        }
 
-       inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
+       inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
        if (IS_ERR(inode)) {
                dprintk("nfs_get_root: get root inode failed\n");
                ret = ERR_CAST(inode);
index c516da5873fd12df0d3c9877b2c3189423c0dec4..c2c4163d56832fd94193c2865faeb64df11e8b74 100644 (file)
@@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
        return desclen;
 }
 
-static ssize_t nfs_idmap_request_key(struct key_type *key_type,
-                                    const char *name, size_t namelen,
-                                    const char *type, void *data,
-                                    size_t data_size, struct idmap *idmap)
+static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
+                                        const char *type, struct idmap *idmap)
 {
-       const struct cred *saved_cred;
-       struct key *rkey;
        char *desc;
-       struct user_key_payload *payload;
+       struct key *rkey;
        ssize_t ret;
 
        ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
        if (ret <= 0)
-               goto out;
+               return ERR_PTR(ret);
+
+       rkey = request_key(&key_type_id_resolver, desc, "");
+       if (IS_ERR(rkey)) {
+               mutex_lock(&idmap->idmap_mutex);
+               rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
+                                               desc, "", 0, idmap);
+               mutex_unlock(&idmap->idmap_mutex);
+       }
+
+       kfree(desc);
+       return rkey;
+}
+
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+                                const char *type, void *data,
+                                size_t data_size, struct idmap *idmap)
+{
+       const struct cred *saved_cred;
+       struct key *rkey;
+       struct user_key_payload *payload;
+       ssize_t ret;
 
        saved_cred = override_creds(id_resolver_cache);
-       if (idmap)
-               rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
-       else
-               rkey = request_key(&key_type_id_resolver, desc, "");
+       rkey = nfs_idmap_request_key(name, namelen, type, idmap);
        revert_creds(saved_cred);
 
-       kfree(desc);
        if (IS_ERR(rkey)) {
                ret = PTR_ERR(rkey);
                goto out;
@@ -316,23 +329,6 @@ out:
        return ret;
 }
 
-static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
-                                const char *type, void *data,
-                                size_t data_size, struct idmap *idmap)
-{
-       ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
-                                           name, namelen, type, data,
-                                           data_size, NULL);
-       if (ret < 0) {
-               mutex_lock(&idmap->idmap_mutex);
-               ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
-                                           name, namelen, type, data,
-                                           data_size, idmap);
-               mutex_unlock(&idmap->idmap_mutex);
-       }
-       return ret;
-}
-
 /* ID -> Name */
 static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
                                     size_t buflen, struct idmap *idmap)
index ce727047ee87786bcf4d20d8573a24a0990e2e62..c93639e6cf6827ed9e9b1bd7320bdc297886e933 100644 (file)
@@ -48,7 +48,6 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
-#include "dns_resolve.h"
 #include "pnfs.h"
 #include "nfs.h"
 #include "netns.h"
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
 
        memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
        if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
-               nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
                nfs_fscache_invalidate(inode);
-       } else {
-               nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
-       }
+               nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+                                       | NFS_INO_INVALID_LABEL
+                                       | NFS_INO_INVALID_DATA
+                                       | NFS_INO_INVALID_ACCESS
+                                       | NFS_INO_INVALID_ACL
+                                       | NFS_INO_REVAL_PAGECACHE;
+       } else
+               nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+                                       | NFS_INO_INVALID_LABEL
+                                       | NFS_INO_INVALID_ACCESS
+                                       | NFS_INO_INVALID_ACL
+                                       | NFS_INO_REVAL_PAGECACHE;
 }
 
 void nfs_zap_caches(struct inode *inode)
@@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
        return 0;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+                                       struct nfs4_label *label)
+{
+       int error;
+
+       if (label == NULL)
+               return;
+
+       if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
+               return;
+
+       if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
+               return;
+
+       if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
+               error = security_inode_notifysecctx(inode, label->label,
+                               label->len);
+               if (error)
+                       printk(KERN_ERR "%s() %s %d "
+                                       "security_inode_notifysecctx() %d\n",
+                                       __func__,
+                                       (char *)label->label,
+                                       label->len, error);
+       }
+}
+
+struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
+{
+       struct nfs4_label *label = NULL;
+       int minor_version = server->nfs_client->cl_minorversion;
+
+       if (minor_version < 2)
+               return label;
+
+       if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+               return label;
+
+       label = kzalloc(sizeof(struct nfs4_label), flags);
+       if (label == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       label->label = kzalloc(NFS4_MAXLABELLEN, flags);
+       if (label->label == NULL) {
+               kfree(label);
+               return ERR_PTR(-ENOMEM);
+       }
+       label->len = NFS4_MAXLABELLEN;
+
+       return label;
+}
+EXPORT_SYMBOL_GPL(nfs4_label_alloc);
+#else
+void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+                                       struct nfs4_label *label)
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(nfs_setsecurity);
+
 /*
  * This is our front-end to iget that looks up inodes by file handle
  * instead of inode number.
  */
 struct inode *
-nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
 {
        struct nfs_find_desc desc = {
                .fh     = fh,
@@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                         */
                        inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
                }
+
+               nfs_setsecurity(inode, fattr, label);
+
                nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
                nfsi->attrtimeo_timestamp = now;
                nfsi->access_cache = RB_ROOT;
@@ -393,6 +463,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                unlock_new_inode(inode);
        } else
                nfs_refresh_inode(inode, fattr);
+               nfs_setsecurity(inode, fattr, label);
        dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
                inode->i_sb->s_id,
                (long long)NFS_FILEID(inode),
@@ -449,7 +520,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
                NFS_PROTO(inode)->return_delegation(inode);
        error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
        if (error == 0)
-               nfs_refresh_inode(inode, fattr);
+               error = nfs_refresh_inode(inode, fattr);
        nfs_free_fattr(fattr);
 out:
        return error;
@@ -713,16 +784,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
  */
-void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
 {
-       struct inode *inode = file_inode(filp);
+       struct inode *inode = ctx->dentry->d_inode;
        struct nfs_inode *nfsi = NFS_I(inode);
 
-       filp->private_data = get_nfs_open_context(ctx);
        spin_lock(&inode->i_lock);
        list_add(&ctx->list, &nfsi->open_files);
        spin_unlock(&inode->i_lock);
 }
+EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
+
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+       filp->private_data = get_nfs_open_context(ctx);
+       if (list_empty(&ctx->list))
+               nfs_inode_attach_open_context(ctx);
+}
 EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
 
 /*
@@ -748,10 +826,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 
 static void nfs_file_clear_open_context(struct file *filp)
 {
-       struct inode *inode = file_inode(filp);
        struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
        if (ctx) {
+               struct inode *inode = ctx->dentry->d_inode;
+
                filp->private_data = NULL;
                spin_lock(&inode->i_lock);
                list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -790,6 +869,7 @@ int
 __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
        int              status = -ESTALE;
+       struct nfs4_label *label = NULL;
        struct nfs_fattr *fattr = NULL;
        struct nfs_inode *nfsi = NFS_I(inode);
 
@@ -807,7 +887,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                goto out;
 
        nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
-       status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
+
+       label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+       if (IS_ERR(label)) {
+               status = PTR_ERR(label);
+               goto out;
+       }
+
+       status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
        if (status != 0) {
                dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
                         inode->i_sb->s_id,
@@ -817,7 +904,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                        if (!S_ISDIR(inode->i_mode))
                                set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
                }
-               goto out;
+               goto err_out;
        }
 
        status = nfs_refresh_inode(inode, fattr);
@@ -825,7 +912,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
                         inode->i_sb->s_id,
                         (long long)NFS_FILEID(inode), status);
-               goto out;
+               goto err_out;
        }
 
        if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -835,7 +922,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                inode->i_sb->s_id,
                (long long)NFS_FILEID(inode));
 
- out:
+err_out:
+       nfs4_label_free(label);
+out:
        nfs_free_fattr(fattr);
        return status;
 }
@@ -863,7 +952,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
  */
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
-       if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
+       if (!(NFS_I(inode)->cache_validity &
+                       (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
                        && !nfs_attribute_cache_expired(inode))
                return NFS_STALE(inode) ? -ESTALE : 0;
        return __nfs_revalidate_inode(server, inode);
@@ -1243,6 +1333,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        spin_lock(&inode->i_lock);
        status = nfs_post_op_update_inode_locked(inode, fattr);
        spin_unlock(&inode->i_lock);
+
        return status;
 }
 EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1483,7 +1574,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                inode->i_blocks = fattr->du.nfs2.blocks;
 
        /* Update attrtimeo value if we're out of the unstable period */
-       if (invalid & NFS_INO_INVALID_ATTR) {
+       if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
                nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
                nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
                nfsi->attrtimeo_timestamp = now;
@@ -1496,6 +1587,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                }
        }
        invalid &= ~NFS_INO_INVALID_ATTR;
+       invalid &= ~NFS_INO_INVALID_LABEL;
        /* Don't invalidate the data if we were to blame */
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
                                || S_ISLNK(inode->i_mode)))
@@ -1638,12 +1730,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
 static int nfs_net_init(struct net *net)
 {
        nfs_clients_init(net);
-       return nfs_dns_resolver_cache_init(net);
+       return 0;
 }
 
 static void nfs_net_exit(struct net *net)
 {
-       nfs_dns_resolver_cache_destroy(net);
        nfs_cleanup_cb_ident_idr(net);
 }
 
@@ -1661,10 +1752,6 @@ static int __init init_nfs_fs(void)
 {
        int err;
 
-       err = nfs_dns_resolver_init();
-       if (err < 0)
-               goto out10;;
-
        err = register_pernet_subsys(&nfs_net_ops);
        if (err < 0)
                goto out9;
@@ -1730,8 +1817,6 @@ out7:
 out8:
        unregister_pernet_subsys(&nfs_net_ops);
 out9:
-       nfs_dns_resolver_destroy();
-out10:
        return err;
 }
 
@@ -1744,7 +1829,6 @@ static void __exit exit_nfs_fs(void)
        nfs_destroy_nfspagecache();
        nfs_fscache_unregister();
        unregister_pernet_subsys(&nfs_net_ops);
-       nfs_dns_resolver_destroy();
 #ifdef CONFIG_PROC_FS
        rpc_proc_unregister(&init_net, "nfs");
 #endif
index 91e59a39fc08dcfd3b6b788cd9a3610e5d4b8984..3c8373f90ab3150f2530a795b977c1489a344771 100644 (file)
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
 extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
 extern struct nfs_client *
 nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
-                               struct nfs4_sessionid *);
+                               struct nfs4_sessionid *, u32);
 extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
                                        struct nfs_subversion *);
 extern struct nfs_server *nfs4_create_server(
@@ -255,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
 extern const u32 nfs41_maxwrite_overhead;
+extern const u32 nfs41_maxgetdevinfo_overhead;
 #endif
 
 /* nfs4proc.c */
index 91a6faf811accb8e8e35ab6c3644e16058356a2a..99a45283b9ee5abfe2bb680c51c2b4805f459d2b 100644 (file)
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
  * nfs_mount - Obtain an NFS file handle for the given host and path
  * @info: pointer to mount request arguments
  *
- * Uses default timeout parameters specified by underlying transport.
+ * Uses default timeout parameters specified by underlying transport. On
+ * successful return, the auth_flavs list and auth_flav_len will be populated
+ * with the list from the server or a faked-up list if the server didn't
+ * provide one.
  */
 int nfs_mount(struct nfs_mount_request *info)
 {
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
        dprintk("NFS: MNT request succeeded\n");
        status = 0;
 
+       /*
+        * If the server didn't provide a flavor list, allow the
+        * client to try any flavor.
+        */
+       if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
+               dprintk("NFS: Faking up auth_flavs list\n");
+               info->auth_flavs[0] = RPC_AUTH_NULL;
+               *info->auth_flav_len = 1;
+       }
 out:
        return status;
 
index fc8dc20fdeb9c90274d5acb8ef2bb831af297425..348b535cd7866d9e18cfa6f9412b650266244f0a 100644 (file)
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
        struct dentry *parent = dget_parent(dentry);
 
        /* Look it up again to get its attributes */
-       err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr);
+       err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
        dput(parent);
        if (err != 0)
                return ERR_PTR(err);
index ce90eb4775c2f888547bd98911062f8eede4af97..f5c84c3efbca24df1ee53947a86dfe28c992cf46 100644 (file)
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
  */
 static int
 nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
-               struct nfs_fattr *fattr)
+               struct nfs_fattr *fattr, struct nfs4_label *label)
 {
        struct rpc_message msg = {
                .rpc_proc       = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 static int
 nfs3_proc_lookup(struct inode *dir, struct qstr *name,
-                struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+                struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+                struct nfs4_label *label)
 {
        struct nfs3_diropargs   arg = {
                .fh             = NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
        status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
        nfs_post_op_update_inode(dir, data->res.dir_attr);
        if (status == 0)
-               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
        return status;
 }
 
index a1dd768d0a350fd93498ed7a96f068a070df35b7..ee81e354bce7a9d7fbc36023c993fe312ef3e255 100644 (file)
@@ -194,7 +194,7 @@ struct nfs4_state_recovery_ops {
        int (*recover_lock)(struct nfs4_state *, struct file_lock *);
        int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
        struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
-       int (*reclaim_complete)(struct nfs_client *);
+       int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
        int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
                struct rpc_cred *);
 };
@@ -303,10 +303,10 @@ is_ds_client(struct nfs_client *clp)
 extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 
 extern const u32 nfs4_fattr_bitmap[3];
-extern const u32 nfs4_statfs_bitmap[2];
-extern const u32 nfs4_pathconf_bitmap[2];
+extern const u32 nfs4_statfs_bitmap[3];
+extern const u32 nfs4_pathconf_bitmap[3];
 extern const u32 nfs4_fsinfo_bitmap[3];
-extern const u32 nfs4_fs_locations_bitmap[2];
+extern const u32 nfs4_fs_locations_bitmap[3];
 
 void nfs4_free_client(struct nfs_client *);
 
index 4cbad5d6b276f8c481984c166869e2679c2e241e..90dce91dd5b5c7aa61a7a17c34242619bbfc5b8d 100644 (file)
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
        if (err)
                goto error;
 
+       if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
+               err = -EINVAL;
+               goto error;
+       }
+
        spin_lock_init(&clp->cl_lock);
        INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
        rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -562,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
  */
 struct nfs_client *
 nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
-                          struct nfs4_sessionid *sid)
+                          struct nfs4_sessionid *sid, u32 minorversion)
 {
        struct nfs_client *clp;
        struct nfs_net *nn = net_generic(net, nfs_net_id);
 
        spin_lock(&nn->nfs_client_lock);
        list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
-               if (nfs4_cb_match_client(addr, clp, 1) == false)
+               if (nfs4_cb_match_client(addr, clp, minorversion) == false)
                        continue;
 
                if (!nfs4_has_session(clp))
@@ -592,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
 
 struct nfs_client *
 nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
-                          struct nfs4_sessionid *sid)
+                          struct nfs4_sessionid *sid, u32 minorversion)
 {
        return NULL;
 }
@@ -626,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server,
 
        if (server->flags & NFS_MOUNT_NORESVPORT)
                set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+       if (server->options & NFS_OPTION_MIGRATION)
+               set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
 
        /* Allocate or find a client reference we can use */
        clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -730,7 +737,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
                return -ENOMEM;
 
        /* We must ensure the session is initialised first */
-       error = nfs4_init_session(server);
+       error = nfs4_init_session(server->nfs_client);
        if (error < 0)
                goto out;
 
index 13e6bb3e3fe59c03b1430f118a2afd4033c19a1d..e5b804dd944c16a8adf4de17ee6588562cec55e8 100644 (file)
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
                        goto out_drop;
                }
        }
-       iput(inode);
        if (inode != dentry->d_inode)
                goto out_drop;
 
index 22d10623f5ee3b591de10631044cf90cef4d353a..17ed87ef9de809cf76f7ea6e4af4e2b0f3f3f614 100644 (file)
@@ -643,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
        d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
                                   NFS_SERVER(lo->plh_inode)->nfs_client, id);
        if (d == NULL) {
-               dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags);
+               dsaddr = filelayout_get_device_info(lo->plh_inode, id,
+                               lo->plh_lc_cred, gfp_flags);
                if (dsaddr == NULL)
                        goto out;
        } else
index 235ff952d3c8620cfe9e5786abf1f3b41f364e84..cebd20e7e923a28d475f6de9df5006cc93d07cb9 100644 (file)
@@ -150,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
+filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
+               struct rpc_cred *cred, gfp_t gfp_flags);
 
 #endif /* FS_NFS_NFS4FILELAYOUT_H */
index 661a0f6112156ce554be5124c4cadec2294ce6cb..95604f64cab86632d7a166ce588ea9ffd5d87e95 100644 (file)
@@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
  * of available devices, and return it.
  */
 struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
+filelayout_get_device_info(struct inode *inode,
+               struct nfs4_deviceid *dev_id,
+               struct rpc_cred *cred,
+               gfp_t gfp_flags)
 {
        struct pnfs_device *pdev = NULL;
        u32 max_resp_sz;
@@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
        pdev->pgbase = 0;
        pdev->pglen = max_resp_sz;
        pdev->mincount = 0;
+       pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
 
-       rc = nfs4_proc_getdeviceinfo(server, pdev);
+       rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
        dprintk("%s getdevice info returns %d\n", __func__, rc);
        if (rc)
                goto out_free;
index 28241a42f363581341b3ed81de137fc0da1b428e..cf11799297c42d50dc33aca126e6da11f033ce4b 100644 (file)
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
-static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *);
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                            struct nfs_fattr *fattr, struct iattr *sattr,
-                           struct nfs4_state *state);
+                           struct nfs4_state *state, struct nfs4_label *ilabel,
+                           struct nfs4_label *olabel);
 #ifdef CONFIG_NFS_V4_1
-static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
+               struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
+               struct rpc_cred *);
 #endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+       struct iattr *sattr, struct nfs4_label *label)
+{
+       int err;
+
+       if (label == NULL)
+               return NULL;
+
+       if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
+               return NULL;
+
+       if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
+               return NULL;
+
+       err = security_dentry_init_security(dentry, sattr->ia_mode,
+                               &dentry->d_name, (void **)&label->label, &label->len);
+       if (err == 0)
+               return label;
+
+       return NULL;
+}
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+       if (label)
+               security_release_secctx(label->label, label->len);
+}
+static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+       if (label)
+               return server->attr_bitmask;
+
+       return server->attr_bitmask_nl;
+}
+#else
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+       struct iattr *sattr, struct nfs4_label *l)
+{ return NULL; }
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{ return; }
+static inline u32 *
+nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{ return server->attr_bitmask; }
+#endif
+
 /* Prevent leaks of NFSv4 errors into userland */
 static int nfs4_map_errors(int err)
 {
@@ -134,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
        | FATTR4_WORD1_SPACE_USED
        | FATTR4_WORD1_TIME_ACCESS
        | FATTR4_WORD1_TIME_METADATA
-       | FATTR4_WORD1_TIME_MODIFY
+       | FATTR4_WORD1_TIME_MODIFY,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+       FATTR4_WORD2_SECURITY_LABEL
+#endif
 };
 
 static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -161,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
        | FATTR4_WORD0_FILEID,
 };
 
-const u32 nfs4_statfs_bitmap[2] = {
+const u32 nfs4_statfs_bitmap[3] = {
        FATTR4_WORD0_FILES_AVAIL
        | FATTR4_WORD0_FILES_FREE
        | FATTR4_WORD0_FILES_TOTAL,
@@ -170,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
        | FATTR4_WORD1_SPACE_TOTAL
 };
 
-const u32 nfs4_pathconf_bitmap[2] = {
+const u32 nfs4_pathconf_bitmap[3] = {
        FATTR4_WORD0_MAXLINK
        | FATTR4_WORD0_MAXNAME,
        0
@@ -185,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
                        FATTR4_WORD2_LAYOUT_BLKSIZE
 };
 
-const u32 nfs4_fs_locations_bitmap[2] = {
+const u32 nfs4_fs_locations_bitmap[3] = {
        FATTR4_WORD0_TYPE
        | FATTR4_WORD0_CHANGE
        | FATTR4_WORD0_SIZE
@@ -201,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
        | FATTR4_WORD1_TIME_ACCESS
        | FATTR4_WORD1_TIME_METADATA
        | FATTR4_WORD1_TIME_MODIFY
-       | FATTR4_WORD1_MOUNTED_ON_FILEID
+       | FATTR4_WORD1_MOUNTED_ON_FILEID,
 };
 
 static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -762,6 +818,7 @@ struct nfs4_opendata {
        struct nfs4_string owner_name;
        struct nfs4_string group_name;
        struct nfs_fattr f_attr;
+       struct nfs4_label *f_label;
        struct dentry *dir;
        struct dentry *dentry;
        struct nfs4_state_owner *owner;
@@ -807,6 +864,7 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
 static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 {
        p->o_res.f_attr = &p->f_attr;
+       p->o_res.f_label = p->f_label;
        p->o_res.seqid = p->o_arg.seqid;
        p->c_res.seqid = p->c_arg.seqid;
        p->o_res.server = p->o_arg.server;
@@ -818,6 +876,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
                struct nfs4_state_owner *sp, fmode_t fmode, int flags,
                const struct iattr *attrs,
+               struct nfs4_label *label,
                enum open_claim_type4 claim,
                gfp_t gfp_mask)
 {
@@ -829,9 +888,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
        p = kzalloc(sizeof(*p), gfp_mask);
        if (p == NULL)
                goto err;
+
+       p->f_label = nfs4_label_alloc(server, gfp_mask);
+       if (IS_ERR(p->f_label))
+               goto err_free_p;
+
        p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
        if (p->o_arg.seqid == NULL)
-               goto err_free;
+               goto err_free_label;
        nfs_sb_active(dentry->d_sb);
        p->dentry = dget(dentry);
        p->dir = parent;
@@ -852,8 +916,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
        p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
        p->o_arg.name = &dentry->d_name;
        p->o_arg.server = server;
-       p->o_arg.bitmask = server->attr_bitmask;
+       p->o_arg.bitmask = nfs4_bitmask(server, label);
        p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
+       p->o_arg.label = label;
        p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
        switch (p->o_arg.claim) {
        case NFS4_OPEN_CLAIM_NULL:
@@ -884,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
        nfs4_init_opendata_res(p);
        kref_init(&p->kref);
        return p;
-err_free:
+
+err_free_label:
+       nfs4_label_free(p->f_label);
+err_free_p:
        kfree(p);
 err:
        dput(parent);
@@ -901,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
        if (p->state != NULL)
                nfs4_put_open_state(p->state);
        nfs4_put_state_owner(p->owner);
+
+       nfs4_label_free(p->f_label);
+
        dput(p->dir);
        dput(p->dentry);
        nfs_sb_deactive(sb);
@@ -1179,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
        if (ret)
                goto err;
 
+       nfs_setsecurity(inode, &data->f_attr, data->f_label);
+
        if (data->o_res.delegation_type != 0)
                nfs4_opendata_check_deleg(data, state);
        update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1205,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
        ret = -EAGAIN;
        if (!(data->f_attr.valid & NFS_ATTR_FATTR))
                goto err;
-       inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
+       inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
        ret = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto err;
@@ -1258,7 +1331,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
        struct nfs4_opendata *opendata;
 
        opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
-                       NULL, claim, GFP_NOFS);
+                       NULL, NULL, claim, GFP_NOFS);
        if (opendata == NULL)
                return ERR_PTR(-ENOMEM);
        opendata->state = state;
@@ -1784,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
                        return status;
        }
        if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
-               _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr);
+               _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
        return 0;
 }
 
@@ -1855,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 {
        struct nfs_server *server = NFS_SERVER(state->inode);
        nfs4_stateid *stateid = &state->stateid;
-       int status;
+       struct nfs_delegation *delegation;
+       struct rpc_cred *cred = NULL;
+       int status = -NFS4ERR_BAD_STATEID;
 
        /* If a state reset has been done, test_stateid is unneeded */
        if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
                return;
 
-       status = nfs41_test_stateid(server, stateid);
+       /* Get the delegation credential for use by test/free_stateid */
+       rcu_read_lock();
+       delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+       if (delegation != NULL &&
+           nfs4_stateid_match(&delegation->stateid, stateid)) {
+               cred = get_rpccred(delegation->cred);
+               rcu_read_unlock();
+               status = nfs41_test_stateid(server, stateid, cred);
+       } else
+               rcu_read_unlock();
+
        if (status != NFS_OK) {
                /* Free the stateid unless the server explicitly
                 * informs us the stateid is unrecognized. */
                if (status != -NFS4ERR_BAD_STATEID)
-                       nfs41_free_stateid(server, stateid);
+                       nfs41_free_stateid(server, stateid, cred);
                nfs_remove_bad_delegation(state->inode);
 
                write_seqlock(&state->seqlock);
@@ -1874,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
                write_sequnlock(&state->seqlock);
                clear_bit(NFS_DELEGATED_STATE, &state->flags);
        }
+
+       if (cred != NULL)
+               put_rpccred(cred);
 }
 
 /**
@@ -1888,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
 {
        struct nfs_server *server = NFS_SERVER(state->inode);
        nfs4_stateid *stateid = &state->open_stateid;
+       struct rpc_cred *cred = state->owner->so_cred;
        int status;
 
        /* If a state reset has been done, test_stateid is unneeded */
@@ -1896,12 +1985,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
            (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
                return -NFS4ERR_BAD_STATEID;
 
-       status = nfs41_test_stateid(server, stateid);
+       status = nfs41_test_stateid(server, stateid, cred);
        if (status != NFS_OK) {
                /* Free the stateid unless the server explicitly
                 * informs us the stateid is unrecognized. */
                if (status != -NFS4ERR_BAD_STATEID)
-                       nfs41_free_stateid(server, stateid);
+                       nfs41_free_stateid(server, stateid, cred);
 
                clear_bit(NFS_O_RDONLY_STATE, &state->flags);
                clear_bit(NFS_O_WRONLY_STATE, &state->flags);
@@ -1942,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
                fmode_t fmode,
                int flags,
-               struct nfs4_state **res)
+               struct nfs_open_context *ctx)
 {
        struct nfs4_state_owner *sp = opendata->owner;
        struct nfs_server *server = sp->so_server;
+       struct dentry *dentry;
        struct nfs4_state *state;
        unsigned int seq;
        int ret;
@@ -1963,13 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
        if (server->caps & NFS_CAP_POSIX_LOCK)
                set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 
+       dentry = opendata->dentry;
+       if (dentry->d_inode == NULL) {
+               /* FIXME: Is this d_drop() ever needed? */
+               d_drop(dentry);
+               dentry = d_add_unique(dentry, igrab(state->inode));
+               if (dentry == NULL) {
+                       dentry = opendata->dentry;
+               } else if (dentry != ctx->dentry) {
+                       dput(ctx->dentry);
+                       ctx->dentry = dget(dentry);
+               }
+               nfs_set_verifier(dentry,
+                               nfs_save_change_attribute(opendata->dir->d_inode));
+       }
+
        ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
        if (ret != 0)
                goto out;
 
-       if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
-               nfs4_schedule_stateid_recovery(server, state);
-       *res = state;
+       ctx->state = state;
+       if (dentry->d_inode == state->inode) {
+               nfs_inode_attach_open_context(ctx);
+               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+                       nfs4_schedule_stateid_recovery(server, state);
+       }
 out:
        return ret;
 }
@@ -1978,19 +2086,21 @@ out:
  * Returns a referenced nfs4_state
  */
 static int _nfs4_do_open(struct inode *dir,
-                       struct dentry *dentry,
-                       fmode_t fmode,
+                       struct nfs_open_context *ctx,
                        int flags,
                        struct iattr *sattr,
-                       struct rpc_cred *cred,
-                       struct nfs4_state **res,
-                       struct nfs4_threshold **ctx_th)
+                       struct nfs4_label *label)
 {
        struct nfs4_state_owner  *sp;
        struct nfs4_state     *state = NULL;
        struct nfs_server       *server = NFS_SERVER(dir);
        struct nfs4_opendata *opendata;
+       struct dentry *dentry = ctx->dentry;
+       struct rpc_cred *cred = ctx->cred;
+       struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+       fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
        enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+       struct nfs4_label *olabel = NULL;
        int status;
 
        /* Protect against reboot recovery conflicts */
@@ -2009,22 +2119,31 @@ static int _nfs4_do_open(struct inode *dir,
        if (dentry->d_inode)
                claim = NFS4_OPEN_CLAIM_FH;
        opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
-                       claim, GFP_KERNEL);
+                       label, claim, GFP_KERNEL);
        if (opendata == NULL)
                goto err_put_state_owner;
 
+       if (label) {
+               olabel = nfs4_label_alloc(server, GFP_KERNEL);
+               if (IS_ERR(olabel)) {
+                       status = PTR_ERR(olabel);
+                       goto err_opendata_put;
+               }
+       }
+
        if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
                opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
                if (!opendata->f_attr.mdsthreshold)
-                       goto err_opendata_put;
+                       goto err_free_label;
                opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
        }
        if (dentry->d_inode != NULL)
                opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
 
-       status = _nfs4_open_and_get_state(opendata, fmode, flags, &state);
+       status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
        if (status != 0)
-               goto err_opendata_put;
+               goto err_free_label;
+       state = ctx->state;
 
        if ((opendata->o_arg.open_flags & O_EXCL) &&
            (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
@@ -2033,10 +2152,12 @@ static int _nfs4_do_open(struct inode *dir,
                nfs_fattr_init(opendata->o_res.f_attr);
                status = nfs4_do_setattr(state->inode, cred,
                                opendata->o_res.f_attr, sattr,
-                               state);
-               if (status == 0)
+                               state, label, olabel);
+               if (status == 0) {
                        nfs_setattr_update_inode(state->inode, sattr);
-               nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+                       nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+                       nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+               }
        }
 
        if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -2045,38 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
                kfree(opendata->f_attr.mdsthreshold);
        opendata->f_attr.mdsthreshold = NULL;
 
+       nfs4_label_free(olabel);
+
        nfs4_opendata_put(opendata);
        nfs4_put_state_owner(sp);
-       *res = state;
        return 0;
+err_free_label:
+       nfs4_label_free(olabel);
 err_opendata_put:
        kfree(opendata->f_attr.mdsthreshold);
        nfs4_opendata_put(opendata);
 err_put_state_owner:
        nfs4_put_state_owner(sp);
 out_err:
-       *res = NULL;
        return status;
 }
 
 
 static struct nfs4_state *nfs4_do_open(struct inode *dir,
-                                       struct dentry *dentry,
-                                       fmode_t fmode,
+                                       struct nfs_open_context *ctx,
                                        int flags,
                                        struct iattr *sattr,
-                                       struct rpc_cred *cred,
-                                       struct nfs4_threshold **ctx_th)
+                                       struct nfs4_label *label)
 {
        struct nfs_server *server = NFS_SERVER(dir);
        struct nfs4_exception exception = { };
        struct nfs4_state *res;
        int status;
 
-       fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
        do {
-               status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
-                                      &res, ctx_th);
+               status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+               res = ctx->state;
                if (status == 0)
                        break;
                /* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2122,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 
 static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                            struct nfs_fattr *fattr, struct iattr *sattr,
-                           struct nfs4_state *state)
+                           struct nfs4_state *state, struct nfs4_label *ilabel,
+                           struct nfs4_label *olabel)
 {
        struct nfs_server *server = NFS_SERVER(inode);
         struct nfs_setattrargs  arg = {
@@ -2130,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                 .iap            = sattr,
                .server         = server,
                .bitmask = server->attr_bitmask,
+               .label          = ilabel,
         };
         struct nfs_setattrres  res = {
                .fattr          = fattr,
+               .label          = olabel,
                .server         = server,
         };
         struct rpc_message msg = {
@@ -2146,6 +2269,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        bool truncate;
        int status;
 
+       arg.bitmask = nfs4_bitmask(server, ilabel);
+       if (ilabel)
+               arg.bitmask = nfs4_bitmask(server, olabel);
+
        nfs_fattr_init(fattr);
 
        /* Servers should only apply open mode checks for file size changes */
@@ -2172,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                           struct nfs_fattr *fattr, struct iattr *sattr,
-                          struct nfs4_state *state)
+                          struct nfs4_state *state, struct nfs4_label *ilabel,
+                          struct nfs4_label *olabel)
 {
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs4_exception exception = {
@@ -2181,7 +2309,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        };
        int err;
        do {
-               err = _nfs4_do_setattr(inode, cred, fattr, sattr, state);
+               err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
                switch (err) {
                case -NFS4ERR_OPENMODE:
                        if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2426,14 +2554,18 @@ static struct inode *
 nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
        struct nfs4_state *state;
+       struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+
+       label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
 
        /* Protect against concurrent sillydeletes */
-       state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
-                            ctx->cred, &ctx->mdsthreshold);
+       state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+
+       nfs4_label_release_security(label);
+
        if (IS_ERR(state))
                return ERR_CAST(state);
-       ctx->state = state;
-       return igrab(state->inode);
+       return state->inode;
 }
 
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2489,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
                        server->caps |= NFS_CAP_CTIME;
                if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
                        server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+               if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+                       server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+               memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+                               sizeof(server->attr_bitmask));
 
+               if (server->caps & NFS_CAP_SECURITY_LABEL) {
+                       server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+                       res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+               }
                memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
                server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
                server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2515,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
                struct nfs_fsinfo *info)
 {
+       u32 bitmask[3];
        struct nfs4_lookup_root_arg args = {
-               .bitmask = nfs4_fattr_bitmap,
+               .bitmask = bitmask,
        };
        struct nfs4_lookup_res res = {
                .server = server,
@@ -2529,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
                .rpc_resp = &res,
        };
 
+       bitmask[0] = nfs4_fattr_bitmap[0];
+       bitmask[1] = nfs4_fattr_bitmap[1];
+       /*
+        * Process the label in the upcoming getfattr
+        */
+       bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
        nfs_fattr_init(info->fattr);
        return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 }
@@ -2648,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
 {
        int error;
        struct nfs_fattr *fattr = info->fattr;
+       struct nfs4_label *label = NULL;
 
        error = nfs4_server_capabilities(server, mntfh);
        if (error < 0) {
@@ -2655,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
                return error;
        }
 
-       error = nfs4_proc_getattr(server, mntfh, fattr);
+       label = nfs4_label_alloc(server, GFP_KERNEL);
+       if (IS_ERR(label))
+               return PTR_ERR(label);
+
+       error = nfs4_proc_getattr(server, mntfh, fattr, label);
        if (error < 0) {
                dprintk("nfs4_get_root: getattr error = %d\n", -error);
-               return error;
+               goto err_free_label;
        }
 
        if (fattr->valid & NFS_ATTR_FATTR_FSID &&
            !nfs_fsid_equal(&server->fsid, &fattr->fsid))
                memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
 
+err_free_label:
+       nfs4_label_free(label);
+
        return error;
 }
 
@@ -2711,7 +2869,8 @@ out:
        return status;
 }
 
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+                               struct nfs_fattr *fattr, struct nfs4_label *label)
 {
        struct nfs4_getattr_arg args = {
                .fh = fhandle,
@@ -2719,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
        };
        struct nfs4_getattr_res res = {
                .fattr = fattr,
+               .label = label,
                .server = server,
        };
        struct rpc_message msg = {
@@ -2726,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
                .rpc_argp = &args,
                .rpc_resp = &res,
        };
-       
+
+       args.bitmask = nfs4_bitmask(server, label);
+
        nfs_fattr_init(fattr);
        return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 }
 
-static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+                               struct nfs_fattr *fattr, struct nfs4_label *label)
 {
        struct nfs4_exception exception = { };
        int err;
        do {
                err = nfs4_handle_exception(server,
-                               _nfs4_proc_getattr(server, fhandle, fattr),
+                               _nfs4_proc_getattr(server, fhandle, fattr, label),
                                &exception);
        } while (exception.retry);
        return err;
@@ -2767,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
        struct inode *inode = dentry->d_inode;
        struct rpc_cred *cred = NULL;
        struct nfs4_state *state = NULL;
+       struct nfs4_label *label = NULL;
        int status;
 
        if (pnfs_ld_layoutret_on_setattr(inode))
@@ -2793,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
                }
        }
 
-       status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
-       if (status == 0)
+       label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+       if (IS_ERR(label))
+               return PTR_ERR(label);
+
+       status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+       if (status == 0) {
                nfs_setattr_update_inode(inode, sattr);
+               nfs_setsecurity(inode, fattr, label);
+       }
+       nfs4_label_free(label);
        return status;
 }
 
 static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
                const struct qstr *name, struct nfs_fh *fhandle,
-               struct nfs_fattr *fattr)
+               struct nfs_fattr *fattr, struct nfs4_label *label)
 {
        struct nfs_server *server = NFS_SERVER(dir);
        int                    status;
@@ -2813,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
        struct nfs4_lookup_res res = {
                .server = server,
                .fattr = fattr,
+               .label = label,
                .fh = fhandle,
        };
        struct rpc_message msg = {
@@ -2821,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
                .rpc_resp = &res,
        };
 
+       args.bitmask = nfs4_bitmask(server, label);
+
        nfs_fattr_init(fattr);
 
        dprintk("NFS call  lookup %s\n", name->name);
@@ -2839,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
 
 static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
                                   struct qstr *name, struct nfs_fh *fhandle,
-                                  struct nfs_fattr *fattr)
+                                  struct nfs_fattr *fattr, struct nfs4_label *label)
 {
        struct nfs4_exception exception = { };
        struct rpc_clnt *client = *clnt;
        int err;
        do {
-               err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr);
+               err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
                switch (err) {
                case -NFS4ERR_BADNAME:
                        err = -ENOENT;
@@ -2879,12 +3053,13 @@ out:
 }
 
 static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
-                           struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+                           struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+                           struct nfs4_label *label)
 {
        int status;
        struct rpc_clnt *client = NFS_CLIENT(dir);
 
-       status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+       status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
        if (client != NFS_CLIENT(dir)) {
                rpc_shutdown_client(client);
                nfs_fixup_secinfo_attributes(fattr);
@@ -2899,7 +3074,7 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
        int status;
        struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
 
-       status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+       status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
        if (status < 0) {
                rpc_shutdown_client(client);
                return ERR_PTR(status);
@@ -2924,7 +3099,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
                .rpc_cred = entry->cred,
        };
        int mode = entry->mask;
-       int status;
+       int status = 0;
 
        /*
         * Determine which access bits we want to ask for...
@@ -3029,6 +3204,7 @@ static int
 nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
                 int flags)
 {
+       struct nfs4_label l, *ilabel = NULL;
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
        int status = 0;
@@ -3037,19 +3213,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
 
+       ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
        sattr->ia_mode &= ~current_umask();
-       state = nfs4_do_open(dir, dentry, ctx->mode,
-                       flags, sattr, ctx->cred,
-                       &ctx->mdsthreshold);
-       d_drop(dentry);
+       state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
        if (IS_ERR(state)) {
                status = PTR_ERR(state);
                goto out;
        }
-       d_add(dentry, igrab(state->inode));
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-       ctx->state = state;
 out:
+       nfs4_label_release_security(ilabel);
        put_nfs_open_context(ctx);
        return status;
 }
@@ -3098,6 +3271,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
        res->server = server;
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
        nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+
+       nfs_fattr_init(res->dir_attr);
 }
 
 static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3173,7 +3348,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
                .rpc_resp = &res,
        };
        int status = -ENOMEM;
-       
+
        status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
        if (!status) {
                update_changeattr(old_dir, &res.old_cinfo);
@@ -3207,6 +3382,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
        };
        struct nfs4_link_res res = {
                .server = server,
+               .label = NULL,
        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3219,11 +3395,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
        if (res.fattr == NULL)
                goto out;
 
+       res.label = nfs4_label_alloc(server, GFP_KERNEL);
+       if (IS_ERR(res.label)) {
+               status = PTR_ERR(res.label);
+               goto out;
+       }
+       arg.bitmask = nfs4_bitmask(server, res.label);
+
        status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
        if (!status) {
                update_changeattr(dir, &res.cinfo);
-               nfs_post_op_update_inode(inode, res.fattr);
+               status = nfs_post_op_update_inode(inode, res.fattr);
+               if (!status)
+                       nfs_setsecurity(inode, res.fattr, res.label);
        }
+
+
+       nfs4_label_free(res.label);
+
 out:
        nfs_free_fattr(res.fattr);
        return status;
@@ -3247,6 +3436,7 @@ struct nfs4_createdata {
        struct nfs4_create_res res;
        struct nfs_fh fh;
        struct nfs_fattr fattr;
+       struct nfs4_label *label;
 };
 
 static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3258,6 +3448,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
        if (data != NULL) {
                struct nfs_server *server = NFS_SERVER(dir);
 
+               data->label = nfs4_label_alloc(server, GFP_KERNEL);
+               if (IS_ERR(data->label))
+                       goto out_free;
+
                data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
                data->msg.rpc_argp = &data->arg;
                data->msg.rpc_resp = &data->res;
@@ -3266,13 +3460,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
                data->arg.name = name;
                data->arg.attrs = sattr;
                data->arg.ftype = ftype;
-               data->arg.bitmask = server->attr_bitmask;
+               data->arg.bitmask = nfs4_bitmask(server, data->label);
                data->res.server = server;
                data->res.fh = &data->fh;
                data->res.fattr = &data->fattr;
+               data->res.label = data->label;
                nfs_fattr_init(data->res.fattr);
        }
        return data;
+out_free:
+       kfree(data);
+       return NULL;
 }
 
 static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3281,18 +3479,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
                                    &data->arg.seq_args, &data->res.seq_res, 1);
        if (status == 0) {
                update_changeattr(dir, &data->res.dir_cinfo);
-               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
        }
        return status;
 }
 
 static void nfs4_free_createdata(struct nfs4_createdata *data)
 {
+       nfs4_label_free(data->label);
        kfree(data);
 }
 
 static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
-               struct page *page, unsigned int len, struct iattr *sattr)
+               struct page *page, unsigned int len, struct iattr *sattr,
+               struct nfs4_label *label)
 {
        struct nfs4_createdata *data;
        int status = -ENAMETOOLONG;
@@ -3308,6 +3508,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
        data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
        data->arg.u.symlink.pages = &page;
        data->arg.u.symlink.len = len;
+       data->arg.label = label;
        
        status = nfs4_do_create(dir, dentry, data);
 
@@ -3320,18 +3521,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
                struct page *page, unsigned int len, struct iattr *sattr)
 {
        struct nfs4_exception exception = { };
+       struct nfs4_label l, *label = NULL;
        int err;
+
+       label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
        do {
                err = nfs4_handle_exception(NFS_SERVER(dir),
                                _nfs4_proc_symlink(dir, dentry, page,
-                                                       len, sattr),
+                                                       len, sattr, label),
                                &exception);
        } while (exception.retry);
+
+       nfs4_label_release_security(label);
        return err;
 }
 
 static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
-               struct iattr *sattr)
+               struct iattr *sattr, struct nfs4_label *label)
 {
        struct nfs4_createdata *data;
        int status = -ENOMEM;
@@ -3340,6 +3547,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
        if (data == NULL)
                goto out;
 
+       data->arg.label = label;
        status = nfs4_do_create(dir, dentry, data);
 
        nfs4_free_createdata(data);
@@ -3351,14 +3559,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
                struct iattr *sattr)
 {
        struct nfs4_exception exception = { };
+       struct nfs4_label l, *label = NULL;
        int err;
 
+       label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
        sattr->ia_mode &= ~current_umask();
        do {
                err = nfs4_handle_exception(NFS_SERVER(dir),
-                               _nfs4_proc_mkdir(dir, dentry, sattr),
+                               _nfs4_proc_mkdir(dir, dentry, sattr, label),
                                &exception);
        } while (exception.retry);
+       nfs4_label_release_security(label);
+
        return err;
 }
 
@@ -3416,7 +3629,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 }
 
 static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
-               struct iattr *sattr, dev_t rdev)
+               struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
 {
        struct nfs4_createdata *data;
        int mode = sattr->ia_mode;
@@ -3441,7 +3654,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
                status = -EINVAL;
                goto out_free;
        }
-       
+
+       data->arg.label = label;
        status = nfs4_do_create(dir, dentry, data);
 out_free:
        nfs4_free_createdata(data);
@@ -3453,14 +3667,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
                struct iattr *sattr, dev_t rdev)
 {
        struct nfs4_exception exception = { };
+       struct nfs4_label l, *label = NULL;
        int err;
 
+       label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
        sattr->ia_mode &= ~current_umask();
        do {
                err = nfs4_handle_exception(NFS_SERVER(dir),
-                               _nfs4_proc_mknod(dir, dentry, sattr, rdev),
+                               _nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
                                &exception);
        } while (exception.retry);
+
+       nfs4_label_release_security(label);
+
        return err;
 }
 
@@ -4187,6 +4407,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
        return err;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+                                       size_t buflen)
+{
+       struct nfs_server *server = NFS_SERVER(inode);
+       struct nfs_fattr fattr;
+       struct nfs4_label label = {0, 0, buflen, buf};
+
+       u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+       struct nfs4_getattr_arg args = {
+               .fh             = NFS_FH(inode),
+               .bitmask        = bitmask,
+       };
+       struct nfs4_getattr_res res = {
+               .fattr          = &fattr,
+               .label          = &label,
+               .server         = server,
+       };
+       struct rpc_message msg = {
+               .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+               .rpc_argp       = &args,
+               .rpc_resp       = &res,
+       };
+       int ret;
+
+       nfs_fattr_init(&fattr);
+
+       ret = rpc_call_sync(server->client, &msg, 0);
+       if (ret)
+               return ret;
+       if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
+               return -ENOENT;
+       if (buflen < label.len)
+               return -ERANGE;
+       return 0;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+                                       size_t buflen)
+{
+       struct nfs4_exception exception = { };
+       int err;
+
+       if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+               return -EOPNOTSUPP;
+
+       do {
+               err = nfs4_handle_exception(NFS_SERVER(inode),
+                               _nfs4_get_security_label(inode, buf, buflen),
+                               &exception);
+       } while (exception.retry);
+       return err;
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+               struct nfs4_label *ilabel,
+               struct nfs_fattr *fattr,
+               struct nfs4_label *olabel)
+{
+
+       struct iattr sattr = {0};
+       struct nfs_server *server = NFS_SERVER(inode);
+       const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+       struct nfs_setattrargs args = {
+               .fh             = NFS_FH(inode),
+               .iap            = &sattr,
+               .server         = server,
+               .bitmask        = bitmask,
+               .label          = ilabel,
+       };
+       struct nfs_setattrres res = {
+               .fattr          = fattr,
+               .label          = olabel,
+               .server         = server,
+       };
+       struct rpc_message msg = {
+               .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+               .rpc_argp       = &args,
+               .rpc_resp       = &res,
+       };
+       int status;
+
+       nfs4_stateid_copy(&args.stateid, &zero_stateid);
+
+       status = rpc_call_sync(server->client, &msg, 0);
+       if (status)
+               dprintk("%s failed: %d\n", __func__, status);
+
+       return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+               struct nfs4_label *ilabel,
+               struct nfs_fattr *fattr,
+               struct nfs4_label *olabel)
+{
+       struct nfs4_exception exception = { };
+       int err;
+
+       do {
+               err = nfs4_handle_exception(NFS_SERVER(inode),
+                               _nfs4_do_set_security_label(inode, ilabel,
+                               fattr, olabel),
+                               &exception);
+       } while (exception.retry);
+       return err;
+}
+
+static int
+nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+{
+       struct nfs4_label ilabel, *olabel = NULL;
+       struct nfs_fattr fattr;
+       struct rpc_cred *cred;
+       struct inode *inode = dentry->d_inode;
+       int status;
+
+       if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+               return -EOPNOTSUPP;
+
+       nfs_fattr_init(&fattr);
+
+       ilabel.pi = 0;
+       ilabel.lfs = 0;
+       ilabel.label = (char *)buf;
+       ilabel.len = buflen;
+
+       cred = rpc_lookup_cred();
+       if (IS_ERR(cred))
+               return PTR_ERR(cred);
+
+       olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+       if (IS_ERR(olabel)) {
+               status = -PTR_ERR(olabel);
+               goto out;
+       }
+
+       status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+       if (status == 0)
+               nfs_setsecurity(inode, &fattr, olabel);
+
+       nfs4_label_free(olabel);
+out:
+       put_rpccred(cred);
+       return status;
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
 static int
 nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
@@ -4345,7 +4714,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
        /* cb_client4 */
        rcu_read_lock();
        setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
-                               sizeof(setclientid.sc_netid),
+                               sizeof(setclientid.sc_netid), "%s",
                                rpc_peeraddr2str(clp->cl_rpcclient,
                                                        RPC_DISPLAY_NETID));
        rcu_read_unlock();
@@ -5056,13 +5425,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
 
        list_for_each_entry(lsp, &state->lock_states, ls_locks) {
                if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-                       status = nfs41_test_stateid(server, &lsp->ls_stateid);
+                       struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+                       status = nfs41_test_stateid(server,
+                                       &lsp->ls_stateid,
+                                       cred);
                        if (status != NFS_OK) {
                                /* Free the stateid unless the server
                                 * informs us the stateid is unrecognized. */
                                if (status != -NFS4ERR_BAD_STATEID)
                                        nfs41_free_stateid(server,
-                                                       &lsp->ls_stateid);
+                                                       &lsp->ls_stateid,
+                                                       cred);
                                clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
                                ret = status;
                        }
@@ -5295,6 +5669,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
        return len;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline int nfs4_server_supports_labels(struct nfs_server *server)
+{
+       return server->caps & NFS_CAP_SECURITY_LABEL;
+}
+
+static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
+                                  const void *buf, size_t buflen,
+                                  int flags, int type)
+{
+       if (security_ismaclabel(key))
+               return nfs4_set_security_label(dentry, buf, buflen);
+
+       return -EOPNOTSUPP;
+}
+
+static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
+                                  void *buf, size_t buflen, int type)
+{
+       if (security_ismaclabel(key))
+               return nfs4_get_security_label(dentry->d_inode, buf, buflen);
+       return -EOPNOTSUPP;
+}
+
+static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
+                                      size_t list_len, const char *name,
+                                      size_t name_len, int type)
+{
+       size_t len = 0;
+
+       if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
+               len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
+               if (list && len <= list_len)
+                       security_inode_listsecurity(dentry->d_inode, list, len);
+       }
+       return len;
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+       .prefix = XATTR_SECURITY_PREFIX,
+       .list   = nfs4_xattr_list_nfs4_label,
+       .get    = nfs4_xattr_get_nfs4_label,
+       .set    = nfs4_xattr_set_nfs4_label,
+};
+#endif
+
+
 /*
  * nfs_fhget will use either the mounted_on_fileid or the fileid
  */
@@ -5318,7 +5739,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
                                   struct page *page)
 {
        struct nfs_server *server = NFS_SERVER(dir);
-       u32 bitmask[2] = {
+       u32 bitmask[3] = {
                [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
        };
        struct nfs4_fs_locations_arg args = {
@@ -5505,7 +5926,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
        struct nfs41_exchange_id_args args = {
                .verifier = &verifier,
                .client = clp,
-               .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
+               .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+                       EXCHGID4_FLAG_BIND_PRINC_STATEID,
        };
        struct nfs41_exchange_id_res res = {
                0
@@ -5762,17 +6184,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
  */
 static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 {
-       struct nfs4_session *session = args->client->cl_session;
-       unsigned int mxrqst_sz = session->fc_target_max_rqst_sz,
-                    mxresp_sz = session->fc_target_max_resp_sz;
+       unsigned int max_rqst_sz, max_resp_sz;
+
+       max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
+       max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
 
-       if (mxrqst_sz == 0)
-               mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
-       if (mxresp_sz == 0)
-               mxresp_sz = NFS_MAX_FILE_IO_SIZE;
        /* Fore channel attributes */
-       args->fc_attrs.max_rqst_sz = mxrqst_sz;
-       args->fc_attrs.max_resp_sz = mxresp_sz;
+       args->fc_attrs.max_rqst_sz = max_rqst_sz;
+       args->fc_attrs.max_resp_sz = max_resp_sz;
        args->fc_attrs.max_ops = NFS4_MAX_OPS;
        args->fc_attrs.max_reqs = max_session_slots;
 
@@ -6159,12 +6578,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
 /*
  * Issue a global reclaim complete.
  */
-static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
+               struct rpc_cred *cred)
 {
        struct nfs4_reclaim_complete_data *calldata;
        struct rpc_task *task;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+               .rpc_cred = cred,
        };
        struct rpc_task_setup task_setup_data = {
                .rpc_client = clp->cl_rpcclient,
@@ -6348,6 +6769,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
                .rpc_argp = &lgp->args,
                .rpc_resp = &lgp->res,
+               .rpc_cred = lgp->cred,
        };
        struct rpc_task_setup task_setup_data = {
                .rpc_client = server->client,
@@ -6451,6 +6873,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
                .rpc_argp = &lrp->args,
                .rpc_resp = &lrp->res,
+               .rpc_cred = lrp->cred,
        };
        struct rpc_task_setup task_setup_data = {
                .rpc_client = lrp->clp->cl_rpcclient,
@@ -6520,7 +6943,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
 EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
 
 static int
-_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+_nfs4_proc_getdeviceinfo(struct nfs_server *server,
+               struct pnfs_device *pdev,
+               struct rpc_cred *cred)
 {
        struct nfs4_getdeviceinfo_args args = {
                .pdev = pdev,
@@ -6532,6 +6957,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
                .rpc_argp = &args,
                .rpc_resp = &res,
+               .rpc_cred = cred,
        };
        int status;
 
@@ -6542,14 +6968,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
        return status;
 }
 
-int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+               struct pnfs_device *pdev,
+               struct rpc_cred *cred)
 {
        struct nfs4_exception exception = { };
        int err;
 
        do {
                err = nfs4_handle_exception(server,
-                                       _nfs4_proc_getdeviceinfo(server, pdev),
+                                       _nfs4_proc_getdeviceinfo(server, pdev, cred),
                                        &exception);
        } while (exception.retry);
        return err;
@@ -6733,7 +7161,9 @@ out:
        return err;
 }
 
-static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int _nfs41_test_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
 {
        int status;
        struct nfs41_test_stateid_args args = {
@@ -6744,6 +7174,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
                .rpc_argp = &args,
                .rpc_resp = &res,
+               .rpc_cred = cred,
        };
 
        dprintk("NFS call  test_stateid %p\n", stateid);
@@ -6764,17 +7195,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
  *
  * @server: server / transport on which to perform the operation
  * @stateid: state ID to test
+ * @cred: credential
  *
  * Returns NFS_OK if the server recognizes that "stateid" is valid.
  * Otherwise a negative NFS4ERR value is returned if the operation
  * failed or the state ID is not currently valid.
  */
-static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_test_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
 {
        struct nfs4_exception exception = { };
        int err;
        do {
-               err = _nfs41_test_stateid(server, stateid);
+               err = _nfs41_test_stateid(server, stateid, cred);
                if (err != -NFS4ERR_DELAY)
                        break;
                nfs4_handle_exception(server, err, &exception);
@@ -6823,10 +7257,12 @@ const struct rpc_call_ops nfs41_free_stateid_ops = {
 
 static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
                nfs4_stateid *stateid,
+               struct rpc_cred *cred,
                bool privileged)
 {
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
+               .rpc_cred = cred,
        };
        struct rpc_task_setup task_setup = {
                .rpc_client = server->client,
@@ -6859,16 +7295,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
  *
  * @server: server / transport on which to perform the operation
  * @stateid: state ID to release
+ * @cred: credential
  *
  * Returns NFS_OK if the server freed "stateid".  Otherwise a
  * negative NFS4ERR value is returned.
  */
-static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_free_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               struct rpc_cred *cred)
 {
        struct rpc_task *task;
        int ret;
 
-       task = _nfs41_free_stateid(server, stateid, true);
+       task = _nfs41_free_stateid(server, stateid, cred, true);
        if (IS_ERR(task))
                return PTR_ERR(task);
        ret = rpc_wait_for_completion_task(task);
@@ -6881,8 +7320,9 @@ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
        struct rpc_task *task;
+       struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
 
-       task = _nfs41_free_stateid(server, &lsp->ls_stateid, false);
+       task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
        nfs4_free_lock_state(server, lsp);
        if (IS_ERR(task))
                return PTR_ERR(task);
@@ -7004,11 +7444,33 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 };
 #endif
 
+#if defined(CONFIG_NFS_V4_2)
+static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
+       .minor_version = 2,
+       .init_caps = NFS_CAP_READDIRPLUS
+               | NFS_CAP_ATOMIC_OPEN
+               | NFS_CAP_CHANGE_ATTR
+               | NFS_CAP_POSIX_LOCK
+               | NFS_CAP_STATEID_NFSV41
+               | NFS_CAP_ATOMIC_OPEN_V1,
+       .call_sync = nfs4_call_sync_sequence,
+       .match_stateid = nfs41_match_stateid,
+       .find_root_sec = nfs41_find_root_sec,
+       .free_lock_state = nfs41_free_lock_state,
+       .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+       .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+       .state_renewal_ops = &nfs41_state_renewal_ops,
+};
+#endif
+
 const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
        [0] = &nfs_v4_0_minor_ops,
 #if defined(CONFIG_NFS_V4_1)
        [1] = &nfs_v4_1_minor_ops,
 #endif
+#if defined(CONFIG_NFS_V4_2)
+       [2] = &nfs_v4_2_minor_ops,
+#endif
 };
 
 const struct inode_operations nfs4_dir_inode_operations = {
@@ -7108,6 +7570,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
 
 const struct xattr_handler *nfs4_xattr_handlers[] = {
        &nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+       &nfs4_xattr_nfs4_label_handler,
+#endif
        NULL
 };
 
index c4e225e4a9afc382b1eb689314ec2e7875cc1a45..36e21cb29d65971dff3f1b104d5685a8cae27d83 100644 (file)
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
        return 0;
 }
 
-int nfs4_init_session(struct nfs_server *server)
+int nfs4_init_session(struct nfs_client *clp)
 {
-       struct nfs_client *clp = server->nfs_client;
-       struct nfs4_session *session;
-       unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
-       unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
-
        if (!nfs4_has_session(clp))
                return 0;
 
-       if (server->rsize != 0)
-               target_max_resp_sz = server->rsize;
-       target_max_resp_sz += nfs41_maxread_overhead;
-
-       if (server->wsize != 0)
-               target_max_rqst_sz = server->wsize;
-       target_max_rqst_sz += nfs41_maxwrite_overhead;
-
-       session = clp->cl_session;
-       spin_lock(&clp->cl_lock);
-       if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
-               /* Initialise targets and channel attributes */
-               session->fc_target_max_rqst_sz = target_max_rqst_sz;
-               session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
-               session->fc_target_max_resp_sz = target_max_resp_sz;
-               session->fc_attrs.max_resp_sz = target_max_resp_sz;
-       } else {
-               /* Just adjust the targets */
-               if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
-                       session->fc_target_max_rqst_sz = target_max_rqst_sz;
-                       set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-               }
-               if (target_max_resp_sz > session->fc_target_max_resp_sz) {
-                       session->fc_target_max_resp_sz = target_max_resp_sz;
-                       set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-               }
-       }
-       spin_unlock(&clp->cl_lock);
-
-       if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
-               nfs4_schedule_lease_recovery(clp);
-
+       clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
        return nfs41_check_session_ready(clp);
 }
 
index ff7d9f0f8a65179fbf9bc795c4b1cdbf1ee56bf1..3a153d82b90c638215b5d01c68b68116a454d515 100644 (file)
@@ -66,9 +66,6 @@ struct nfs4_session {
        struct nfs4_channel_attrs       bc_attrs;
        struct nfs4_slot_table          bc_slot_table;
        struct nfs_client               *clp;
-       /* Create session arguments */
-       unsigned int                    fc_target_max_rqst_sz;
-       unsigned int                    fc_target_max_resp_sz;
 };
 
 enum nfs4_session_state {
@@ -89,7 +86,7 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
 
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
 extern void nfs4_destroy_session(struct nfs4_session *session);
-extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_init_session(struct nfs_client *clp);
 extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
 
 extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
@@ -122,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
 
 #else /* defined(CONFIG_NFS_V4_1) */
 
-static inline int nfs4_init_session(struct nfs_server *server)
+static inline int nfs4_init_session(struct nfs_client *clp)
 {
        return 0;
 }
index 55418811a55aba706cf4682bd38f13818d816759..e22862f13564486ab535a437f4d46e6709a30ed9 100644 (file)
@@ -228,19 +228,8 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
        return status;
 }
 
-/*
- * Back channel returns NFS4ERR_DELAY for new requests when
- * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
- * is ended.
- */
-static void nfs4_end_drain_session(struct nfs_client *clp)
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
 {
-       struct nfs4_session *ses = clp->cl_session;
-       struct nfs4_slot_table *tbl;
-
-       if (ses == NULL)
-               return;
-       tbl = &ses->fc_slot_table;
        if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
                spin_lock(&tbl->slot_tbl_lock);
                nfs41_wake_slot_table(tbl);
@@ -248,6 +237,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
        }
 }
 
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+       struct nfs4_session *ses = clp->cl_session;
+
+       if (ses != NULL) {
+               nfs4_end_drain_slot_table(&ses->bc_slot_table);
+               nfs4_end_drain_slot_table(&ses->fc_slot_table);
+       }
+}
+
 /*
  * Signal state manager thread if session fore channel is drained
  */
@@ -1563,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
 }
 
 static void nfs4_reclaim_complete(struct nfs_client *clp,
-                                const struct nfs4_state_recovery_ops *ops)
+                                const struct nfs4_state_recovery_ops *ops,
+                                struct rpc_cred *cred)
 {
        /* Notify the server we're done reclaiming our state */
        if (ops->reclaim_complete)
-               (void)ops->reclaim_complete(clp);
+               (void)ops->reclaim_complete(clp, cred);
 }
 
 static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1612,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 
 static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 {
+       const struct nfs4_state_recovery_ops *ops;
+       struct rpc_cred *cred;
+
        if (!nfs4_state_clear_reclaim_reboot(clp))
                return;
-       nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+       ops = clp->cl_mvops->reboot_recovery_ops;
+       cred = ops->get_clid_cred(clp);
+       nfs4_reclaim_complete(clp, ops, cred);
+       put_rpccred(cred);
 }
 
 static void nfs_delegation_clear_all(struct nfs_client *clp)
index a5e1a3026d489240cb7a44ae87eec07df019ce3a..5dbe2d269210f000132547d3c21daa33e7e1b224 100644 (file)
@@ -9,6 +9,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "dns_resolve.h"
 #include "pnfs.h"
 #include "nfs.h"
 
@@ -331,18 +332,24 @@ static int __init init_nfs_v4(void)
 {
        int err;
 
-       err = nfs_idmap_init();
+       err = nfs_dns_resolver_init();
        if (err)
                goto out;
 
-       err = nfs4_register_sysctl();
+       err = nfs_idmap_init();
        if (err)
                goto out1;
 
+       err = nfs4_register_sysctl();
+       if (err)
+               goto out2;
+
        register_nfs_version(&nfs_v4);
        return 0;
-out1:
+out2:
        nfs_idmap_quit();
+out1:
+       nfs_dns_resolver_destroy();
 out:
        return err;
 }
@@ -352,6 +359,7 @@ static void __exit exit_nfs_v4(void)
        unregister_nfs_version(&nfs_v4);
        nfs4_unregister_sysctl();
        nfs_idmap_quit();
+       nfs_dns_resolver_destroy();
 }
 
 MODULE_LICENSE("GPL");
index 4be8d135ed61b19bc14f511b0013382211a64c0c..0abfb8466e796cb8c0894ae6fbc38180865904f4 100644 (file)
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
 #define nfs4_path_maxsz                (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
 #define nfs4_owner_maxsz       (1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define nfs4_group_maxsz       (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define        nfs4_label_maxsz        (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#define encode_readdir_space 24
+#define encode_readdir_bitmask_sz 3
+#else
+#define        nfs4_label_maxsz        0
+#define encode_readdir_space 20
+#define encode_readdir_bitmask_sz 2
+#endif
 /* We support only one layout type per file system */
 #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
 /* This is based on getfattr, which uses the most attributes: */
 #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
                                3 + 3 + 3 + nfs4_owner_maxsz + \
-                               nfs4_group_maxsz + decode_mdsthreshold_maxsz))
+                               nfs4_group_maxsz + nfs4_label_maxsz + \
+                                decode_mdsthreshold_maxsz))
 #define nfs4_fattr_maxsz       (nfs4_fattr_bitmap_maxsz + \
                                nfs4_fattr_value_maxsz)
 #define decode_getattr_maxsz    (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
                                 1 + 2 + 1 + \
                                nfs4_owner_maxsz + \
                                nfs4_group_maxsz + \
+                               nfs4_label_maxsz + \
                                4 + 4)
 #define encode_savefh_maxsz     (op_encode_hdr_maxsz)
 #define decode_savefh_maxsz     (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
                                 encode_stateid_maxsz + 3)
 #define decode_read_maxsz      (op_decode_hdr_maxsz + 2)
 #define encode_readdir_maxsz   (op_encode_hdr_maxsz + \
-                                2 + encode_verifier_maxsz + 5)
+                                2 + encode_verifier_maxsz + 5 + \
+                               nfs4_label_maxsz)
 #define decode_readdir_maxsz   (op_decode_hdr_maxsz + \
-                                decode_verifier_maxsz)
+                                decode_verifier_maxsz + \
+                               nfs4_label_maxsz + nfs4_fattr_maxsz)
 #define encode_readlink_maxsz  (op_encode_hdr_maxsz)
 #define decode_readlink_maxsz  (op_decode_hdr_maxsz + 1)
 #define encode_write_maxsz     (op_encode_hdr_maxsz + \
@@ -853,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                                     decode_sequence_maxsz +
                                     decode_putfh_maxsz) *
                                    XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+                                          compound_decode_hdr_maxsz +
+                                          decode_sequence_maxsz) *
+                                         XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
 #endif /* CONFIG_NFS_V4_1 */
 
 static const umode_t nfs_type2fmt[] = {
@@ -968,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
        encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
 }
 
-static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+                               const struct nfs4_label *label,
+                               const struct nfs_server *server)
 {
        char owner_name[IDMAP_NAMESZ];
        char owner_group[IDMAP_NAMESZ];
@@ -979,15 +1001,16 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
        int len;
        uint32_t bmval0 = 0;
        uint32_t bmval1 = 0;
+       uint32_t bmval2 = 0;
 
        /*
         * We reserve enough space to write the entire attribute buffer at once.
         * In the worst-case, this would be
-        *   12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
-        *          = 36 bytes, plus any contribution from variable-length fields
+        * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+        * = 40 bytes, plus any contribution from variable-length fields
         *            such as owner/group.
         */
-       len = 16;
+       len = 20;
 
        /* Sigh */
        if (iap->ia_valid & ATTR_SIZE)
@@ -1017,6 +1040,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
                }
                len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
        }
+       if (label)
+               len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
        if (iap->ia_valid & ATTR_ATIME_SET)
                len += 16;
        else if (iap->ia_valid & ATTR_ATIME)
@@ -1031,9 +1056,9 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
         * We write the bitmap length now, but leave the bitmap and the attribute
         * buffer length to be backfilled at the end of this routine.
         */
-       *p++ = cpu_to_be32(2);
+       *p++ = cpu_to_be32(3);
        q = p;
-       p += 3;
+       p += 4;
 
        if (iap->ia_valid & ATTR_SIZE) {
                bmval0 |= FATTR4_WORD0_SIZE;
@@ -1071,6 +1096,13 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
                bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
                *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
        }
+       if (label) {
+               bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
+               *p++ = cpu_to_be32(label->lfs);
+               *p++ = cpu_to_be32(label->pi);
+               *p++ = cpu_to_be32(label->len);
+               p = xdr_encode_opaque_fixed(p, label->label, label->len);
+       }
 
        /*
         * Now we backfill the bitmap and the attribute buffer length.
@@ -1080,9 +1112,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
                                len, ((char *)p - (char *)q) + 4);
                BUG();
        }
-       len = (char *)p - (char *)q - 12;
+       len = (char *)p - (char *)q - 16;
        *q++ = htonl(bmval0);
        *q++ = htonl(bmval1);
+       *q++ = htonl(bmval2);
        *q = htonl(len);
 
 /* out: */
@@ -1136,7 +1169,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
        }
 
        encode_string(xdr, create->name->len, create->name->name);
-       encode_attrs(xdr, create->attrs, create->server);
+       encode_attrs(xdr, create->attrs, create->label, create->server);
 }
 
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1188,8 +1221,10 @@ encode_getattr_three(struct xdr_stream *xdr,
 
 static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
-       encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
-                          bitmask[1] & nfs4_fattr_bitmap[1], hdr);
+       encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+                          bitmask[1] & nfs4_fattr_bitmap[1],
+                          bitmask[2] & nfs4_fattr_bitmap[2],
+                          hdr);
 }
 
 static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1367,11 +1402,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
        switch(arg->createmode) {
        case NFS4_CREATE_UNCHECKED:
                *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
-               encode_attrs(xdr, arg->u.attrs, arg->server);
+               encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
                break;
        case NFS4_CREATE_GUARDED:
                *p = cpu_to_be32(NFS4_CREATE_GUARDED);
-               encode_attrs(xdr, arg->u.attrs, arg->server);
+               encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
                break;
        case NFS4_CREATE_EXCLUSIVE:
                *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1381,7 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
                *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
                encode_nfs4_verifier(xdr, &arg->u.verifier);
                dummy.ia_valid = 0;
-               encode_attrs(xdr, &dummy, arg->server);
+               encode_attrs(xdr, &dummy, arg->label, arg->server);
        }
 }
 
@@ -1532,7 +1567,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-       uint32_t attrs[2] = {
+       uint32_t attrs[3] = {
                FATTR4_WORD0_RDATTR_ERROR,
                FATTR4_WORD1_MOUNTED_ON_FILEID,
        };
@@ -1555,20 +1590,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
        encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
        encode_uint64(xdr, readdir->cookie);
        encode_nfs4_verifier(xdr, &readdir->verifier);
-       p = reserve_space(xdr, 20);
+       p = reserve_space(xdr, encode_readdir_space);
        *p++ = cpu_to_be32(dircount);
        *p++ = cpu_to_be32(readdir->count);
-       *p++ = cpu_to_be32(2);
-
+       *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
        *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
-       *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+       *p   = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+       if (encode_readdir_bitmask_sz > 2) {
+               if (hdr->minorversion > 1)
+                       attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+               p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
+       }
        memcpy(verf, readdir->verifier.data, sizeof(verf));
-       dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
+
+       dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
                        __func__,
                        (unsigned long long)readdir->cookie,
                        verf[0], verf[1],
                        attrs[0] & readdir->bitmask[0],
-                       attrs[1] & readdir->bitmask[1]);
+                       attrs[1] & readdir->bitmask[1],
+                       attrs[2] & readdir->bitmask[2]);
 }
 
 static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1627,7 +1668,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
        encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
        encode_nfs4_stateid(xdr, &arg->stateid);
-       encode_attrs(xdr, arg->iap, server);
+       encode_attrs(xdr, arg->iap, arg->label, server);
 }
 
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1889,7 +1930,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
        p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
                                    NFS4_DEVICEID4_SIZE);
        *p++ = cpu_to_be32(args->pdev->layout_type);
-       *p++ = cpu_to_be32(args->pdev->pglen);          /* gdia_maxcount */
+       *p++ = cpu_to_be32(args->pdev->maxcount);       /* gdia_maxcount */
        *p++ = cpu_to_be32(0);                          /* bitmap length 0 */
 }
 
@@ -4038,6 +4079,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
        return status;
 }
 
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+                                       struct nfs4_label *label)
+{
+       uint32_t pi = 0;
+       uint32_t lfs = 0;
+       __u32 len;
+       __be32 *p;
+       int status = 0;
+
+       if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+               return -EIO;
+       if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+               p = xdr_inline_decode(xdr, 4);
+               if (unlikely(!p))
+                       goto out_overflow;
+               lfs = be32_to_cpup(p++);
+               p = xdr_inline_decode(xdr, 4);
+               if (unlikely(!p))
+                       goto out_overflow;
+               pi = be32_to_cpup(p++);
+               p = xdr_inline_decode(xdr, 4);
+               if (unlikely(!p))
+                       goto out_overflow;
+               len = be32_to_cpup(p++);
+               p = xdr_inline_decode(xdr, len);
+               if (unlikely(!p))
+                       goto out_overflow;
+               if (len < NFS4_MAXLABELLEN) {
+                       if (label) {
+                               memcpy(label->label, p, len);
+                               label->len = len;
+                               label->pi = pi;
+                               label->lfs = lfs;
+                               status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+                       }
+                       bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+               } else
+                       printk(KERN_WARNING "%s: label too long (%u)!\n",
+                                       __func__, len);
+       }
+       if (label && label->label)
+               dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
+                       (char *)label->label, label->len, label->pi, label->lfs);
+       return status;
+
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
+
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
        int status = 0;
@@ -4380,7 +4471,7 @@ out_overflow:
 
 static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
                struct nfs_fattr *fattr, struct nfs_fh *fh,
-               struct nfs4_fs_locations *fs_loc,
+               struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
                const struct nfs_server *server)
 {
        int status;
@@ -4488,6 +4579,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
        if (status < 0)
                goto xdr_error;
 
+       if (label) {
+               status = decode_attr_security_label(xdr, bitmap, label);
+               if (status < 0)
+                       goto xdr_error;
+               fattr->valid |= status;
+       }
+
 xdr_error:
        dprintk("%s: xdr returned %d\n", __func__, -status);
        return status;
@@ -4495,7 +4593,7 @@ xdr_error:
 
 static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
                struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
-               const struct nfs_server *server)
+               struct nfs4_label *label, const struct nfs_server *server)
 {
        unsigned int savep;
        uint32_t attrlen,
@@ -4514,7 +4612,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
        if (status < 0)
                goto xdr_error;
 
-       status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
+       status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+                                       label, server);
        if (status < 0)
                goto xdr_error;
 
@@ -4524,10 +4623,16 @@ xdr_error:
        return status;
 }
 
+static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+               struct nfs4_label *label, const struct nfs_server *server)
+{
+       return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
+}
+
 static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
                const struct nfs_server *server)
 {
-       return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
+       return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
 }
 
 /*
@@ -5919,7 +6024,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
        status = decode_getfh(xdr, res->fh);
        if (status)
                goto out;
-       status = decode_getfattr(xdr, res->fattr, res->server);
+       status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
        return status;
 }
@@ -5945,7 +6050,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
                goto out;
        status = decode_getfh(xdr, res->fh);
        if (status == 0)
-               status = decode_getfattr(xdr, res->fattr, res->server);
+               status = decode_getfattr_label(xdr, res->fattr,
+                                               res->label, res->server);
 out:
        return status;
 }
@@ -6036,7 +6142,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
        status = decode_restorefh(xdr);
        if (status)
                goto out;
-       decode_getfattr(xdr, res->fattr, res->server);
+       decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
        return status;
 }
@@ -6065,7 +6171,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
        status = decode_getfh(xdr, res->fh);
        if (status)
                goto out;
-       decode_getfattr(xdr, res->fattr, res->server);
+       decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
        return status;
 }
@@ -6097,7 +6203,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
        status = decode_putfh(xdr);
        if (status)
                goto out;
-       status = decode_getfattr(xdr, res->fattr, res->server);
+       status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
        return status;
 }
@@ -6230,7 +6336,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
                goto out;
        if (res->access_request)
                decode_access(xdr, &res->access_supported, &res->access_result);
-       decode_getfattr(xdr, res->f_attr, res->server);
+       decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
 out:
        return status;
 }
@@ -6307,7 +6413,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
        status = decode_setattr(xdr);
        if (status)
                goto out;
-       decode_getfattr(xdr, res->fattr, res->server);
+       decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
        return status;
 }
@@ -6696,7 +6802,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
        xdr_enter_page(xdr, PAGE_SIZE);
        status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
                                         NULL, res->fs_locations,
-                                        res->fs_locations->server);
+                                        NULL, res->fs_locations->server);
 out:
        return status;
 }
@@ -7109,7 +7215,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
                goto out_overflow;
 
        if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
-                                 NULL, entry->server) < 0)
+                       NULL, entry->label, entry->server) < 0)
                goto out_overflow;
        if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
                entry->ino = entry->fattr->mounted_on_fileid;
index a9ebd817278b7beb301c222cb3e7abd1c183c3b4..e4f9cbfec67bfb2d497099a4221067de6939fe61 100644 (file)
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
        pd.pgbase = 0;
        pd.pglen = PAGE_SIZE;
        pd.mincount = 0;
+       pd.maxcount = PAGE_SIZE;
 
-       err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+       err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
+                       pnfslay->plh_lc_cred);
        dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
        if (err)
                goto err_out;
index c5bd758e563768d76b2a43c830dcba8b53e175cf..3a3a79d6bf15c4fa4dfabdf10802a0ee4be9c5aa 100644 (file)
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
-static inline u64
+static u64
 end_offset(u64 start, u64 len)
 {
        u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
  *           start2           end2
  *           [----------------)
  */
-static inline int
-lo_seg_contained(struct pnfs_layout_range *l1,
-                struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+                const struct pnfs_layout_range *l2)
 {
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
  *                              start2           end2
  *                              [----------------)
  */
-static inline int
-lo_seg_intersecting(struct pnfs_layout_range *l1,
-                   struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+                   const struct pnfs_layout_range *l2)
 {
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
 }
 
 static bool
-should_free_lseg(struct pnfs_layout_range *lseg_range,
-                struct pnfs_layout_range *recall_range)
+should_free_lseg(const struct pnfs_layout_range *lseg_range,
+                const struct pnfs_layout_range *recall_range)
 {
        return (recall_range->iomode == IOMODE_ANY ||
                lseg_range->iomode == recall_range->iomode) &&
-              lo_seg_intersecting(lseg_range, recall_range);
+              pnfs_lseg_range_intersecting(lseg_range, recall_range);
 }
 
 static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
@@ -766,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
        lgp->gfp_flags = gfp_flags;
+       lgp->cred = lo->plh_lc_cred;
 
        /* Synchronously retrieve layout information from server and
         * store in lseg.
@@ -860,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
        lrp->args.inode = ino;
        lrp->args.layout = lo;
        lrp->clp = NFS_SERVER(ino)->nfs_client;
+       lrp->cred = lo->plh_lc_cred;
 
        status = nfs4_proc_layoutreturn(lrp);
 out:
@@ -984,8 +986,8 @@ out:
  * are seen first.
  */
 static s64
-cmp_layout(struct pnfs_layout_range *l1,
-          struct pnfs_layout_range *l2)
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+          const struct pnfs_layout_range *l2)
 {
        s64 d;
 
@@ -1012,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
        dprintk("%s:Begin\n", __func__);
 
        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-               if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
+               if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
@@ -1050,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino,
        INIT_LIST_HEAD(&lo->plh_segs);
        INIT_LIST_HEAD(&lo->plh_bulk_destroy);
        lo->plh_inode = ino;
-       lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
+       lo->plh_lc_cred = get_rpccred(ctx->cred);
        return lo;
 }
 
@@ -1091,21 +1093,21 @@ out_existing:
  * READ                READ    true
  * READ                RW      true
  */
-static int
-is_matching_lseg(struct pnfs_layout_range *ls_range,
-                struct pnfs_layout_range *range)
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+                const struct pnfs_layout_range *range)
 {
        struct pnfs_layout_range range1;
 
        if ((range->iomode == IOMODE_RW &&
             ls_range->iomode != IOMODE_RW) ||
-           !lo_seg_intersecting(ls_range, range))
+           !pnfs_lseg_range_intersecting(ls_range, range))
                return 0;
 
        /* range1 covers only the first byte in the range */
        range1 = *range;
        range1.length = 1;
-       return lo_seg_contained(ls_range, &range1);
+       return pnfs_lseg_range_contained(ls_range, &range1);
 }
 
 /*
@@ -1121,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
-                   is_matching_lseg(&lseg->pls_range, range)) {
+                   pnfs_lseg_range_match(&lseg->pls_range, range)) {
                        ret = pnfs_get_lseg(lseg);
                        break;
                }
index f5f8a470a647c7dc2f3a475e3e852d1aa7c97f5e..a4f41810a7f497fbe5b34d497d8c1a5078a44524 100644 (file)
@@ -149,9 +149,10 @@ struct pnfs_device {
        struct nfs4_deviceid dev_id;
        unsigned int  layout_type;
        unsigned int  mincount;
+       unsigned int  maxcount; /* gdia_maxcount */
        struct page **pages;
        unsigned int  pgbase;
-       unsigned int  pglen;
+       unsigned int  pglen;    /* reply buffer length */
 };
 
 #define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
                                   const struct nfs_fh *fh,
                                   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
-                                  struct pnfs_device *dev);
+                                  struct pnfs_device *dev,
+                                  struct rpc_cred *cred);
 extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
index fc8de9016acfb2a6c3185e3008aa5f70e23f4545..c041c41f7a52bcc849400bd55eae238c47b1ad0d 100644 (file)
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
  */
 static int
 nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
-               struct nfs_fattr *fattr)
+               struct nfs_fattr *fattr, struct nfs4_label *label)
 {
        struct rpc_message msg = {
                .rpc_proc       = &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 static int
 nfs_proc_lookup(struct inode *dir, struct qstr *name,
-               struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+               struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+               struct nfs4_label *label)
 {
        struct nfs_diropargs    arg = {
                .fh             = NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
        nfs_mark_for_revalidate(dir);
        if (status == 0)
-               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
        nfs_free_createdata(data);
 out:
        dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
                status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
        }
        if (status == 0)
-               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
        nfs_free_createdata(data);
 out:
        dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
         * should fill in the data with a LOOKUP call on the wire.
         */
        if (status == 0)
-               status = nfs_instantiate(dentry, fh, fattr);
+               status = nfs_instantiate(dentry, fh, fattr, NULL);
 
 out_free:
        nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
        status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
        nfs_mark_for_revalidate(dir);
        if (status == 0)
-               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+               status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
        nfs_free_createdata(data);
 out:
        dprintk("NFS reply mkdir: %d\n", status);
index 2d7525fbcf250225981ab521da638fc8f2a204d5..71fdc0dfa0d28d9e293cbb58729d63395af7cc99 100644 (file)
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
 
 enum {
        Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
-       Opt_vers_4_1,
+       Opt_vers_4_1, Opt_vers_4_2,
 
        Opt_vers_err
 };
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
        { Opt_vers_4, "4" },
        { Opt_vers_4_0, "4.0" },
        { Opt_vers_4_1, "4.1" },
+       { Opt_vers_4_2, "4.2" },
 
        { Opt_vers_err, NULL }
 };
@@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
                seq_printf(m, "\n\tnfsv4:\t");
                seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
                seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+               seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
                seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
                show_sessions(m, nfss);
                show_pnfs(m, nfss);
@@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
                mnt->version = 4;
                mnt->minorversion = 1;
                break;
+       case Opt_vers_4_2:
+               mnt->version = 4;
+               mnt->minorversion = 2;
+               break;
        default:
                return 0;
        }
@@ -1608,29 +1614,13 @@ out_security_failure:
 }
 
 /*
- * Select a security flavor for this mount.  The selected flavor
- * is planted in args->auth_flavors[0].
- *
- * Returns 0 on success, -EACCES on failure.
+ * Ensure that the specified authtype in args->auth_flavors[0] is supported by
+ * the server. Returns 0 if it's ok, and -EACCES if not.
  */
-static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
-                             struct nfs_mount_request *request)
+static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+                       rpc_authflavor_t *server_authlist, unsigned int count)
 {
-       unsigned int i, count = *(request->auth_flav_len);
-       rpc_authflavor_t flavor;
-
-       /*
-        * The NFSv2 MNT operation does not return a flavor list.
-        */
-       if (args->mount_server.version != NFS_MNT3_VERSION)
-               goto out_default;
-
-       /*
-        * Certain releases of Linux's mountd return an empty
-        * flavor list in some cases.
-        */
-       if (count == 0)
-               goto out_default;
+       unsigned int i;
 
        /*
         * If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1640,60 +1630,19 @@ static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
         * means that the server will ignore the rpc creds, so any flavor
         * can be used.
         */
-       if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
-               for (i = 0; i < count; i++) {
-                       if (args->auth_flavors[0] == request->auth_flavs[i] ||
-                           request->auth_flavs[i] == RPC_AUTH_NULL)
-                               goto out;
-               }
-               dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n",
-                       args->auth_flavors[0]);
-               goto out_err;
-       }
-
-       /*
-        * RFC 2623, section 2.7 suggests we SHOULD prefer the
-        * flavor listed first.  However, some servers list
-        * AUTH_NULL first.  Avoid ever choosing AUTH_NULL.
-        */
        for (i = 0; i < count; i++) {
-               struct rpcsec_gss_info info;
-
-               flavor = request->auth_flavs[i];
-               switch (flavor) {
-               case RPC_AUTH_UNIX:
-                       goto out_set;
-               case RPC_AUTH_NULL:
-                       continue;
-               default:
-                       if (rpcauth_get_gssinfo(flavor, &info) == 0)
-                               goto out_set;
-               }
+               if (args->auth_flavors[0] == server_authlist[i] ||
+                   server_authlist[i] == RPC_AUTH_NULL)
+                       goto out;
        }
 
-       /*
-        * As a last chance, see if the server list contains AUTH_NULL -
-        * if it does, use the default flavor.
-        */
-       for (i = 0; i < count; i++) {
-               if (request->auth_flavs[i] == RPC_AUTH_NULL)
-                       goto out_default;
-       }
-
-       dfprintk(MOUNT, "NFS: no auth flavors in common with server\n");
-       goto out_err;
+       dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
+               args->auth_flavors[0]);
+       return -EACCES;
 
-out_default:
-       /* use default if flavor not already set */
-       flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ?
-               RPC_AUTH_UNIX : args->auth_flavors[0];
-out_set:
-       args->auth_flavors[0] = flavor;
 out:
-       dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]);
+       dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
        return 0;
-out_err:
-       return -EACCES;
 }
 
 /*
@@ -1701,10 +1650,10 @@ out_err:
  * corresponding to the provided path.
  */
 static int nfs_request_mount(struct nfs_parsed_mount_data *args,
-                            struct nfs_fh *root_fh)
+                            struct nfs_fh *root_fh,
+                            rpc_authflavor_t *server_authlist,
+                            unsigned int *server_authlist_len)
 {
-       rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
-       unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
        struct nfs_mount_request request = {
                .sap            = (struct sockaddr *)
                                                &args->mount_server.address,
@@ -1712,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
                .protocol       = args->mount_server.protocol,
                .fh             = root_fh,
                .noresvport     = args->flags & NFS_MOUNT_NORESVPORT,
-               .auth_flav_len  = &server_authlist_len,
+               .auth_flav_len  = server_authlist_len,
                .auth_flavs     = server_authlist,
                .net            = args->net,
        };
@@ -1756,24 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
                return status;
        }
 
-       return nfs_select_flavor(args, &request);
+       return 0;
 }
 
-struct dentry *nfs_try_mount(int flags, const char *dev_name,
-                            struct nfs_mount_info *mount_info,
-                            struct nfs_subversion *nfs_mod)
+static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
+                                       struct nfs_subversion *nfs_mod)
 {
        int status;
-       struct nfs_server *server;
+       unsigned int i;
+       bool tried_auth_unix = false;
+       bool auth_null_in_list = false;
+       struct nfs_server *server = ERR_PTR(-EACCES);
+       struct nfs_parsed_mount_data *args = mount_info->parsed;
+       rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
+       unsigned int authlist_len = ARRAY_SIZE(authlist);
+
+       status = nfs_request_mount(args, mount_info->mntfh, authlist,
+                                       &authlist_len);
+       if (status)
+               return ERR_PTR(status);
 
-       if (mount_info->parsed->need_mount) {
-               status = nfs_request_mount(mount_info->parsed, mount_info->mntfh);
+       /*
+        * Was a sec= authflavor specified in the options? First, verify
+        * whether the server supports it, and then just try to use it if so.
+        */
+       if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
+               status = nfs_verify_authflavor(args, authlist, authlist_len);
+               dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
                if (status)
                        return ERR_PTR(status);
+               return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+       }
+
+       /*
+        * No sec= option was provided. RFC 2623, section 2.7 suggests we
+        * SHOULD prefer the flavor listed first. However, some servers list
+        * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
+        */
+       for (i = 0; i < authlist_len; ++i) {
+               rpc_authflavor_t flavor;
+               struct rpcsec_gss_info info;
+
+               flavor = authlist[i];
+               switch (flavor) {
+               case RPC_AUTH_UNIX:
+                       tried_auth_unix = true;
+                       break;
+               case RPC_AUTH_NULL:
+                       auth_null_in_list = true;
+                       continue;
+               default:
+                       if (rpcauth_get_gssinfo(flavor, &info) != 0)
+                               continue;
+                       /* Fallthrough */
+               }
+               dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
+               args->auth_flavors[0] = flavor;
+               server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+               if (!IS_ERR(server))
+                       return server;
        }
 
-       /* Get a volume representation */
-       server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+       /*
+        * Nothing we tried so far worked. At this point, give up if we've
+        * already tried AUTH_UNIX or if the server's list doesn't contain
+        * AUTH_NULL
+        */
+       if (tried_auth_unix || !auth_null_in_list)
+               return server;
+
+       /* Last chance! Try AUTH_UNIX */
+       dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
+       args->auth_flavors[0] = RPC_AUTH_UNIX;
+       return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+}
+
+struct dentry *nfs_try_mount(int flags, const char *dev_name,
+                            struct nfs_mount_info *mount_info,
+                            struct nfs_subversion *nfs_mod)
+{
+       struct nfs_server *server;
+
+       if (mount_info->parsed->need_mount)
+               server = nfs_try_mount_request(mount_info, nfs_mod);
+       else
+               server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+
        if (IS_ERR(server))
                return ERR_CAST(server);
 
@@ -2412,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server)
 int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
                        struct nfs_mount_info *mount_info)
 {
-       return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
+       int error;
+       unsigned long kflags = 0, kflags_out = 0;
+       if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+               kflags |= SECURITY_LSM_NATIVE_LABELS;
+
+       error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
+                                               kflags, &kflags_out);
+       if (error)
+               goto err;
+
+       if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+               !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+               NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+err:
+       return error;
 }
 EXPORT_SYMBOL_GPL(nfs_set_sb_security);
 
index 1f1f38f0c5d58703ca66c8477d551b708d9cab7b..60395ad3a2e475076ee3382e05b221aacdcaa60b 100644 (file)
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 
        dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name,
-               dentry->d_count);
+               d_count(dentry));
        nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
 
        /*
index 07a473fd49bc1a577bfbc1c44c37efaafb9d01a0..c0d93170585d203e9543b898c5e3691e6234e920 100644 (file)
@@ -243,6 +243,12 @@ void               nfsd_lockd_shutdown(void);
 #define nfserr_reject_deleg            cpu_to_be32(NFS4ERR_REJECT_DELEG)
 #define nfserr_returnconflict          cpu_to_be32(NFS4ERR_RETURNCONFLICT)
 #define nfserr_deleg_revoked           cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define nfserr_partner_notsupp         cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define nfserr_partner_no_auth         cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define nfserr_metadata_notsupp                cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define nfserr_offload_denied          cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define nfserr_wrong_lfs               cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define nfserr_badlabel                cpu_to_be32(NFS4ERR_BADLABEL)
 
 /* error codes for internal use */
 /* if a request fails due to kmalloc failure, it gets dropped.
index 1427de5ebf4d7d2a66d05d3ef0b164ef4a9312d5..af3ba0478cdf5d7acedd8fba777f9217ae00f986 100644 (file)
@@ -996,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 
 static int nilfs_tree_was_touched(struct dentry *root_dentry)
 {
-       return root_dentry->d_count > 1;
+       return d_count(root_dentry) > 1;
 }
 
 /**
index 3e64169ef52710ff11f9c7f8c44ee3c910cbd321..fbad622841f904ebd471ae0668f03018309a076e 100644 (file)
@@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
        return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 
-static ctl_table fs_dqstats_table[] = {
+static struct ctl_table fs_dqstats_table[] = {
        {
                .procname       = "lookups",
                .data           = &dqstats.stat[DQST_LOOKUPS],
@@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
        { },
 };
 
-static ctl_table fs_table[] = {
+static struct ctl_table fs_table[] = {
        {
                .procname       = "quota",
                .mode           = 0555,
@@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = {
        { },
 };
 
-static ctl_table sys_table[] = {
+static struct ctl_table sys_table[] = {
        {
                .procname       = "fs",
                .mode           = 0555,
index 774c1eb7f1c926c73a7018ae47d9cfb03f75da76..3135c2525c76635606ae9722f6badb5a11e44543 100644 (file)
@@ -921,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
                return rcu_dereference(node->next);
 }
 EXPORT_SYMBOL(seq_hlist_next_rcu);
+
+/**
+ * seq_hlist_start_percpu - start an iteration of a percpu hlist array
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *
+seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
+{
+       struct hlist_node *node;
+
+       for_each_possible_cpu(*cpu) {
+               hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
+                       if (pos-- == 0)
+                               return node;
+               }
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_percpu);
+
+/**
+ * seq_hlist_next_percpu - move to the next position of the percpu hlist array
+ * @v:    pointer to current hlist_node
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *
+seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
+                       int *cpu, loff_t *pos)
+{
+       struct hlist_node *node = v;
+
+       ++*pos;
+
+       if (node->next)
+               return node->next;
+
+       for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
+            *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
+               struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
+
+               if (!hlist_empty(bucket))
+                       return bucket->first;
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_next_percpu);
index 6313b69b6644cd6b8ab5d1a153f6e13c38e39e82..4a4508023a3c15724a2edfd87204550f718c805b 100644 (file)
@@ -71,6 +71,7 @@ xfs-y                         += xfs_alloc.o \
                                   xfs_dir2_sf.o \
                                   xfs_ialloc.o \
                                   xfs_ialloc_btree.o \
+                                  xfs_icreate_item.o \
                                   xfs_inode.o \
                                   xfs_log_recover.o \
                                   xfs_mount.o \
index 5673bcfda2f0c8c0151142b52dc5202d7030da63..71596e57283ae6b44702f8d6866405de6b911728 100644 (file)
@@ -175,6 +175,7 @@ xfs_alloc_compute_diff(
        xfs_agblock_t   wantbno,        /* target starting block */
        xfs_extlen_t    wantlen,        /* target length */
        xfs_extlen_t    alignment,      /* target alignment */
+       char            userdata,       /* are we allocating data? */
        xfs_agblock_t   freebno,        /* freespace's starting block */
        xfs_extlen_t    freelen,        /* freespace's length */
        xfs_agblock_t   *newbnop)       /* result: best start block from free */
@@ -189,7 +190,14 @@ xfs_alloc_compute_diff(
        ASSERT(freelen >= wantlen);
        freeend = freebno + freelen;
        wantend = wantbno + wantlen;
-       if (freebno >= wantbno) {
+       /*
+        * We want to allocate from the start of a free extent if it is past
+        * the desired block or if we are allocating user data and the free
+        * extent is before desired block. The second case is there to allow
+        * for contiguous allocation from the remaining free space if the file
+        * grows in the short term.
+        */
+       if (freebno >= wantbno || (userdata && freeend < wantend)) {
                if ((newbno1 = roundup(freebno, alignment)) >= freeend)
                        newbno1 = NULLAGBLOCK;
        } else if (freeend >= wantend && alignment > 1) {
@@ -805,7 +813,8 @@ xfs_alloc_find_best_extent(
                        xfs_alloc_fix_len(args);
 
                        sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                                                      args->alignment, *sbnoa,
+                                                      args->alignment,
+                                                      args->userdata, *sbnoa,
                                                       *slena, &new);
 
                        /*
@@ -976,7 +985,8 @@ restart:
                        if (args->len < blen)
                                continue;
                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, ltbnoa, ltlena, &ltnew);
+                               args->alignment, args->userdata, ltbnoa,
+                               ltlena, &ltnew);
                        if (ltnew != NULLAGBLOCK &&
                            (args->len > blen || ltdiff < bdiff)) {
                                bdiff = ltdiff;
@@ -1128,7 +1138,8 @@ restart:
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, ltbnoa, ltlena, &ltnew);
+                               args->alignment, args->userdata, ltbnoa,
+                               ltlena, &ltnew);
 
                        error = xfs_alloc_find_best_extent(args,
                                                &bno_cur_lt, &bno_cur_gt,
@@ -1144,7 +1155,8 @@ restart:
                        args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
                        xfs_alloc_fix_len(args);
                        gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-                               args->alignment, gtbnoa, gtlena, &gtnew);
+                               args->alignment, args->userdata, gtbnoa,
+                               gtlena, &gtnew);
 
                        error = xfs_alloc_find_best_extent(args,
                                                &bno_cur_gt, &bno_cur_lt,
@@ -1203,7 +1215,7 @@ restart:
        }
        rlen = args->len;
        (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
-                                    ltbnoa, ltlena, &ltnew);
+                                    args->userdata, ltbnoa, ltlena, &ltnew);
        ASSERT(ltnew >= ltbno);
        ASSERT(ltnew + rlen <= ltbnoa + ltlena);
        ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
index 70c43d9f72c1a8a45800374edb411a2a42e84c71..1b726d6269412d8a9685a6db71c33bf9da9558ae 100644 (file)
@@ -196,6 +196,8 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
 #define XFS_BMDR_SPACE_CALC(nrecs) \
        (int)(sizeof(xfs_bmdr_block_t) + \
               ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+       (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
 
 /*
  * Maximum number of bmap btree levels.
index 4ec431777048740528d0d1b96ca7f52e6ba69d01..bfc4e0c26fd3404fb36f007da344be79543aea4c 100644 (file)
@@ -140,6 +140,16 @@ xfs_buf_item_size(
 
        ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
 
+       if (bip->bli_flags & XFS_BLI_ORDERED) {
+               /*
+                * The buffer has been logged just to order it.
+                * It is not being included in the transaction
+                * commit, so no vectors are used at all.
+                */
+               trace_xfs_buf_item_size_ordered(bip);
+               return XFS_LOG_VEC_ORDERED;
+       }
+
        /*
         * the vector count is based on the number of buffer vectors we have
         * dirty bits in. This will only be greater than one when we have a
@@ -212,6 +222,7 @@ xfs_buf_item_format_segment(
                goto out;
        }
 
+
        /*
         * Fill in an iovec for each set of contiguous chunks.
         */
@@ -299,18 +310,36 @@ xfs_buf_item_format(
 
        /*
         * If it is an inode buffer, transfer the in-memory state to the
-        * format flags and clear the in-memory state. We do not transfer
+        * format flags and clear the in-memory state.
+        *
+        * For buffer based inode allocation, we do not transfer
         * this state if the inode buffer allocation has not yet been committed
         * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
         * correct replay of the inode allocation.
+        *
+        * For icreate item based inode allocation, the buffers aren't written
+        * to the journal during allocation, and hence we should always tag the
+        * buffer as an inode buffer so that the correct unlinked list replay
+        * occurs during recovery.
         */
        if (bip->bli_flags & XFS_BLI_INODE_BUF) {
-               if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+               if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+                   !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
                      xfs_log_item_in_current_chkpt(lip)))
                        bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
                bip->bli_flags &= ~XFS_BLI_INODE_BUF;
        }
 
+       if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+                                                       XFS_BLI_ORDERED) {
+               /*
+                * The buffer has been logged just to order it.  It is not being
+                * included in the transaction commit, so don't format it.
+                */
+               trace_xfs_buf_item_format_ordered(bip);
+               return;
+       }
+
        for (i = 0; i < bip->bli_format_count; i++) {
                vecp = xfs_buf_item_format_segment(bip, vecp, offset,
                                                &bip->bli_formats[i]);
@@ -340,6 +369,7 @@ xfs_buf_item_pin(
 
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+              (bip->bli_flags & XFS_BLI_ORDERED) ||
               (bip->bli_flags & XFS_BLI_STALE));
 
        trace_xfs_buf_item_pin(bip);
@@ -512,8 +542,9 @@ xfs_buf_item_unlock(
 {
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        struct xfs_buf          *bp = bip->bli_buf;
-       int                     aborted, clean, i;
-       uint                    hold;
+       bool                    clean;
+       bool                    aborted;
+       int                     flags;
 
        /* Clear the buffer's association with this transaction. */
        bp->b_transp = NULL;
@@ -524,23 +555,21 @@ xfs_buf_item_unlock(
         * (cancelled) buffers at unpin time, but we'll never go through the
         * pin/unpin cycle if we abort inside commit.
         */
-       aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
-
+       aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
        /*
-        * Before possibly freeing the buf item, determine if we should
-        * release the buffer at the end of this routine.
+        * Before possibly freeing the buf item, copy the per-transaction state
+        * so we can reference it safely later after clearing it from the
+        * buffer log item.
         */
-       hold = bip->bli_flags & XFS_BLI_HOLD;
-
-       /* Clear the per transaction state. */
-       bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
+       flags = bip->bli_flags;
+       bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
 
        /*
         * If the buf item is marked stale, then don't do anything.  We'll
         * unlock the buffer and free the buf item when the buffer is unpinned
         * for the last time.
         */
-       if (bip->bli_flags & XFS_BLI_STALE) {
+       if (flags & XFS_BLI_STALE) {
                trace_xfs_buf_item_unlock_stale(bip);
                ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
                if (!aborted) {
@@ -557,13 +586,19 @@ xfs_buf_item_unlock(
         * be the only reference to the buf item, so we free it anyway
         * regardless of whether it is dirty or not. A dirty abort implies a
         * shutdown, anyway.
+        *
+        * Ordered buffers are dirty but may have no recorded changes, so ensure
+        * we only release clean items here.
         */
-       clean = 1;
-       for (i = 0; i < bip->bli_format_count; i++) {
-               if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
-                            bip->bli_formats[i].blf_map_size)) {
-                       clean = 0;
-                       break;
+       clean = (flags & XFS_BLI_DIRTY) ? false : true;
+       if (clean) {
+               int i;
+               for (i = 0; i < bip->bli_format_count; i++) {
+                       if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+                                    bip->bli_formats[i].blf_map_size)) {
+                               clean = false;
+                               break;
+                       }
                }
        }
        if (clean)
@@ -576,7 +611,7 @@ xfs_buf_item_unlock(
        } else
                atomic_dec(&bip->bli_refcount);
 
-       if (!hold)
+       if (!(flags & XFS_BLI_HOLD))
                xfs_buf_relse(bp);
 }
 
@@ -841,12 +876,6 @@ xfs_buf_item_log(
        uint                    end;
        struct xfs_buf          *bp = bip->bli_buf;
 
-       /*
-        * Mark the item as having some dirty data for
-        * quick reference in xfs_buf_item_dirty.
-        */
-       bip->bli_flags |= XFS_BLI_DIRTY;
-
        /*
         * walk each buffer segment and mark them dirty appropriately.
         */
@@ -873,7 +902,7 @@ xfs_buf_item_log(
 
 
 /*
- * Return 1 if the buffer has some data that has been logged (at any
+ * Return 1 if the buffer has been logged or ordered in a transaction (at any
  * point, not just the current transaction) and 0 if not.
  */
 uint
@@ -907,11 +936,11 @@ void
 xfs_buf_item_relse(
        xfs_buf_t       *bp)
 {
-       xfs_buf_log_item_t      *bip;
+       xfs_buf_log_item_t      *bip = bp->b_fspriv;
 
        trace_xfs_buf_item_relse(bp, _RET_IP_);
+       ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
 
-       bip = bp->b_fspriv;
        bp->b_fspriv = bip->bli_item.li_bio_list;
        if (bp->b_fspriv == NULL)
                bp->b_iodone = NULL;
index 2573d2a75fc83e937b6f14f44767b245e17f5575..0f1c247dc680031fe06554a4bd41f6f962290a53 100644 (file)
@@ -120,6 +120,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
 #define        XFS_BLI_INODE_ALLOC_BUF 0x10
 #define XFS_BLI_STALE_INODE    0x20
 #define        XFS_BLI_INODE_BUF       0x40
+#define        XFS_BLI_ORDERED         0x80
 
 #define XFS_BLI_FLAGS \
        { XFS_BLI_HOLD,         "HOLD" }, \
@@ -128,7 +129,8 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
        { XFS_BLI_LOGGED,       "LOGGED" }, \
        { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
        { XFS_BLI_STALE_INODE,  "STALE_INODE" }, \
-       { XFS_BLI_INODE_BUF,    "INODE_BUF" }
+       { XFS_BLI_INODE_BUF,    "INODE_BUF" }, \
+       { XFS_BLI_ORDERED,      "ORDERED" }
 
 
 #ifdef __KERNEL__
index c407e1ccff438a1db7e11cfdca1db4d45beea000..e36445ceaf80c82b46c1ef634a3495c43c845685 100644 (file)
@@ -24,6 +24,9 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
         */
        if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
                if (XFS_IFORK_BOFF(ip) &&
-                   tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+                   XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
                        return EINVAL;
                if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
                    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
        /* Reciprocal target->temp btree format checks */
        if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
                if (XFS_IFORK_BOFF(tip) &&
-                   ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+                   XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
                        return EINVAL;
-
                if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
                    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
                        return EINVAL;
index e0cc1243a8aaea11765336a8f8908b25b6b1c963..2aed25cae04d9f265df00aba8e5d63a624350a0f 100644 (file)
@@ -1108,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
        struct xfs_mount        *mp = dp->i_mount;
        struct xfs_buf          *bp = *bpp;
        struct xfs_bmbt_irec    *map = mip->map;
+       struct blk_plug         plug;
        int                     error = 0;
        int                     length;
        int                     i;
@@ -1236,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
        /*
         * Do we need more readahead?
         */
+       blk_start_plug(&plug);
        for (mip->ra_index = mip->ra_offset = i = 0;
             mip->ra_want > mip->ra_current && i < mip->map_blocks;
             i += mp->m_dirblkfsbs) {
@@ -1287,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
                        }
                }
        }
+       blk_finish_plug(&plug);
 
 out:
        *bpp = bp;
index 044e97a33c8d0a155f1ea026a20881f95dd95274..f01012de06d0b3d3809c4dc2e09c9e5ba6f38982 100644 (file)
@@ -570,13 +570,13 @@ xfs_qm_dqtobp(
        xfs_buf_t               **O_bpp,
        uint                    flags)
 {
-       xfs_bmbt_irec_t map;
-       int             nmaps = 1, error;
-       xfs_buf_t       *bp;
-       xfs_inode_t     *quotip = XFS_DQ_TO_QIP(dqp);
-       xfs_mount_t     *mp = dqp->q_mount;
-       xfs_dqid_t      id = be32_to_cpu(dqp->q_core.d_id);
-       xfs_trans_t     *tp = (tpp ? *tpp : NULL);
+       struct xfs_bmbt_irec    map;
+       int                     nmaps = 1, error;
+       struct xfs_buf          *bp;
+       struct xfs_inode        *quotip = xfs_dq_to_quota_inode(dqp);
+       struct xfs_mount        *mp = dqp->q_mount;
+       xfs_dqid_t              id = be32_to_cpu(dqp->q_core.d_id);
+       struct xfs_trans        *tp = (tpp ? *tpp : NULL);
 
        dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
@@ -804,7 +804,7 @@ xfs_qm_dqget(
        xfs_dquot_t     **O_dqpp) /* OUT : locked incore dquot */
 {
        struct xfs_quotainfo    *qi = mp->m_quotainfo;
-       struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+       struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
        struct xfs_dquot        *dqp;
        int                     error;
 
index 4f0ebfc43cc962c8a93121682ec1755ee003905b..b596626249b84e9eb91d29a2b110e57532d46a60 100644 (file)
@@ -143,10 +143,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
 #define XFS_QM_ISUDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_USER)
 #define XFS_QM_ISPDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_PROJ)
 #define XFS_QM_ISGDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp)    ((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp)     (XFS_QM_ISUDQ(dqp) ? \
-                                XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
-                                XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
 
 extern int             xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
                                        uint, struct xfs_dquot  **);
index 3c3644ea825b65edd813965296aa4c73d6214313..614eb0cc360860214ce08443ff84993afa531143 100644 (file)
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
        if (!bp)
                return EIO;
        if (bp->b_error) {
-               int     error = bp->b_error;
+               error = bp->b_error;
                xfs_buf_relse(bp);
                return error;
        }
index c8f5ae1debf2aed95fd272eadd4ec2dd4eaba13c..7a0c17d7ec0974354cfd645695e9f4f7778704fa 100644 (file)
@@ -38,6 +38,7 @@
 #include "xfs_bmap.h"
 #include "xfs_cksum.h"
 #include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
 
 
 /*
@@ -150,12 +151,16 @@ xfs_check_agi_freecount(
 #endif
 
 /*
- * Initialise a new set of inodes.
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
  */
-STATIC int
+int
 xfs_ialloc_inode_init(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
+       struct list_head        *buffer_list,
        xfs_agnumber_t          agno,
        xfs_agblock_t           agbno,
        xfs_agblock_t           length,
@@ -208,6 +213,18 @@ xfs_ialloc_inode_init(
                version = 3;
                ino = XFS_AGINO_TO_INO(mp, agno,
                                       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+               /*
+                * log the initialisation that is about to take place as a
+                * logical operation. This means the transaction does not
+                * need to log the physical changes to the inode buffers as log
+                * recovery will know what initialisation is actually needed.
+                * Hence we only need to log the buffers as "ordered" buffers so
+                * they track in the AIL as if they were physically logged.
+                */
+               if (tp)
+                       xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
+                                       mp->m_sb.sb_inodesize, length, gen);
        } else if (xfs_sb_version_hasnlink(&mp->m_sb))
                version = 2;
        else
@@ -223,13 +240,8 @@ xfs_ialloc_inode_init(
                                         XBF_UNMAPPED);
                if (!fbuf)
                        return ENOMEM;
-               /*
-                * Initialize all inodes in this buffer and then log them.
-                *
-                * XXX: It would be much better if we had just one transaction
-                *      to log a whole cluster of inodes instead of all the
-                *      individual transactions causing a lot of log traffic.
-                */
+
+               /* Initialize the inode buffers and log them appropriately. */
                fbuf->b_ops = &xfs_inode_buf_ops;
                xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
                for (i = 0; i < ninodes; i++) {
@@ -247,18 +259,39 @@ xfs_ialloc_inode_init(
                                ino++;
                                uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
                                xfs_dinode_calc_crc(mp, free);
-                       } else {
+                       } else if (tp) {
                                /* just log the inode core */
                                xfs_trans_log_buf(tp, fbuf, ioffset,
                                                  ioffset + isize - 1);
                        }
                }
-               if (version == 3) {
-                       /* need to log the entire buffer */
-                       xfs_trans_log_buf(tp, fbuf, 0,
-                                         BBTOB(fbuf->b_length) - 1);
+
+               if (tp) {
+                       /*
+                        * Mark the buffer as an inode allocation buffer so it
+                        * sticks in AIL at the point of this allocation
+                        * transaction. This ensures they are on disk before
+                        * the tail of the log can be moved past this
+                        * transaction (i.e. by preventing relogging from moving
+                        * it forward in the log).
+                        */
+                       xfs_trans_inode_alloc_buf(tp, fbuf);
+                       if (version == 3) {
+                               /*
+                                * Mark the buffer as ordered so that it is
+                                * not physically logged in the transaction but
+                                * still tracked in the AIL as part of the
+                                * transaction and pin the log appropriately.
+                                */
+                               xfs_trans_ordered_buf(tp, fbuf);
+                               xfs_trans_log_buf(tp, fbuf, 0,
+                                                 BBTOB(fbuf->b_length) - 1);
+                       }
+               } else {
+                       fbuf->b_flags |= XBF_DONE;
+                       xfs_buf_delwri_queue(fbuf, buffer_list);
+                       xfs_buf_relse(fbuf);
                }
-               xfs_trans_inode_alloc_buf(tp, fbuf);
        }
        return 0;
 }
@@ -303,7 +336,7 @@ xfs_ialloc_ag_alloc(
         * First try to allocate inodes contiguous with the last-allocated
         * chunk of inodes.  If the filesystem is striped, this will fill
         * an entire stripe unit with inodes.
-        */
+        */
        agi = XFS_BUF_TO_AGI(agbp);
        newino = be32_to_cpu(agi->agi_newino);
        agno = be32_to_cpu(agi->agi_seqno);
@@ -402,7 +435,7 @@ xfs_ialloc_ag_alloc(
         * rather than a linear progression to prevent the next generation
         * number from being easily guessable.
         */
-       error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
+       error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
                        args.len, prandom_u32());
 
        if (error)
@@ -615,8 +648,7 @@ xfs_ialloc_get_rec(
        struct xfs_btree_cur    *cur,
        xfs_agino_t             agino,
        xfs_inobt_rec_incore_t  *rec,
-       int                     *done,
-       int                     left)
+       int                     *done)
 {
        int                     error;
        int                     i;
@@ -724,12 +756,12 @@ xfs_dialloc_ag(
                    pag->pagl_leftrec != NULLAGINO &&
                    pag->pagl_rightrec != NULLAGINO) {
                        error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
-                                                  &trec, &doneleft, 1);
+                                                  &trec, &doneleft);
                        if (error)
                                goto error1;
 
                        error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
-                                                  &rec, &doneright, 0);
+                                                  &rec, &doneright);
                        if (error)
                                goto error1;
                } else {
index c8da3df271e6b94c97bc76d627ad1c1052ad03b7..68c07320f096e53d31f3050850414e1038c79c4a 100644 (file)
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
 int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
                xfs_inobt_rec_incore_t *rec, int *stat);
 
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+                         struct list_head *buffer_list,
+                         xfs_agnumber_t agno, xfs_agblock_t agbno,
+                         xfs_agblock_t length, unsigned int gen);
+
 extern const struct xfs_buf_ops xfs_agi_buf_ops;
 
 #endif /* __XFS_IALLOC_H__ */
index 96e344e3e9277fdc0bb307d04d8945d81602c70a..9560dc1f15a96299392e990da3a08cf7317b89b6 100644 (file)
@@ -335,7 +335,8 @@ xfs_iget_cache_miss(
        iflags = XFS_INEW;
        if (flags & XFS_IGET_DONTCACHE)
                iflags |= XFS_IDONTCACHE;
-       ip->i_udquot = ip->i_gdquot = NULL;
+       ip->i_udquot = NULL;
+       ip->i_gdquot = NULL;
        xfs_iflags_set(ip, iflags);
 
        /* insert the new inode */
index e0f138c70a2ff666e5b831651c875e2545e66a2c..a01afbb3909a465a6e94f23bdc819530a3f22140 100644 (file)
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
 void xfs_eofblocks_worker(struct work_struct *);
 
-int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
        int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
                int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644 (file)
index 0000000..7716a4e
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_error.h"
+#include "xfs_icreate_item.h"
+
+kmem_zone_t    *xfs_icreate_zone;              /* inode create item zone */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+       return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC uint
+xfs_icreate_item_size(
+       struct xfs_log_item     *lip)
+{
+       return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+       struct xfs_log_item     *lip,
+       struct xfs_log_iovec    *log_vector)
+{
+       struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+       log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
+       log_vector->i_len  = sizeof(struct xfs_icreate_log);
+       log_vector->i_type = XLOG_REG_TYPE_ICREATE;
+}
+
+
+/* Pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_pin(
+       struct xfs_log_item     *lip)
+{
+}
+
+
+/* pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_unpin(
+       struct xfs_log_item     *lip,
+       int                     remove)
+{
+}
+
+STATIC void
+xfs_icreate_item_unlock(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+       if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+               kmem_zone_free(xfs_icreate_zone, icp);
+       return;
+}
+
+/*
+ * Because we have ordered buffers being tracked in the AIL for the inode
+ * creation, we don't need the create item after this. Hence we can free
+ * the log item and return -1 to tell the caller we're done with the item.
+ */
+STATIC xfs_lsn_t
+xfs_icreate_item_committed(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+       struct xfs_icreate_item *icp = ICR_ITEM(lip);
+
+       kmem_zone_free(xfs_icreate_zone, icp);
+       return (xfs_lsn_t)-1;
+}
+
+/* item can never get into the AIL */
+STATIC uint
+xfs_icreate_item_push(
+       struct xfs_log_item     *lip,
+       struct list_head        *buffer_list)
+{
+       ASSERT(0);
+       return XFS_ITEM_SUCCESS;
+}
+
+/* Ordered buffers do the dependency tracking here, so this does nothing. */
+STATIC void
+xfs_icreate_item_committing(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_icreate_item_ops = {
+       .iop_size       = xfs_icreate_item_size,
+       .iop_format     = xfs_icreate_item_format,
+       .iop_pin        = xfs_icreate_item_pin,
+       .iop_unpin      = xfs_icreate_item_unpin,
+       .iop_push       = xfs_icreate_item_push,
+       .iop_unlock     = xfs_icreate_item_unlock,
+       .iop_committed  = xfs_icreate_item_committed,
+       .iop_committing = xfs_icreate_item_committing,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           agbno,
+       unsigned int            count,
+       unsigned int            inode_size,
+       xfs_agblock_t           length,
+       unsigned int            generation)
+{
+       struct xfs_icreate_item *icp;
+
+       icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+
+       xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+                         &xfs_icreate_item_ops);
+
+       icp->ic_format.icl_type = XFS_LI_ICREATE;
+       icp->ic_format.icl_size = 1;    /* single vector */
+       icp->ic_format.icl_ag = cpu_to_be32(agno);
+       icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+       icp->ic_format.icl_count = cpu_to_be32(count);
+       icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+       icp->ic_format.icl_length = cpu_to_be32(length);
+       icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+       xfs_trans_add_item(tp, &icp->ic_item);
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644 (file)
index 0000000..88ba8aa
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2008-2010, Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H     1
+
+/*
+ * on disk log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+       __uint16_t      icl_type;       /* type of log format structure */
+       __uint16_t      icl_size;       /* size of log format structure */
+       __be32          icl_ag;         /* ag being allocated in */
+       __be32          icl_agbno;      /* start block of inode range */
+       __be32          icl_count;      /* number of inodes to initialise */
+       __be32          icl_isize;      /* size of inodes */
+       __be32          icl_length;     /* length of extent to initialise */
+       __be32          icl_gen;        /* inode generation number to use */
+};
+
+/* in memory log item structure */
+struct xfs_icreate_item {
+       struct xfs_log_item     ic_item;
+       struct xfs_icreate_log  ic_format;
+};
+
+extern kmem_zone_t *xfs_icreate_zone;  /* inode create item zone */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+                       xfs_agblock_t agbno, unsigned int count,
+                       unsigned int inode_size, xfs_agblock_t length,
+                       unsigned int generation);
+
+#endif /* XFS_ICREATE_ITEM_H */
index 7f7be5f98f52f743e04b4a915ef8466a62ec6237..9ecfe1e559fc61703228a7c015c989dc95a960cb 100644 (file)
@@ -1028,6 +1028,11 @@ xfs_dinode_calc_crc(
 
 /*
  * Read the disk inode attributes into the in-core inode structure.
+ *
+ * If we are initialising a new inode and we are not utilising the
+ * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core
+ * with a random generation number. If we are keeping inodes around, we need to
+ * read the inode cluster to get the existing generation number off disk.
  */
 int
 xfs_iread(
@@ -1047,6 +1052,22 @@ xfs_iread(
        if (error)
                return error;
 
+       /* shortcut IO on inode allocation if possible */
+       if ((iget_flags & XFS_IGET_CREATE) &&
+           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+               /* initialise the on-disk inode core */
+               memset(&ip->i_d, 0, sizeof(ip->i_d));
+               ip->i_d.di_magic = XFS_DINODE_MAGIC;
+               ip->i_d.di_gen = prandom_u32();
+               if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                       ip->i_d.di_version = 3;
+                       ip->i_d.di_ino = ip->i_ino;
+                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+               } else
+                       ip->i_d.di_version = 2;
+               return 0;
+       }
+
        /*
         * Get pointers to the on-disk inode and the buffer containing it.
         */
@@ -1133,17 +1154,16 @@ xfs_iread(
        xfs_buf_set_ref(bp, XFS_INO_REF);
 
        /*
-        * Use xfs_trans_brelse() to release the buffer containing the
-        * on-disk inode, because it was acquired with xfs_trans_read_buf()
-        * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+        * Use xfs_trans_brelse() to release the buffer containing the on-disk
+        * inode, because it was acquired with xfs_trans_read_buf() in
+        * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
         * brelse().  If we're within a transaction, then xfs_trans_brelse()
         * will only release the buffer if it is not dirty within the
         * transaction.  It will be OK to release the buffer in this case,
-        * because inodes on disk are never destroyed and we will be
-        * locking the new in-core inode before putting it in the hash
-        * table where other processes can find it.  Thus we don't have
-        * to worry about the inode being changed just because we released
-        * the buffer.
+        * because inodes on disk are never destroyed and we will be locking the
+        * new in-core inode before putting it in the cache where other
+        * processes can find it.  Thus we don't have to worry about the inode
+        * being changed just because we released the buffer.
         */
  out_brelse:
        xfs_trans_brelse(tp, bp);
@@ -2028,8 +2048,6 @@ xfs_ifree(
        int                     error;
        int                     delete;
        xfs_ino_t               first_ino;
-       xfs_dinode_t            *dip;
-       xfs_buf_t               *ibp;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        ASSERT(ip->i_d.di_nlink == 0);
@@ -2042,14 +2060,13 @@ xfs_ifree(
         * Pull the on-disk inode from the AGI unlinked list.
         */
        error = xfs_iunlink_remove(tp, ip);
-       if (error != 0) {
+       if (error)
                return error;
-       }
 
        error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
-       if (error != 0) {
+       if (error)
                return error;
-       }
+
        ip->i_d.di_mode = 0;            /* mark incore inode as free */
        ip->i_d.di_flags = 0;
        ip->i_d.di_dmevmask = 0;
@@ -2061,31 +2078,10 @@ xfs_ifree(
         * by reincarnations of this inode.
         */
        ip->i_d.di_gen++;
-
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-       error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
-                              0, 0);
-       if (error)
-               return error;
-
-        /*
-       * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
-       * from picking up this inode when it is reclaimed (its incore state
-       * initialzed but not flushed to disk yet). The in-core di_mode is
-       * already cleared  and a corresponding transaction logged.
-       * The hack here just synchronizes the in-core to on-disk
-       * di_mode value in advance before the actual inode sync to disk.
-       * This is OK because the inode is already unlinked and would never
-       * change its di_mode again for this inode generation.
-       * This is a temporary hack that would require a proper fix
-       * in the future.
-       */
-       dip->di_mode = 0;
-
-       if (delete) {
+       if (delete)
                error = xfs_ifree_cluster(ip, tp, first_ino);
-       }
 
        return error;
 }
index 8f8aaee7f3791457c9ec43ea46927bc05f624f04..6a7096422295d1d821f1e0bab397041c42f53de1 100644 (file)
@@ -283,6 +283,15 @@ xfs_iomap_eof_want_preallocate(
        if (offset + count <= XFS_ISIZE(ip))
                return 0;
 
+       /*
+        * If the file is smaller than the minimum prealloc and we are using
+        * dynamic preallocation, don't do any preallocation at all as it is
+        * likely this is the only write to the file that is going to be done.
+        */
+       if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+           XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
+               return 0;
+
        /*
         * If there are any real blocks past eof, then don't
         * do any speculative allocation.
@@ -345,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size(
        if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
                return 0;
 
+       /* If the file is small, then use the minimum prealloc */
+       if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
+               return 0;
+
        /*
         * As we write multiple pages, the offset will always align to the
         * start of a page and hence point to a hole at EOF. i.e. if the size is
index ca9ecaa81112fac7706c4cac23c92f2326f0bba8..c69bbc493cb0c7d09cd5d3f8c271e0091228968a 100644 (file)
@@ -987,7 +987,8 @@ xfs_fiemap_format(
        if (bmv->bmv_oflags & BMV_OF_PREALLOC)
                fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
        else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
-               fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+               fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
+                                FIEMAP_EXTENT_UNKNOWN);
                physical = 0;   /* no block yet */
        }
        if (bmv->bmv_oflags & BMV_OF_LAST)
index 2ea7d402188db8596e4c04a231d29cf0cb506756..bc92c5306a170afd58ca55945bc9471287933699 100644 (file)
@@ -43,7 +43,7 @@ xfs_internal_inum(
 {
        return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
                (xfs_sb_version_hasquota(&mp->m_sb) &&
-                (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
+                xfs_is_quota_inode(&mp->m_sb, ino)));
 }
 
 /*
@@ -383,11 +383,13 @@ xfs_bulkstat(
                         * Also start read-ahead now for this chunk.
                         */
                        if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+                               struct blk_plug plug;
                                /*
                                 * Loop over all clusters in the next chunk.
                                 * Do a readahead if there are any allocated
                                 * inodes in that cluster.
                                 */
+                               blk_start_plug(&plug);
                                agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
                                for (chunkidx = 0;
                                     chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +401,7 @@ xfs_bulkstat(
                                                        agbno, nbcluster,
                                                        &xfs_inode_buf_ops);
                                }
+                               blk_finish_plug(&plug);
                                irbp->ir_startino = r.ir_startino;
                                irbp->ir_freecount = r.ir_freecount;
                                irbp->ir_free = r.ir_free;
index b345a7c85153dda82825050a8d1234ed4c73c541..d852a2b3e1fdfae0c4fb5bf18452ec79fd03ab01 100644 (file)
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
                headers++;
 
        for (lv = log_vector; lv; lv = lv->lv_next) {
+               /* we don't write ordered log vectors */
+               if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
+                       continue;
+
                headers += lv->lv_niovecs;
 
                for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
        index = 0;
        lv = log_vector;
        vecp = lv->lv_iovecp;
-       while (lv && index < lv->lv_niovecs) {
+       while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
                void            *ptr;
                int             log_offset;
 
@@ -2236,13 +2240,22 @@ xlog_write(
                 * This loop writes out as many regions as can fit in the amount
                 * of space which was allocated by xlog_state_get_iclog_space().
                 */
-               while (lv && index < lv->lv_niovecs) {
-                       struct xfs_log_iovec    *reg = &vecp[index];
+               while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+                       struct xfs_log_iovec    *reg;
                        struct xlog_op_header   *ophdr;
                        int                     start_rec_copy;
                        int                     copy_len;
                        int                     copy_off;
+                       bool                    ordered = false;
+
+                       /* ordered log vectors have no regions to write */
+                       if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+                               ASSERT(lv->lv_niovecs == 0);
+                               ordered = true;
+                               goto next_lv;
+                       }
 
+                       reg = &vecp[index];
                        ASSERT(reg->i_len % sizeof(__int32_t) == 0);
                        ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
 
@@ -2302,12 +2315,13 @@ xlog_write(
                                break;
 
                        if (++index == lv->lv_niovecs) {
+next_lv:
                                lv = lv->lv_next;
                                index = 0;
                                if (lv)
                                        vecp = lv->lv_iovecp;
                        }
-                       if (record_cnt == 0) {
+                       if (record_cnt == 0 && ordered == false) {
                                if (!lv)
                                        return 0;
                                break;
index 5caee96059dfb3a9fe5a1f03cd84c868674626b0..fb630e496c12406c558b7cc53854bf0cd123ccaf 100644 (file)
@@ -88,7 +88,8 @@ static inline xfs_lsn_t       _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XLOG_REG_TYPE_UNMOUNT          17
 #define XLOG_REG_TYPE_COMMIT           18
 #define XLOG_REG_TYPE_TRANSHDR         19
-#define XLOG_REG_TYPE_MAX              19
+#define XLOG_REG_TYPE_ICREATE          20
+#define XLOG_REG_TYPE_MAX              20
 
 typedef struct xfs_log_iovec {
        void            *i_addr;        /* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
        int                     lv_buf_len;     /* size of formatted buffer */
 };
 
+#define XFS_LOG_VEC_ORDERED    (-1)
+
 /*
  * Structure used to pass callback function and the function's argument
  * to the log manager.
index d0833b54e55d63ed83eb61cedfd110cd4a1263b2..02b9cf3f8252baeade5d4e99b3e88853a7b50b98 100644 (file)
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
                int     index;
                int     len = 0;
                uint    niovecs;
+               bool    ordered = false;
 
                /* Skip items which aren't dirty in this transaction. */
                if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
                if (!niovecs)
                        continue;
 
+               /*
+                * Ordered items need to be tracked but we do not wish to write
+                * them. We need a logvec to track the object, but we do not
+                * need an iovec or buffer to be allocated for copying data.
+                */
+               if (niovecs == XFS_LOG_VEC_ORDERED) {
+                       ordered = true;
+                       niovecs = 0;
+               }
+
                new_lv = kmem_zalloc(sizeof(*new_lv) +
                                niovecs * sizeof(struct xfs_log_iovec),
                                KM_SLEEP|KM_NOFS);
 
+               new_lv->lv_item = lidp->lid_item;
+               new_lv->lv_niovecs = niovecs;
+               if (ordered) {
+                       /* track as an ordered logvec */
+                       new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       goto next;
+               }
+
                /* The allocated iovec region lies beyond the log vector. */
                new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
-               new_lv->lv_niovecs = niovecs;
-               new_lv->lv_item = lidp->lid_item;
 
                /* build the vector array and calculate it's length */
                IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
                }
                ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
 
+next:
                if (!ret_lv)
                        ret_lv = new_lv;
                else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
 
        if (old) {
                /* existing lv on log item, space used is a delta */
-               ASSERT(!list_empty(&lv->lv_item->li_cil));
-               ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+               ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
+                       old->lv_buf_len == XFS_LOG_VEC_ORDERED);
+
+               /*
+                * If the new item is ordered, keep the old one that is already
+                * tracking dirty or ordered regions
+                */
+               if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+                       ASSERT(!lv->lv_buf);
+                       kmem_free(lv);
+                       return;
+               }
 
                *len += lv->lv_buf_len - old->lv_buf_len;
                *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
        } else {
                /* new lv, must pin the log item */
                ASSERT(!lv->lv_item->li_lv);
-               ASSERT(list_empty(&lv->lv_item->li_cil));
 
-               *len += lv->lv_buf_len;
-               *diff_iovecs += lv->lv_niovecs;
+               if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+                       *len += lv->lv_buf_len;
+                       *diff_iovecs += lv->lv_niovecs;
+               }
                IOP_PIN(lv->lv_item);
 
        }
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
         * We can do this safely because the context can't checkpoint until we
         * are done so it doesn't matter exactly how we update the CIL.
         */
-       for (lv = log_vector; lv; lv = lv->lv_next)
-               xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
-
-       /* account for space used by new iovec headers  */
-       len += diff_iovecs * sizeof(xlog_op_header_t);
-
        spin_lock(&cil->xc_cil_lock);
+       for (lv = log_vector; lv; ) {
+               struct xfs_log_vec *next = lv->lv_next;
 
-       /* move the items to the tail of the CIL */
-       for (lv = log_vector; lv; lv = lv->lv_next)
+               ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
+               lv->lv_next = NULL;
+
+               /*
+                * xfs_cil_prepare_item() may free the lv, so move the item on
+                * the CIL first.
+                */
                list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+               xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+               lv = next;
+       }
 
+       /* account for space used by new iovec headers  */
+       len += diff_iovecs * sizeof(xlog_op_header_t);
        ctx->nvecs += diff_iovecs;
 
        /*
@@ -381,9 +416,7 @@ xlog_cil_push(
        struct xfs_cil_ctx      *new_ctx;
        struct xlog_in_core     *commit_iclog;
        struct xlog_ticket      *tic;
-       int                     num_lv;
        int                     num_iovecs;
-       int                     len;
        int                     error = 0;
        struct xfs_trans_header thdr;
        struct xfs_log_iovec    lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
         * side which is currently locked out by the flush lock.
         */
        lv = NULL;
-       num_lv = 0;
        num_iovecs = 0;
-       len = 0;
        while (!list_empty(&cil->xc_cil)) {
                struct xfs_log_item     *item;
-               int                     i;
 
                item = list_first_entry(&cil->xc_cil,
                                        struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
                        lv->lv_next = item->li_lv;
                lv = item->li_lv;
                item->li_lv = NULL;
-
-               num_lv++;
                num_iovecs += lv->lv_niovecs;
-               for (i = 0; i < lv->lv_niovecs; i++)
-                       len += lv->lv_iovecp[i].i_len;
        }
 
        /*
@@ -701,6 +727,7 @@ xfs_log_commit_cil(
        if (commit_lsn)
                *commit_lsn = log->l_cilp->xc_ctx->sequence;
 
+       /* xlog_cil_insert_items() destroys log_vector list */
        xlog_cil_insert_items(log, log_vector, tp->t_ticket);
 
        /* check we didn't blow the reservation */
index 7cf5e4eafe28b1a05890d63e5c1d706d964eef19..6fcc910a50b9d7e3fbd3c15851aa0ee0cfc1b6af 100644 (file)
@@ -45,6 +45,7 @@
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_icreate_item.h"
 
 /* Need all the magic numbers and buffer ops structures from these headers */
 #include "xfs_symlink.h"
@@ -1617,7 +1618,10 @@ xlog_recover_add_to_trans(
  *        form the cancelled buffer table. Hence they have tobe done last.
  *
  *     3. Inode allocation buffers must be replayed before inode items that
- *        read the buffer and replay changes into it.
+ *        read the buffer and replay changes into it. For filesystems using the
+ *        ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ *        treated the same as inode allocation buffers as they create and
+ *        initialise the buffers directly.
  *
  *     4. Inode unlink buffers must be replayed after inode items are replayed.
  *        This ensures that inodes are completely flushed to the inode buffer
@@ -1632,10 +1636,17 @@ xlog_recover_add_to_trans(
  * from all the other buffers and move them to last.
  *
  * Hence, 4 lists, in order from head to tail:
- *     - buffer_list for all buffers except cancelled/inode unlink buffers
- *     - item_list for all non-buffer items
- *     - inode_buffer_list for inode unlink buffers
- *     - cancel_list for the cancelled buffers
+ *     - buffer_list for all buffers except cancelled/inode unlink buffers
+ *     - item_list for all non-buffer items
+ *     - inode_buffer_list for inode unlink buffers
+ *     - cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
  */
 STATIC int
 xlog_recover_reorder_trans(
@@ -1655,6 +1666,9 @@ xlog_recover_reorder_trans(
                xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
 
                switch (ITEM_TYPE(item)) {
+               case XFS_LI_ICREATE:
+                       list_move_tail(&item->ri_list, &buffer_list);
+                       break;
                case XFS_LI_BUF:
                        if (buf_f->blf_flags & XFS_BLF_CANCEL) {
                                trace_xfs_log_recover_item_reorder_head(log,
@@ -2981,6 +2995,93 @@ xlog_recover_efd_pass2(
        return 0;
 }
 
+/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log.  It's purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be intialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_do_icreate_pass2(
+       struct xlog             *log,
+       struct list_head        *buffer_list,
+       xlog_recover_item_t     *item)
+{
+       struct xfs_mount        *mp = log->l_mp;
+       struct xfs_icreate_log  *icl;
+       xfs_agnumber_t          agno;
+       xfs_agblock_t           agbno;
+       unsigned int            count;
+       unsigned int            isize;
+       xfs_agblock_t           length;
+
+       icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+       if (icl->icl_type != XFS_LI_ICREATE) {
+               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+               return EINVAL;
+       }
+
+       if (icl->icl_size != 1) {
+               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+               return EINVAL;
+       }
+
+       agno = be32_to_cpu(icl->icl_ag);
+       if (agno >= mp->m_sb.sb_agcount) {
+               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+               return EINVAL;
+       }
+       agbno = be32_to_cpu(icl->icl_agbno);
+       if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+               return EINVAL;
+       }
+       isize = be32_to_cpu(icl->icl_isize);
+       if (isize != mp->m_sb.sb_inodesize) {
+               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+               return EINVAL;
+       }
+       count = be32_to_cpu(icl->icl_count);
+       if (!count) {
+               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+               return EINVAL;
+       }
+       length = be32_to_cpu(icl->icl_length);
+       if (!length || length >= mp->m_sb.sb_agblocks) {
+               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+               return EINVAL;
+       }
+
+       /* existing allocation is fixed value */
+       ASSERT(count == XFS_IALLOC_INODES(mp));
+       ASSERT(length == XFS_IALLOC_BLOCKS(mp));
+       if (count != XFS_IALLOC_INODES(mp) ||
+            length != XFS_IALLOC_BLOCKS(mp)) {
+               xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+               return EINVAL;
+       }
+
+       /*
+        * Inode buffers can be freed. Do not replay the inode initialisation as
+        * we could be overwriting something written after this inode buffer was
+        * cancelled.
+        *
+        * XXX: we need to iterate all buffers and only init those that are not
+        * cancelled. I think that a more fine grained factoring of
+        * xfs_ialloc_inode_init may be appropriate here to enable this to be
+        * done easily.
+        */
+       if (xlog_check_buffer_cancelled(log,
+                       XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+               return 0;
+
+       xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+                                       be32_to_cpu(icl->icl_gen));
+       return 0;
+}
+
 /*
  * Free up any resources allocated by the transaction
  *
@@ -3023,6 +3124,7 @@ xlog_recover_commit_pass1(
        case XFS_LI_EFI:
        case XFS_LI_EFD:
        case XFS_LI_DQUOT:
+       case XFS_LI_ICREATE:
                /* nothing to do in pass 1 */
                return 0;
        default:
@@ -3053,6 +3155,8 @@ xlog_recover_commit_pass2(
                return xlog_recover_efd_pass2(log, item);
        case XFS_LI_DQUOT:
                return xlog_recover_dquot_pass2(log, buffer_list, item);
+       case XFS_LI_ICREATE:
+               return xlog_recover_do_icreate_pass2(log, buffer_list, item);
        case XFS_LI_QUOTAOFF:
                /* nothing to do in pass2 */
                return 0;
index e8e310c050977c51ecf1927c177c17cc0b17ef01..2b0ba358165619b87523315f1ca3940b4e2606c0 100644 (file)
@@ -336,6 +336,14 @@ xfs_mount_validate_sb(
                return XFS_ERROR(EWRONGFS);
        }
 
+       if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
+                       (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+                               XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
+               xfs_notice(mp,
+"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
        /*
         * Version 5 superblock feature mask validation. Reject combinations the
         * kernel cannot support up front before checking anything else. For
@@ -561,6 +569,18 @@ out_unwind:
        return error;
 }
 
+static void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+       if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+                                       XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+       if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+                                       XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+       sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+}
+
 void
 xfs_sb_from_disk(
        struct xfs_sb   *to,
@@ -622,6 +642,35 @@ xfs_sb_from_disk(
        to->sb_lsn = be64_to_cpu(from->sb_lsn);
 }
 
+static inline void
+xfs_sb_quota_to_disk(
+       xfs_dsb_t       *to,
+       xfs_sb_t        *from,
+       __int64_t       *fields)
+{
+       __uint16_t      qflags = from->sb_qflags;
+
+       if (*fields & XFS_SB_QFLAGS) {
+               /*
+                * The in-core version of sb_qflags do not have
+                * XFS_OQUOTA_* flags, whereas the on-disk version
+                * does.  So, convert incore XFS_{PG}QUOTA_* flags
+                * to on-disk XFS_OQUOTA_* flags.
+                */
+               qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+                               XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+               if (from->sb_qflags &
+                               (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+                       qflags |= XFS_OQUOTA_ENFD;
+               if (from->sb_qflags &
+                               (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+                       qflags |= XFS_OQUOTA_CHKD;
+               to->sb_qflags = cpu_to_be16(qflags);
+               *fields &= ~XFS_SB_QFLAGS;
+       }
+}
+
 /*
  * Copy in core superblock to ondisk one.
  *
@@ -643,6 +692,7 @@ xfs_sb_to_disk(
        if (!fields)
                return;
 
+       xfs_sb_quota_to_disk(to, from, &fields);
        while (fields) {
                f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
                first = xfs_sb_info[f].offset;
@@ -835,6 +885,7 @@ reread:
         */
        xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
 
+       xfs_sb_quota_from_disk(&mp->m_sb);
        /*
         * We must be able to do sector-sized and sector-aligned IO.
         */
@@ -987,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp)
                 */
                if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
                    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
-                       if (mp->m_flags & XFS_MOUNT_RETERR) {
-                               xfs_warn(mp, "alignment check failed: "
-                                        "(sunit/swidth vs. blocksize)");
-                               return XFS_ERROR(EINVAL);
-                       }
-                       mp->m_dalign = mp->m_swidth = 0;
+                       xfs_warn(mp,
+               "alignment check failed: sunit/swidth vs. blocksize(%d)",
+                               sbp->sb_blocksize);
+                       return XFS_ERROR(EINVAL);
                } else {
                        /*
                         * Convert the stripe unit and width to FSBs.
                         */
                        mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
                        if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
-                               if (mp->m_flags & XFS_MOUNT_RETERR) {
-                                       xfs_warn(mp, "alignment check failed: "
-                                                "(sunit/swidth vs. ag size)");
-                                       return XFS_ERROR(EINVAL);
-                               }
                                xfs_warn(mp,
-               "stripe alignment turned off: sunit(%d)/swidth(%d) "
-               "incompatible with agsize(%d)",
-                                       mp->m_dalign, mp->m_swidth,
-                                       sbp->sb_agblocks);
-
-                               mp->m_dalign = 0;
-                               mp->m_swidth = 0;
+                       "alignment check failed: sunit/swidth vs. agsize(%d)",
+                                        sbp->sb_agblocks);
+                               return XFS_ERROR(EINVAL);
                        } else if (mp->m_dalign) {
                                mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
                        } else {
-                               if (mp->m_flags & XFS_MOUNT_RETERR) {
-                                       xfs_warn(mp, "alignment check failed: "
-                                               "sunit(%d) less than bsize(%d)",
-                                               mp->m_dalign,
-                                               mp->m_blockmask +1);
-                                       return XFS_ERROR(EINVAL);
-                               }
-                               mp->m_swidth = 0;
+                               xfs_warn(mp,
+                       "alignment check failed: sunit(%d) less than bsize(%d)",
+                                        mp->m_dalign, sbp->sb_blocksize);
+                               return XFS_ERROR(EINVAL);
                        }
                }
 
@@ -1039,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp)
                                sbp->sb_width = mp->m_swidth;
                                mp->m_update_flags |= XFS_SB_WIDTH;
                        }
+               } else {
+                       xfs_warn(mp,
+       "cannot change alignment: superblock does not support data alignment");
+                       return XFS_ERROR(EINVAL);
                }
        } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
                    xfs_sb_version_hasdalign(&mp->m_sb)) {
index b004cecdfb042925821fea1810c945fc319b90d4..4e374d4a9189622bccb2b56aa331c7218567d1f2 100644 (file)
@@ -192,8 +192,6 @@ typedef struct xfs_mount {
        xfs_dablk_t             m_dirleafblk;   /* blockno of dir non-data v2 */
        xfs_dablk_t             m_dirfreeblk;   /* blockno of dirfreeindex v2 */
        uint                    m_chsize;       /* size of next field */
-       struct xfs_chash        *m_chash;       /* fs private inode per-cluster
-                                                * hash table */
        atomic_t                m_active_trans; /* number trans frozen */
 #ifdef HAVE_PERCPU_SB
        xfs_icsb_cnts_t __percpu *m_sb_cnts;    /* per-cpu superblock counters */
@@ -229,8 +227,6 @@ typedef struct xfs_mount {
                                                   operations, typically for
                                                   disk errors in metadata */
 #define XFS_MOUNT_DISCARD      (1ULL << 5)     /* discard unused blocks */
-#define XFS_MOUNT_RETERR       (1ULL << 6)     /* return alignment errors to
-                                                  user */
 #define XFS_MOUNT_NOALIGN      (1ULL << 7)     /* turn off stripe alignment
                                                   allocations */
 #define XFS_MOUNT_ATTR2                (1ULL << 8)     /* allow use of attr2 format */
index b75c9bb6e71e34b0c65158f63ba475a217d9e347..7a3e007b49f49ceab50aafc84b1a04239a954de8 100644 (file)
@@ -70,7 +70,7 @@ xfs_qm_dquot_walk(
        void                    *data)
 {
        struct xfs_quotainfo    *qi = mp->m_quotainfo;
-       struct radix_tree_root  *tree = XFS_DQUOT_TREE(qi, type);
+       struct radix_tree_root  *tree = xfs_dquot_tree(qi, type);
        uint32_t                next_index;
        int                     last_error = 0;
        int                     skipped;
@@ -189,7 +189,7 @@ xfs_qm_dqpurge(
        xfs_dqfunlock(dqp);
        xfs_dqunlock(dqp);
 
-       radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+       radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
                          be32_to_cpu(dqp->q_core.d_id));
        qi->qi_dquots--;
 
@@ -299,8 +299,10 @@ xfs_qm_mount_quotas(
         */
        if (!XFS_IS_UQUOTA_ON(mp))
                mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-       if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
-               mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+       if (!XFS_IS_GQUOTA_ON(mp))
+               mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+       if (!XFS_IS_PQUOTA_ON(mp))
+               mp->m_qflags &= ~XFS_PQUOTA_CHKD;
 
  write_changes:
        /*
@@ -489,8 +491,7 @@ xfs_qm_need_dqattach(
                return false;
        if (!XFS_NOT_DQATTACHED(mp, ip))
                return false;
-       if (ip->i_ino == mp->m_sb.sb_uquotino ||
-           ip->i_ino == mp->m_sb.sb_gquotino)
+       if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
                return false;
        return true;
 }
@@ -606,8 +607,7 @@ xfs_qm_dqdetach(
 
        trace_xfs_dquot_dqdetach(ip);
 
-       ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
-       ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+       ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
        if (ip->i_udquot) {
                xfs_qm_dqrele(ip->i_udquot);
                ip->i_udquot = NULL;
@@ -1152,7 +1152,7 @@ xfs_qm_dqusage_adjust(
         * rootino must have its resources accounted for, not so with the quota
         * inodes.
         */
-       if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+       if (xfs_is_quota_inode(&mp->m_sb, ino)) {
                *res = BULKSTAT_RV_NOTHING;
                return XFS_ERROR(EINVAL);
        }
@@ -1262,19 +1262,20 @@ int
 xfs_qm_quotacheck(
        xfs_mount_t     *mp)
 {
-       int             done, count, error, error2;
-       xfs_ino_t       lastino;
-       size_t          structsz;
-       xfs_inode_t     *uip, *gip;
-       uint            flags;
-       LIST_HEAD       (buffer_list);
+       int                     done, count, error, error2;
+       xfs_ino_t               lastino;
+       size_t                  structsz;
+       uint                    flags;
+       LIST_HEAD               (buffer_list);
+       struct xfs_inode        *uip = mp->m_quotainfo->qi_uquotaip;
+       struct xfs_inode        *gip = mp->m_quotainfo->qi_gquotaip;
 
        count = INT_MAX;
        structsz = 1;
        lastino = 0;
        flags = 0;
 
-       ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
+       ASSERT(uip || gip);
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
        xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1284,7 +1285,6 @@ xfs_qm_quotacheck(
         * their counters to zero. We need a clean slate.
         * We don't log our changes till later.
         */
-       uip = mp->m_quotainfo->qi_uquotaip;
        if (uip) {
                error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
                                         &buffer_list);
@@ -1293,14 +1293,14 @@ xfs_qm_quotacheck(
                flags |= XFS_UQUOTA_CHKD;
        }
 
-       gip = mp->m_quotainfo->qi_gquotaip;
        if (gip) {
                error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
                                         XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
                                         &buffer_list);
                if (error)
                        goto error_return;
-               flags |= XFS_OQUOTA_CHKD;
+               flags |= XFS_IS_GQUOTA_ON(mp) ?
+                                       XFS_GQUOTA_CHKD : XFS_PQUOTA_CHKD;
        }
 
        do {
@@ -1395,15 +1395,13 @@ STATIC int
 xfs_qm_init_quotainos(
        xfs_mount_t     *mp)
 {
-       xfs_inode_t     *uip, *gip;
-       int             error;
-       __int64_t       sbflags;
-       uint            flags;
+       struct xfs_inode        *uip = NULL;
+       struct xfs_inode        *gip = NULL;
+       int                     error;
+       __int64_t               sbflags = 0;
+       uint                    flags = 0;
 
        ASSERT(mp->m_quotainfo);
-       uip = gip = NULL;
-       sbflags = 0;
-       flags = 0;
 
        /*
         * Get the uquota and gquota inodes
@@ -1412,19 +1410,18 @@ xfs_qm_init_quotainos(
                if (XFS_IS_UQUOTA_ON(mp) &&
                    mp->m_sb.sb_uquotino != NULLFSINO) {
                        ASSERT(mp->m_sb.sb_uquotino > 0);
-                       if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-                                            0, 0, &uip)))
+                       error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+                                            0, 0, &uip);
+                       if (error)
                                return XFS_ERROR(error);
                }
                if (XFS_IS_OQUOTA_ON(mp) &&
                    mp->m_sb.sb_gquotino != NULLFSINO) {
                        ASSERT(mp->m_sb.sb_gquotino > 0);
-                       if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-                                            0, 0, &gip))) {
-                               if (uip)
-                                       IRELE(uip);
-                               return XFS_ERROR(error);
-                       }
+                       error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+                                            0, 0, &gip);
+                       if (error)
+                               goto error_rele;
                }
        } else {
                flags |= XFS_QMOPT_SBVERSION;
@@ -1439,10 +1436,11 @@ xfs_qm_init_quotainos(
         * temporarily switch to read-write to do this.
         */
        if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
-               if ((error = xfs_qm_qino_alloc(mp, &uip,
+               error = xfs_qm_qino_alloc(mp, &uip,
                                              sbflags | XFS_SB_UQUOTINO,
-                                             flags | XFS_QMOPT_UQUOTA)))
-                       return XFS_ERROR(error);
+                                             flags | XFS_QMOPT_UQUOTA);
+               if (error)
+                       goto error_rele;
 
                flags &= ~XFS_QMOPT_SBVERSION;
        }
@@ -1451,18 +1449,21 @@ xfs_qm_init_quotainos(
                                XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
                error = xfs_qm_qino_alloc(mp, &gip,
                                          sbflags | XFS_SB_GQUOTINO, flags);
-               if (error) {
-                       if (uip)
-                               IRELE(uip);
-
-                       return XFS_ERROR(error);
-               }
+               if (error)
+                       goto error_rele;
        }
 
        mp->m_quotainfo->qi_uquotaip = uip;
        mp->m_quotainfo->qi_gquotaip = gip;
 
        return 0;
+
+error_rele:
+       if (uip)
+               IRELE(uip);
+       if (gip)
+               IRELE(gip);
+       return XFS_ERROR(error);
 }
 
 STATIC void
@@ -1473,7 +1474,7 @@ xfs_qm_dqfree_one(
        struct xfs_quotainfo    *qi = mp->m_quotainfo;
 
        mutex_lock(&qi->qi_tree_lock);
-       radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+       radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
                          be32_to_cpu(dqp->q_core.d_id));
 
        qi->qi_dquots--;
@@ -1659,7 +1660,8 @@ xfs_qm_vop_dqalloc(
        struct xfs_dquot        **O_gdqpp)
 {
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_dquot        *uq, *gq;
+       struct xfs_dquot        *uq = NULL;
+       struct xfs_dquot        *gq = NULL;
        int                     error;
        uint                    lockflags;
 
@@ -1684,7 +1686,6 @@ xfs_qm_vop_dqalloc(
                }
        }
 
-       uq = gq = NULL;
        if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
                if (ip->i_d.di_uid != uid) {
                        /*
@@ -1697,11 +1698,12 @@ xfs_qm_vop_dqalloc(
                         * holding ilock.
                         */
                        xfs_iunlock(ip, lockflags);
-                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+                       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
                                                 XFS_DQ_USER,
                                                 XFS_QMOPT_DQALLOC |
                                                 XFS_QMOPT_DOWARN,
-                                                &uq))) {
+                                                &uq);
+                       if (error) {
                                ASSERT(error != ENOENT);
                                return error;
                        }
@@ -1723,15 +1725,14 @@ xfs_qm_vop_dqalloc(
        if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
                if (ip->i_d.di_gid != gid) {
                        xfs_iunlock(ip, lockflags);
-                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+                       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
                                                 XFS_DQ_GROUP,
                                                 XFS_QMOPT_DQALLOC |
                                                 XFS_QMOPT_DOWARN,
-                                                &gq))) {
-                               if (uq)
-                                       xfs_qm_dqrele(uq);
+                                                &gq);
+                       if (error) {
                                ASSERT(error != ENOENT);
-                               return error;
+                               goto error_rele;
                        }
                        xfs_dqunlock(gq);
                        lockflags = XFS_ILOCK_SHARED;
@@ -1743,15 +1744,14 @@ xfs_qm_vop_dqalloc(
        } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
                if (xfs_get_projid(ip) != prid) {
                        xfs_iunlock(ip, lockflags);
-                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+                       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
                                                 XFS_DQ_PROJ,
                                                 XFS_QMOPT_DQALLOC |
                                                 XFS_QMOPT_DOWARN,
-                                                &gq))) {
-                               if (uq)
-                                       xfs_qm_dqrele(uq);
+                                                &gq);
+                       if (error) {
                                ASSERT(error != ENOENT);
-                               return (error);
+                               goto error_rele;
                        }
                        xfs_dqunlock(gq);
                        lockflags = XFS_ILOCK_SHARED;
@@ -1774,6 +1774,11 @@ xfs_qm_vop_dqalloc(
        else if (gq)
                xfs_qm_dqrele(gq);
        return 0;
+
+error_rele:
+       if (uq)
+               xfs_qm_dqrele(uq);
+       return error;
 }
 
 /*
@@ -1821,29 +1826,31 @@ xfs_qm_vop_chown(
  */
 int
 xfs_qm_vop_chown_reserve(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       xfs_dquot_t     *udqp,
-       xfs_dquot_t     *gdqp,
-       uint            flags)
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       struct xfs_dquot        *udqp,
+       struct xfs_dquot        *gdqp,
+       uint                    flags)
 {
-       xfs_mount_t     *mp = ip->i_mount;
-       uint            delblks, blkflags, prjflags = 0;
-       xfs_dquot_t     *unresudq, *unresgdq, *delblksudq, *delblksgdq;
-       int             error;
+       struct xfs_mount        *mp = ip->i_mount;
+       uint                    delblks, blkflags, prjflags = 0;
+       struct xfs_dquot        *udq_unres = NULL;
+       struct xfs_dquot        *gdq_unres = NULL;
+       struct xfs_dquot        *udq_delblks = NULL;
+       struct xfs_dquot        *gdq_delblks = NULL;
+       int                     error;
 
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
        delblks = ip->i_delayed_blks;
-       delblksudq = delblksgdq = unresudq = unresgdq = NULL;
        blkflags = XFS_IS_REALTIME_INODE(ip) ?
                        XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
 
        if (XFS_IS_UQUOTA_ON(mp) && udqp &&
            ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
-               delblksudq = udqp;
+               udq_delblks = udqp;
                /*
                 * If there are delayed allocation blocks, then we have to
                 * unreserve those from the old dquot, and add them to the
@@ -1851,7 +1858,7 @@ xfs_qm_vop_chown_reserve(
                 */
                if (delblks) {
                        ASSERT(ip->i_udquot);
-                       unresudq = ip->i_udquot;
+                       udq_unres = ip->i_udquot;
                }
        }
        if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
@@ -1862,18 +1869,19 @@ xfs_qm_vop_chown_reserve(
                if (prjflags ||
                    (XFS_IS_GQUOTA_ON(ip->i_mount) &&
                     ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
-                       delblksgdq = gdqp;
+                       gdq_delblks = gdqp;
                        if (delblks) {
                                ASSERT(ip->i_gdquot);
-                               unresgdq = ip->i_gdquot;
+                               gdq_unres = ip->i_gdquot;
                        }
                }
        }
 
-       if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
-                               delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
-                               flags | blkflags | prjflags)))
-               return (error);
+       error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+                               udq_delblks, gdq_delblks, ip->i_d.di_nblocks, 1,
+                               flags | blkflags | prjflags);
+       if (error)
+               return error;
 
        /*
         * Do the delayed blks reservations/unreservations now. Since, these
@@ -1885,14 +1893,15 @@ xfs_qm_vop_chown_reserve(
                /*
                 * Do the reservations first. Unreservation can't fail.
                 */
-               ASSERT(delblksudq || delblksgdq);
-               ASSERT(unresudq || unresgdq);
-               if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
-                               delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
-                               flags | blkflags | prjflags)))
-                       return (error);
+               ASSERT(udq_delblks || gdq_delblks);
+               ASSERT(udq_unres || gdq_unres);
+               error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+                           udq_delblks, gdq_delblks, (xfs_qcnt_t)delblks, 0,
+                           flags | blkflags | prjflags);
+               if (error)
+                       return error;
                xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
-                               unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
+                               udq_unres, gdq_unres, -((xfs_qcnt_t)delblks), 0,
                                blkflags);
        }
 
index 5d16a6e6900f8756051877bac7ef18f2d666668a..bdb4f8b95207714cfa97294cd8d2c7a5298841ee 100644 (file)
@@ -69,30 +69,62 @@ typedef struct xfs_quotainfo {
        struct shrinker  qi_shrinker;
 } xfs_quotainfo_t;
 
-#define XFS_DQUOT_TREE(qi, type) \
-       ((type & XFS_DQ_USER) ? \
-        &((qi)->qi_uquota_tree) : \
-        &((qi)->qi_gquota_tree))
+static inline struct radix_tree_root *
+xfs_dquot_tree(
+       struct xfs_quotainfo    *qi,
+       int                     type)
+{
+       switch (type) {
+       case XFS_DQ_USER:
+               return &qi->qi_uquota_tree;
+       case XFS_DQ_GROUP:
+       case XFS_DQ_PROJ:
+               return &qi->qi_gquota_tree;
+       default:
+               ASSERT(0);
+       }
+       return NULL;
+}
 
+static inline struct xfs_inode *
+xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+{
+       switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+       case XFS_DQ_USER:
+               return dqp->q_mount->m_quotainfo->qi_uquotaip;
+       case XFS_DQ_GROUP:
+       case XFS_DQ_PROJ:
+               return dqp->q_mount->m_quotainfo->qi_gquotaip;
+       default:
+               ASSERT(0);
+       }
+       return NULL;
+}
 
 extern int     xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
                                             unsigned int nbblks);
-extern void    xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int     xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
-                       xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void    xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void    xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
+extern void    xfs_trans_mod_dquot(struct xfs_trans *,
+                                       struct xfs_dquot *, uint, long);
+extern int     xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+                       struct xfs_mount *, struct xfs_dquot *,
+                       struct xfs_dquot *, long, long, uint);
+extern void    xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
+extern void    xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
 
 /*
  * We keep the usr and grp dquots separately so that locking will be easier
  * to do at commit time. All transactions that we know of at this point
  * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
  */
+enum {
+       XFS_QM_TRANS_USR = 0,
+       XFS_QM_TRANS_GRP,
+       XFS_QM_TRANS_DQTYPES
+};
 #define XFS_QM_TRANS_MAXDQS            2
-typedef struct xfs_dquot_acct {
-       xfs_dqtrx_t     dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
-       xfs_dqtrx_t     dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
+struct xfs_dquot_acct {
+       struct xfs_dqtrx        dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
+};
 
 /*
  * Users are allowed to have a usage exceeding their softlimit for
@@ -106,22 +138,23 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_IWARNLIMIT      5
 #define XFS_QM_RTBWARNLIMIT    5
 
-extern void            xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int             xfs_qm_quotacheck(xfs_mount_t *);
-extern int             xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+extern void            xfs_qm_destroy_quotainfo(struct xfs_mount *);
+extern int             xfs_qm_quotacheck(struct xfs_mount *);
+extern int             xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
 
 /* dquot stuff */
-extern void            xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void            xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+extern void            xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void            xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
 
 /* quota ops */
-extern int             xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-extern int             xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
-                                       fs_disk_quota_t *);
+extern int             xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
+extern int             xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+                                       uint, struct fs_disk_quota *);
 extern int             xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
-                                       fs_disk_quota_t *);
-extern int             xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-extern int             xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-extern int             xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+                                       struct fs_disk_quota *);
+extern int             xfs_qm_scall_getqstat(struct xfs_mount *,
+                                       struct fs_quota_stat *);
+extern int             xfs_qm_scall_quotaon(struct xfs_mount *, uint);
+extern int             xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
 
 #endif /* __XFS_QM_H__ */
index 6cdf6ffc36a1d7f9fc967ec6c72de3621a58b47c..a08801ae24e22bf281c6abdeedb1488224d4cbb8 100644 (file)
@@ -117,11 +117,11 @@ xfs_qm_scall_quotaoff(
        }
        if (flags & XFS_GQUOTA_ACCT) {
                dqtype |= XFS_QMOPT_GQUOTA;
-               flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+               flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
                inactivate_flags |= XFS_GQUOTA_ACTIVE;
        } else if (flags & XFS_PQUOTA_ACCT) {
                dqtype |= XFS_QMOPT_PQUOTA;
-               flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+               flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
                inactivate_flags |= XFS_PQUOTA_ACTIVE;
        }
 
@@ -335,14 +335,14 @@ xfs_qm_scall_quotaon(
         * quota acct on ondisk without m_qflags' knowing.
         */
        if (((flags & XFS_UQUOTA_ACCT) == 0 &&
-           (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
-           (flags & XFS_UQUOTA_ENFD))
-           ||
+            (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+            (flags & XFS_UQUOTA_ENFD)) ||
+           ((flags & XFS_GQUOTA_ACCT) == 0 &&
+            (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+            (flags & XFS_GQUOTA_ENFD)) ||
            ((flags & XFS_PQUOTA_ACCT) == 0 &&
-           (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
-           (flags & XFS_GQUOTA_ACCT) == 0 &&
-           (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
-           (flags & XFS_OQUOTA_ENFD))) {
+            (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+            (flags & XFS_PQUOTA_ENFD))) {
                xfs_debug(mp,
                        "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
                        __func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +407,11 @@ xfs_qm_scall_getqstat(
        struct fs_quota_stat    *out)
 {
        struct xfs_quotainfo    *q = mp->m_quotainfo;
-       struct xfs_inode        *uip, *gip;
-       bool                    tempuqip, tempgqip;
+       struct xfs_inode        *uip = NULL;
+       struct xfs_inode        *gip = NULL;
+       bool                    tempuqip = false;
+       bool                    tempgqip = false;
 
-       uip = gip = NULL;
-       tempuqip = tempgqip = false;
        memset(out, 0, sizeof(fs_quota_stat_t));
 
        out->qs_version = FS_QSTAT_VERSION;
@@ -776,9 +776,12 @@ xfs_qm_scall_getquota(
         * gets turned off. No need to confuse the user level code,
         * so return zeroes in that case.
         */
-       if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
-           (!XFS_IS_OQUOTA_ENFORCED(mp) &&
-                       (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+       if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
+            dqp->q_core.d_flags == XFS_DQ_USER) ||
+           (!XFS_IS_GQUOTA_ENFORCED(mp) &&
+            dqp->q_core.d_flags == XFS_DQ_GROUP) ||
+           (!XFS_IS_PQUOTA_ENFORCED(mp) &&
+            dqp->q_core.d_flags == XFS_DQ_PROJ)) {
                dst->d_btimer = 0;
                dst->d_itimer = 0;
                dst->d_rtbtimer = 0;
@@ -786,8 +789,8 @@ xfs_qm_scall_getquota(
 
 #ifdef DEBUG
        if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
-            (XFS_IS_OQUOTA_ENFORCED(mp) &&
-                       (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
+            (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
+            (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
            dst->d_id != 0) {
                if ((dst->d_bcount > dst->d_blk_softlimit) &&
                    (dst->d_blk_softlimit > 0)) {
@@ -833,16 +836,16 @@ xfs_qm_export_flags(
        uflags = 0;
        if (flags & XFS_UQUOTA_ACCT)
                uflags |= FS_QUOTA_UDQ_ACCT;
-       if (flags & XFS_PQUOTA_ACCT)
-               uflags |= FS_QUOTA_PDQ_ACCT;
        if (flags & XFS_GQUOTA_ACCT)
                uflags |= FS_QUOTA_GDQ_ACCT;
+       if (flags & XFS_PQUOTA_ACCT)
+               uflags |= FS_QUOTA_PDQ_ACCT;
        if (flags & XFS_UQUOTA_ENFD)
                uflags |= FS_QUOTA_UDQ_ENFD;
-       if (flags & (XFS_OQUOTA_ENFD)) {
-               uflags |= (flags & XFS_GQUOTA_ACCT) ?
-                       FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
-       }
+       if (flags & XFS_GQUOTA_ENFD)
+               uflags |= FS_QUOTA_GDQ_ENFD;
+       if (flags & XFS_PQUOTA_ENFD)
+               uflags |= FS_QUOTA_PDQ_ENFD;
        return (uflags);
 }
 
index c38068f26c558d2b1f6c81ac902c4c53498cafbc..c3483bab9cde162f87e069af51f24c19ecbbb005 100644 (file)
@@ -160,31 +160,43 @@ typedef struct xfs_qoff_logformat {
 #define XFS_OQUOTA_CHKD        0x0020  /* quotacheck run on other (grp/prj) quotas */
 #define XFS_GQUOTA_ACCT        0x0040  /* group quota accounting ON */
 
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD        0x0080  /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD        0x0100  /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD        0x0200  /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD        0x0400  /* quotacheck run on project quotas */
+
 /*
  * Quota Accounting/Enforcement flags
  */
 #define XFS_ALL_QUOTA_ACCT     \
                (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD     (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD     (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
+#define XFS_ALL_QUOTA_ENFD     \
+               (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD     \
+               (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
 
 #define XFS_IS_QUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
 #define XFS_IS_UQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_UQUOTA_ACCT)
 #define XFS_IS_PQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_PQUOTA_ACCT)
 #define XFS_IS_GQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_GQUOTA_ACCT)
 #define XFS_IS_UQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_OQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_OQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_PQUOTA_ENFD)
 
 /*
  * Incore only flags for quotaoff - these bits get cleared when quota(s)
  * are in the process of getting turned off. These flags are in m_qflags but
  * never in sb_qflags.
  */
-#define XFS_UQUOTA_ACTIVE      0x0100  /* uquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE      0x0200  /* pquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE      0x0400  /* gquotas are being turned off */
+#define XFS_UQUOTA_ACTIVE      0x1000  /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE      0x2000  /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE      0x4000  /* pquotas are being turned off */
 #define XFS_ALL_QUOTA_ACTIVE   \
-       (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
+       (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
 
 /*
  * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -268,24 +280,23 @@ typedef struct xfs_qoff_logformat {
        ((XFS_IS_UQUOTA_ON(mp) && \
                (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
         (XFS_IS_GQUOTA_ON(mp) && \
-               ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
-                (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
+               (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
         (XFS_IS_PQUOTA_ON(mp) && \
-               ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
-                (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
+               (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
 
 #define XFS_MOUNT_QUOTA_SET1   (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-                                XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
-                                XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+                                XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD)
 
 #define XFS_MOUNT_QUOTA_SET2   (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
-                                XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+                                XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
+                                XFS_PQUOTA_ENFD|XFS_PQUOTA_CHKD)
 
 #define XFS_MOUNT_QUOTA_ALL    (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-                                XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
-                                XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
-                                XFS_GQUOTA_ACCT)
+                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+                                XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+                                XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+                                XFS_PQUOTA_CHKD)
 
 
 /*
index 71926d6305273426c8007c7ac1ae4092767a565c..20e30f93b0c7dab8b548527c46634c49486e02cd 100644 (file)
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
                flags |= XFS_GQUOTA_ACCT;
        if (uflags & FS_QUOTA_UDQ_ENFD)
                flags |= XFS_UQUOTA_ENFD;
-       if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
-               flags |= XFS_OQUOTA_ENFD;
+       if (uflags & FS_QUOTA_GDQ_ENFD)
+               flags |= XFS_GQUOTA_ENFD;
+       if (uflags & FS_QUOTA_PDQ_ENFD)
+               flags |= XFS_PQUOTA_ENFD;
 
        switch (op) {
        case Q_XQUOTAON:
index 2de58a85833c273d49baea1db168192f0c409120..78f9e70b80c7da8a64b92d528a9f7aa2ba49401e 100644 (file)
@@ -618,6 +618,12 @@ xfs_sb_has_incompat_log_feature(
        return (sbp->sb_features_log_incompat & feature) != 0;
 }
 
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+       return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
+}
+
 /*
  * end of superblock version macros
  */
index 3033ba5e9762f19609a7f86d5117f64eb70f7bd4..1d68ffcdeaa7f555ab77ee05f5c13571c6ee3b4d 100644 (file)
@@ -51,6 +51,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_icache.h"
 #include "xfs_trace.h"
+#include "xfs_icreate_item.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -359,17 +360,17 @@ xfs_parseargs(
                } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
                           !strcmp(this_char, MNTOPT_PRJQUOTA)) {
                        mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
-                                        XFS_OQUOTA_ENFD);
+                                        XFS_PQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
                        mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
-                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+                       mp->m_qflags &= ~XFS_PQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
                           !strcmp(this_char, MNTOPT_GRPQUOTA)) {
                        mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
-                                        XFS_OQUOTA_ENFD);
+                                        XFS_GQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
                        mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
-                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+                       mp->m_qflags &= ~XFS_GQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
                        xfs_warn(mp,
        "delaylog is the default now, option is deprecated.");
@@ -439,20 +440,15 @@ xfs_parseargs(
        }
 
 done:
-       if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+       if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
                /*
                 * At this point the superblock has not been read
                 * in, therefore we do not know the block size.
                 * Before the mount call ends we will convert
                 * these to FSBs.
                 */
-               if (dsunit) {
-                       mp->m_dalign = dsunit;
-                       mp->m_flags |= XFS_MOUNT_RETERR;
-               }
-
-               if (dswidth)
-                       mp->m_swidth = dswidth;
+               mp->m_dalign = dsunit;
+               mp->m_swidth = dswidth;
        }
 
        if (mp->m_logbufs != -1 &&
@@ -563,12 +559,12 @@ xfs_showargs(
        /* Either project or group quotas can be active, not both */
 
        if (mp->m_qflags & XFS_PQUOTA_ACCT) {
-               if (mp->m_qflags & XFS_OQUOTA_ENFD)
+               if (mp->m_qflags & XFS_PQUOTA_ENFD)
                        seq_puts(m, "," MNTOPT_PRJQUOTA);
                else
                        seq_puts(m, "," MNTOPT_PQUOTANOENF);
        } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
-               if (mp->m_qflags & XFS_OQUOTA_ENFD)
+               if (mp->m_qflags & XFS_GQUOTA_ENFD)
                        seq_puts(m, "," MNTOPT_GRPQUOTA);
                else
                        seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -1136,8 +1132,8 @@ xfs_fs_statfs(
        spin_unlock(&mp->m_sb_lock);
 
        if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-           ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
-                             (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+           ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+                             (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
                xfs_qm_statvfs(ip, statp);
        return 0;
 }
@@ -1481,6 +1477,10 @@ xfs_fs_fill_super(
        sb->s_time_gran = 1;
        set_posix_acl_flag(sb);
 
+       /* version 5 superblocks support inode version counters. */
+       if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+               sb->s_flags |= MS_I_VERSION;
+
        error = xfs_mountfs(mp);
        if (error)
                goto out_filestream_unmount;
@@ -1655,9 +1655,15 @@ xfs_init_zones(void)
                                        KM_ZONE_SPREAD, NULL);
        if (!xfs_ili_zone)
                goto out_destroy_inode_zone;
+       xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
+                                       "xfs_icr");
+       if (!xfs_icreate_zone)
+               goto out_destroy_ili_zone;
 
        return 0;
 
+ out_destroy_ili_zone:
+       kmem_zone_destroy(xfs_ili_zone);
  out_destroy_inode_zone:
        kmem_zone_destroy(xfs_inode_zone);
  out_destroy_efi_zone:
@@ -1696,6 +1702,7 @@ xfs_destroy_zones(void)
         * destroy caches.
         */
        rcu_barrier();
+       kmem_zone_destroy(xfs_icreate_zone);
        kmem_zone_destroy(xfs_ili_zone);
        kmem_zone_destroy(xfs_inode_zone);
        kmem_zone_destroy(xfs_efi_zone);
index 195a403e1522bbed00cb0f843b1d53c885db0e0d..e830fb56e27f9fa0a979dee9f3f58b56e6afa6ae 100644 (file)
@@ -358,7 +358,8 @@ xfs_symlink(
        int                     n;
        xfs_buf_t               *bp;
        prid_t                  prid;
-       struct xfs_dquot        *udqp, *gdqp;
+       struct xfs_dquot        *udqp = NULL;
+       struct xfs_dquot        *gdqp = NULL;
        uint                    resblks;
 
        *ipp = NULL;
@@ -585,7 +586,7 @@ xfs_symlink(
 /*
  * Free a symlink that has blocks associated with it.
  */
-int
+STATIC int
 xfs_inactive_symlink_rmt(
        xfs_inode_t     *ip,
        xfs_trans_t     **tpp)
@@ -606,7 +607,7 @@ xfs_inactive_symlink_rmt(
 
        tp = *tpp;
        mp = ip->i_mount;
-       ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
+       ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
        /*
         * We're freeing a symlink that has some
         * blocks allocated to it.  Free the
@@ -720,3 +721,47 @@ xfs_inactive_symlink_rmt(
  error0:
        return error;
 }
+
+/*
+ * xfs_inactive_symlink - free a symlink
+ */
+int
+xfs_inactive_symlink(
+       struct xfs_inode        *ip,
+       struct xfs_trans        **tp)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     pathlen;
+
+       trace_xfs_inactive_symlink(ip);
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       /*
+        * Zero length symlinks _can_ exist.
+        */
+       pathlen = (int)ip->i_d.di_size;
+       if (!pathlen)
+               return 0;
+
+       if (pathlen < 0 || pathlen > MAXPATHLEN) {
+               xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
+                        __func__, (unsigned long long)ip->i_ino, pathlen);
+               ASSERT(0);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       if (ip->i_df.if_flags & XFS_IFINLINE) {
+               if (ip->i_df.if_bytes > 0)
+                       xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
+                                         XFS_DATA_FORK);
+               ASSERT(ip->i_df.if_bytes == 0);
+               return 0;
+       }
+
+       /* remove the remote symlink */
+       return xfs_inactive_symlink_rmt(ip, tp);
+}
index b39398d2097cd4ac16e608523630e280fd4d455f..374394880c01e4d8db8dc36d6468e748295c6f29 100644 (file)
@@ -60,7 +60,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
                const char *target_path, umode_t mode, struct xfs_inode **ipp);
 int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp);
+int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
 
 #endif /* __KERNEL__ */
 #endif /* __XFS_SYMLINK_H */
index 2801b5ce6cdb61d9ba03a1b518561a64924a7068..1743b9f8e23d1582ddbd30372a10d5559cb52b1e 100644 (file)
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
 #ifdef CONFIG_PROC_FS
 STATIC int
 xfs_stats_clear_proc_handler(
-       ctl_table       *ctl,
-       int             write,
-       void            __user *buffer,
-       size_t          *lenp,
-       loff_t          *ppos)
+       struct ctl_table        *ctl,
+       int                     write,
+       void                    __user *buffer,
+       size_t                  *lenp,
+       loff_t                  *ppos)
 {
        int             c, ret, *valp = ctl->data;
        __uint32_t      vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
 
 STATIC int
 xfs_panic_mask_proc_handler(
-       ctl_table       *ctl,
-       int             write,
-       void            __user *buffer,
-       size_t          *lenp,
-       loff_t          *ppos)
+       struct ctl_table        *ctl,
+       int                     write,
+       void                    __user *buffer,
+       size_t                  *lenp,
+       loff_t                  *ppos)
 {
        int             ret, *valp = ctl->data;
 
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
 }
 #endif /* CONFIG_PROC_FS */
 
-static ctl_table xfs_table[] = {
+static struct ctl_table xfs_table[] = {
        {
                .procname       = "irix_sgid_inherit",
                .data           = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
        {}
 };
 
-static ctl_table xfs_dir_table[] = {
+static struct ctl_table xfs_dir_table[] = {
        {
                .procname       = "xfs",
                .mode           = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
        {}
 };
 
-static ctl_table xfs_root_table[] = {
+static struct ctl_table xfs_root_table[] = {
        {
                .procname       = "fs",
                .mode           = 0555,
index a04701de6bbd2cfad8fa844a38bdf66c20b744db..47910e638c187fdd46470ca2c4ca975f2d3c1334 100644 (file)
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
        TP_PROTO(struct xfs_buf_log_item *bip), \
        TP_ARGS(bip))
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
 
 DECLARE_EVENT_CLASS(xfs_lock_class,
        TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
 DEFINE_INODE_EVENT(xfs_getattr);
 DEFINE_INODE_EVENT(xfs_setattr);
 DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_inactive_symlink);
 DEFINE_INODE_EVENT(xfs_alloc_file_space);
 DEFINE_INODE_EVENT(xfs_free_file_space);
 DEFINE_INODE_EVENT(xfs_readdir);
index 2fd7c1ff1d21dd684e3409f98b35369c81a3e56b..35a229981354159add4b143aea86682c22966a7f 100644 (file)
@@ -234,71 +234,93 @@ xfs_calc_remove_reservation(
 }
 
 /*
- * For symlink we can modify:
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
  *    the parent directory inode: inode size
  *    the new inode: inode size
- *    the inode btree entry: 1 block
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
  *    the directory btree: (max depth + v2) * dir block size
  *    the directory inode's bmap btree: (max depth + v2) * block size
- *    the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               (uint)XFS_FSB_TO_B(mp, 1) +
+               xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * For create we can allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
  *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
  *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
  */
 STATIC uint
-xfs_calc_symlink_reservation(
+xfs_calc_create_resv_alloc(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               mp->m_sb.sb_sectsize +
+               xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
        struct xfs_mount        *mp)
 {
        return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(1, 1024)),
-                   (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(mp->m_in_maxlevels,
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                     XFS_FSB_TO_B(mp, 1))));
+               MAX(xfs_calc_create_resv_alloc(mp),
+                   xfs_calc_create_resv_modify(mp));
 }
 
 /*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
+ * For icreate we can allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
  *    the inode btree: max depth * blocksize
  *    the allocation btrees: 2 trees * (max depth - 1) * block size
  */
 STATIC uint
-xfs_calc_create_reservation(
+xfs_calc_icreate_resv_alloc(
        struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               mp->m_sb.sb_sectsize +
+               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
 {
        return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-                    (uint)XFS_FSB_TO_B(mp, 1) +
-                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-                    mp->m_sb.sb_sectsize +
-                    xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(mp->m_in_maxlevels,
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                     XFS_FSB_TO_B(mp, 1))));
+               MAX(xfs_calc_icreate_resv_alloc(mp),
+                   xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+       struct xfs_mount        *mp)
+{
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               return xfs_calc_icreate_reservation(mp);
+       return __xfs_calc_create_reservation(mp);
+
 }
 
 /*
@@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation(
        return xfs_calc_create_reservation(mp);
 }
 
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_create_reservation(mp) +
+              xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
 /*
  * In freeing an inode we can modify:
  *    the inode being freed: inode size
index a44dba5b2cdb1665d5e70e3dc59477632730a8f5..2b4946393e30f56655e55c782813778760846f79 100644 (file)
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
 #define        XFS_LI_BUF              0x123c  /* v2 bufs, variable sized inode bufs */
 #define        XFS_LI_DQUOT            0x123d
 #define        XFS_LI_QUOTAOFF         0x123e
+#define        XFS_LI_ICREATE          0x123f
 
 #define XFS_LI_TYPE_DESC \
        { XFS_LI_EFI,           "XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
 #define        XFS_TRANS_SWAPEXT               40
 #define        XFS_TRANS_SB_COUNT              41
 #define        XFS_TRANS_CHECKPOINT            42
-#define        XFS_TRANS_TYPE_MAX              42
+#define        XFS_TRANS_ICREATE               43
+#define        XFS_TRANS_TYPE_MAX              43
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
 /*
  * Per-extent log reservation for the allocation btree changes
  * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
+ * 2 trees * (2 blocks/level * max depth - 1)
  */
-#define        XFS_ALLOCFREE_LOG_RES(mp,nx) \
-       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
 #define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
        ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
 
 /*
  * Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
+ * dir blocks: (1 btree block per level + data block + free block)
+ * bmap btree: (levels + 2) * max depth
  * v2 directory blocks can be fragmented below the dirblksize down to the fsb
  * size, so account for that in the DAENTER macros.
  */
-#define        XFS_DIROP_LOG_RES(mp)   \
-       (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
-        (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
 #define        XFS_DIROP_LOG_COUNT(mp) \
        (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
         XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -503,6 +500,7 @@ void                xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
 void           xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
 void           xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void           xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void           xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
 void           xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void           xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 void           xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
index 73a5fa457e16ddd72c4a45631cda8a5ae05fdbf6..aa5a04b844d6d530b12edd39b3f99240aff625bc 100644 (file)
@@ -397,7 +397,6 @@ shutdown_abort:
        return XFS_ERROR(EIO);
 }
 
-
 /*
  * Release the buffer bp which was previously acquired with one of the
  * xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t      *tp,
 
        tp->t_flags |= XFS_TRANS_DIRTY;
        bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-       bip->bli_flags |= XFS_BLI_LOGGED;
-       xfs_buf_item_log(bip, first, last);
+
+       /*
+        * If we have an ordered buffer we are not logging any dirty range but
+        * it still needs to be marked dirty and that it has been logged.
+        */
+       bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+       if (!(bip->bli_flags & XFS_BLI_ORDERED))
+               xfs_buf_item_log(bip, first, last);
 }
 
 
@@ -756,6 +761,29 @@ xfs_trans_inode_alloc_buf(
        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
 }
 
+/*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it was. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp)
+{
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       ASSERT(bp->b_transp == tp);
+       ASSERT(bip != NULL);
+       ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+       bip->bli_flags |= XFS_BLI_ORDERED;
+       trace_xfs_buf_item_ordered(bip);
+}
+
 /*
  * Set the type of the buffer for log recovery so that it can correctly identify
  * and hence attach the correct buffer ops to the buffer after replay.
index fec75d0237031a5d030f2bbaa7057ecc0f28092a..3ba64d5401680ae749e36fdd424c6a139db0bdf9 100644 (file)
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
                return;
 
        xfs_trans_alloc_dqinfo(ntp);
-       oqa = otp->t_dqinfo->dqa_usrdquots;
-       nqa = ntp->t_dqinfo->dqa_usrdquots;
 
        /*
         * Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
        if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
                ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
 
-       for (j = 0; j < 2; j++) {
+       for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+               oqa = otp->t_dqinfo->dqs[j];
+               nqa = ntp->t_dqinfo->dqs[j];
                for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
                        if (oqa[i].qt_dquot == NULL)
                                break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
                        oq->qt_ino_res = oq->qt_ino_res_used;
 
                }
-               oqa = otp->t_dqinfo->dqa_grpdquots;
-               nqa = ntp->t_dqinfo->dqa_grpdquots;
        }
 }
 
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
 
        if (!XFS_IS_QUOTA_RUNNING(mp) ||
            !XFS_IS_QUOTA_ON(mp) ||
-           ip->i_ino == mp->m_sb.sb_uquotino ||
-           ip->i_ino == mp->m_sb.sb_gquotino)
+           xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
                return;
 
        if (tp->t_dqinfo == NULL)
@@ -170,16 +167,18 @@ xfs_trans_mod_dquot_byino(
                (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
 }
 
-STATIC xfs_dqtrx_t *
+STATIC struct xfs_dqtrx *
 xfs_trans_get_dqtrx(
-       xfs_trans_t     *tp,
-       xfs_dquot_t     *dqp)
+       struct xfs_trans        *tp,
+       struct xfs_dquot        *dqp)
 {
-       int             i;
-       xfs_dqtrx_t     *qa;
+       int                     i;
+       struct xfs_dqtrx        *qa;
 
-       qa = XFS_QM_ISUDQ(dqp) ?
-               tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+       if (XFS_QM_ISUDQ(dqp))
+               qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+       else
+               qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
 
        for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
                if (qa[i].qt_dquot == NULL ||
@@ -339,12 +338,10 @@ xfs_trans_apply_dquot_deltas(
                return;
 
        ASSERT(tp->t_dqinfo);
-       qa = tp->t_dqinfo->dqa_usrdquots;
-       for (j = 0; j < 2; j++) {
-               if (qa[0].qt_dquot == NULL) {
-                       qa = tp->t_dqinfo->dqa_grpdquots;
+       for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+               qa = tp->t_dqinfo->dqs[j];
+               if (qa[0].qt_dquot == NULL)
                        continue;
-               }
 
                /*
                 * Lock all of the dquots and join them to the transaction.
@@ -495,10 +492,6 @@ xfs_trans_apply_dquot_deltas(
                        ASSERT(dqp->q_res_rtbcount >=
                                be64_to_cpu(dqp->q_core.d_rtbcount));
                }
-               /*
-                * Do the group quotas next
-                */
-               qa = tp->t_dqinfo->dqa_grpdquots;
        }
 }
 
@@ -521,9 +514,9 @@ xfs_trans_unreserve_and_mod_dquots(
        if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
                return;
 
-       qa = tp->t_dqinfo->dqa_usrdquots;
+       for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+               qa = tp->t_dqinfo->dqs[j];
 
-       for (j = 0; j < 2; j++) {
                for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
                        qtrx = &qa[i];
                        /*
@@ -565,7 +558,6 @@ xfs_trans_unreserve_and_mod_dquots(
                                xfs_dqunlock(dqp);
 
                }
-               qa = tp->t_dqinfo->dqa_grpdquots;
        }
 }
 
@@ -640,8 +632,8 @@ xfs_trans_dqresv(
        if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
            dqp->q_core.d_id &&
            ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-            (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
-             (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+            (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+            (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
                if (nblks > 0) {
                        /*
                         * dquot is locked already. See if we'd go over the
@@ -748,15 +740,15 @@ error_return:
  */
 int
 xfs_trans_reserve_quota_bydquots(
-       xfs_trans_t     *tp,
-       xfs_mount_t     *mp,
-       xfs_dquot_t     *udqp,
-       xfs_dquot_t     *gdqp,
-       long            nblks,
-       long            ninos,
-       uint            flags)
+       struct xfs_trans        *tp,
+       struct xfs_mount        *mp,
+       struct xfs_dquot        *udqp,
+       struct xfs_dquot        *gdqp,
+       long                    nblks,
+       long                    ninos,
+       uint                    flags)
 {
-       int             resvd = 0, error;
+       int             error;
 
        if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
                return 0;
@@ -771,28 +763,24 @@ xfs_trans_reserve_quota_bydquots(
                                        (flags & ~XFS_QMOPT_ENOSPC));
                if (error)
                        return error;
-               resvd = 1;
        }
 
        if (gdqp) {
                error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
-               if (error) {
-                       /*
-                        * can't do it, so backout previous reservation
-                        */
-                       if (resvd) {
-                               flags |= XFS_QMOPT_FORCE_RES;
-                               xfs_trans_dqresv(tp, mp, udqp,
-                                                -nblks, -ninos, flags);
-                       }
-                       return error;
-               }
+               if (error)
+                       goto unwind_usr;
        }
 
        /*
         * Didn't change anything critical, so, no need to log
         */
        return 0;
+
+unwind_usr:
+       flags |= XFS_QMOPT_FORCE_RES;
+       if (udqp)
+               xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+       return error;
 }
 
 
@@ -816,8 +804,7 @@ xfs_trans_reserve_quota_nblks(
        if (XFS_IS_PQUOTA_ON(mp))
                flags |= XFS_QMOPT_ENOSPC;
 
-       ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
-       ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+       ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
index ac6d567704dbef8beb6c2b74bc22250a2f5da77b..53dfe46f3680791a8eab2e72c1a92db3cb17c091 100644 (file)
@@ -112,6 +112,17 @@ xfs_trans_log_inode(
        ASSERT(ip->i_itemp != NULL);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
+       /*
+        * First time we log the inode in a transaction, bump the inode change
+        * counter if it is configured for this to occur.
+        */
+       if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+           IS_I_VERSION(VFS_I(ip))) {
+               inode_inc_iversion(VFS_I(ip));
+               ip->i_d.di_changecount = VFS_I(ip)->i_version;
+               flags |= XFS_ILOG_CORE;
+       }
+
        tp->t_flags |= XFS_TRANS_DIRTY;
        ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
index 0176bb21f09a5c3c0795f6f6932e07f92643c94a..42c0ef288aeb219290ad344bc426430b17a76f52 100644 (file)
@@ -322,18 +322,9 @@ xfs_inactive(
        xfs_trans_ijoin(tp, ip, 0);
 
        if (S_ISLNK(ip->i_d.di_mode)) {
-               /*
-                * Zero length symlinks _can_ exist.
-                */
-               if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
-                       error = xfs_inactive_symlink_rmt(ip, &tp);
-                       if (error)
-                               goto out_cancel;
-               } else if (ip->i_df.if_bytes > 0) {
-                       xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
-                                         XFS_DATA_FORK);
-                       ASSERT(ip->i_df.if_bytes == 0);
-               }
+               error = xfs_inactive_symlink(ip, &tp);
+               if (error)
+                       goto out_cancel;
        } else if (truncate) {
                ip->i_d.di_size = 0;
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
index 379f71508995f1bd65a740c2c558792ca9061ff0..0442c3d800f0f9c8a9268a93715cfe2bc18c639b 100644 (file)
@@ -160,11 +160,6 @@ static inline void ceph_decode_timespec(struct timespec *ts,
 static inline void ceph_encode_timespec(struct ceph_timespec *tv,
                                        const struct timespec *ts)
 {
-       BUG_ON(ts->tv_sec < 0);
-       BUG_ON(ts->tv_sec > (__kernel_time_t)U32_MAX);
-       BUG_ON(ts->tv_nsec < 0);
-       BUG_ON(ts->tv_nsec > (long)U32_MAX);
-
        tv->tv_sec = cpu_to_le32((u32)ts->tv_sec);
        tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec);
 }
index 186db0bf4951b6c537e5bd477d86bf8c7825dc1a..ce6df39f60ff6f966144855ca0e1bdb04c312903 100644 (file)
@@ -145,7 +145,6 @@ struct ceph_osd_request {
        s32               r_reply_op_result[CEPH_OSD_MAX_OP];
        int               r_got_reply;
        int               r_linger;
-       int               r_completed;
 
        struct ceph_osd_client *r_osdc;
        struct kref       r_kref;
index f42dbe1454793870dba2d4afd456f51bc2c8b2a5..3092df3614ae4651a867571ccbaa1d670e97c314 100644 (file)
@@ -324,6 +324,11 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
        return ret;
 }
 
+static inline unsigned d_count(struct dentry *dentry)
+{
+       return dentry->d_count;
+}
+
 /* validate "insecure" dentry pointer */
 extern int d_validate(struct dentry *, struct dentry *);
 
index cb771ecc2362353572d1ae8a662935bd484e7861..a35b10e9a68071e553098899a589cd2bdf58a710 100644 (file)
@@ -954,6 +954,7 @@ struct file_lock {
        unsigned int fl_flags;
        unsigned char fl_type;
        unsigned int fl_pid;
+       int fl_link_cpu;                /* what cpu's list is this on? */
        struct pid *fl_nspid;
        wait_queue_head_t fl_wait;
        struct file *fl_file;
index 7b8fc73810ad83baca0e61e8c843ed4cf75d9d98..e36dee52f224f146805b58a48008012dc68602a8 100644 (file)
@@ -32,6 +32,15 @@ struct nfs4_acl {
        struct nfs4_ace aces[0];
 };
 
+#define NFS4_MAXLABELLEN       2048
+
+struct nfs4_label {
+       uint32_t        lfs;
+       uint32_t        pi;
+       u32             len;
+       char    *label;
+};
+
 typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
 
 struct nfs_stateid4 {
@@ -219,6 +228,14 @@ enum nfsstat4 {
        NFS4ERR_REJECT_DELEG    = 10085,        /* on callback */
        NFS4ERR_RETURNCONFLICT  = 10086,        /* outstanding layoutreturn */
        NFS4ERR_DELEG_REVOKED   = 10087,        /* deleg./layout revoked */
+
+       /* nfs42 */
+       NFS4ERR_PARTNER_NOTSUPP = 10088,
+       NFS4ERR_PARTNER_NO_AUTH = 10089,
+       NFS4ERR_METADATA_NOTSUPP = 10090,
+       NFS4ERR_OFFLOAD_DENIED = 10091,
+       NFS4ERR_WRONG_LFS = 10092,
+       NFS4ERR_BADLABEL = 10093,
 };
 
 static inline bool seqid_mutating_err(u32 err)
@@ -378,6 +395,7 @@ enum lock_type4 {
 #define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
+#define FATTR4_WORD2_SECURITY_LABEL     (1UL << 17)
 
 /* MDS threshold bitmap bits */
 #define THRESHOLD_RD                    (1UL << 0)
@@ -390,11 +408,15 @@ enum lock_type4 {
 #define NFS4_VERSION 4
 #define NFS4_MINOR_VERSION 0
 
+#if defined(CONFIG_NFS_V4_2)
+#define NFS4_MAX_MINOR_VERSION 2
+#else
 #if defined(CONFIG_NFS_V4_1)
 #define NFS4_MAX_MINOR_VERSION 1
 #else
 #define NFS4_MAX_MINOR_VERSION 0
 #endif /* CONFIG_NFS_V4_1 */
+#endif /* CONFIG_NFS_V4_2 */
 
 #define NFS4_DEBUG 1
 
index fc01d5cb4cf1e013afc41f42f9f9f9b6da6908b7..0b176297aaf6337e231785558b99ac93c5b6bac3 100644 (file)
@@ -207,6 +207,7 @@ struct nfs_inode {
 #define NFS_INO_INVALID_ACL    0x0010          /* cached acls are invalid */
 #define NFS_INO_REVAL_PAGECACHE        0x0020          /* must revalidate pagecache */
 #define NFS_INO_REVAL_FORCED   0x0040          /* force revalidation ignoring a delegation */
+#define NFS_INO_INVALID_LABEL  0x0080          /* cached label is invalid */
 
 /*
  * Bit offsets in flags field
@@ -336,7 +337,7 @@ extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping);
 extern void nfs_zap_caches(struct inode *);
 extern void nfs_invalidate_atime(struct inode *);
 extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
-                               struct nfs_fattr *);
+                               struct nfs_fattr *, struct nfs4_label *);
 extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
@@ -352,10 +353,13 @@ extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
+extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+                               struct nfs4_label *label);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode);
 extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode);
+extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx);
 extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
 extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
@@ -468,7 +472,8 @@ extern const struct file_operations nfs_dir_operations;
 extern const struct dentry_operations nfs_dentry_operations;
 
 extern void nfs_force_lookup_revalidate(struct inode *dir);
-extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr);
+extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh,
+                       struct nfs_fattr *fattr, struct nfs4_label *label);
 extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags);
 extern void nfs_access_zap_cache(struct inode *inode);
 
@@ -496,6 +501,24 @@ extern const struct inode_operations nfs_referral_inode_operations;
 extern int nfs_mountpoint_expiry_timeout;
 extern void nfs_release_automount_timer(void);
 
+/*
+ * linux/fs/nfs/nfs4proc.c
+ */
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
+static inline void nfs4_label_free(struct nfs4_label *label)
+{
+       if (label) {
+               kfree(label->label);
+               kfree(label);
+       }
+       return;
+}
+#else
+static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
+static inline void nfs4_label_free(void *label) {}
+#endif
+
 /*
  * linux/fs/nfs/unlink.c
  */
index 3b7fa2abecca690e0d007a4aa97fded3e1a1bad3..d2212432c456a3fb2b1cbe33311d6e789d518173 100644 (file)
@@ -146,7 +146,12 @@ struct nfs_server {
        u32                     attr_bitmask[3];/* V4 bitmask representing the set
                                                   of attributes supported on this
                                                   filesystem */
-       u32                     cache_consistency_bitmask[2];
+       u32                     attr_bitmask_nl[3];
+                                               /* V4 bitmask representing the
+                                                  set of attributes supported
+                                                  on this filesystem excluding
+                                                  the label support bit. */
+       u32                     cache_consistency_bitmask[3];
                                                /* V4 bitmask representing the subset
                                                   of change attribute, size, ctime
                                                   and mtime attributes supported by
@@ -200,5 +205,6 @@ struct nfs_server {
 #define NFS_CAP_UIDGID_NOMAP   (1U << 15)
 #define NFS_CAP_STATEID_NFSV41 (1U << 16)
 #define NFS_CAP_ATOMIC_OPEN_V1 (1U << 17)
+#define NFS_CAP_SECURITY_LABEL (1U << 18)
 
 #endif
index 104b62f23ee025a51d730228e061ea49ce4513e0..8651574a305bb44815e0c5e19826a3fbcbf7a37e 100644 (file)
@@ -101,6 +101,7 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR_MOUNTED_ON_FILEID (1U << 22)
 #define NFS_ATTR_FATTR_OWNER_NAME      (1U << 23)
 #define NFS_ATTR_FATTR_GROUP_NAME      (1U << 24)
+#define NFS_ATTR_FATTR_V4_SECURITY_LABEL (1U << 25)
 
 #define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \
                | NFS_ATTR_FATTR_MODE \
@@ -120,7 +121,8 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \
                | NFS_ATTR_FATTR_SPACE_USED)
 #define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \
-               | NFS_ATTR_FATTR_SPACE_USED)
+               | NFS_ATTR_FATTR_SPACE_USED \
+               | NFS_ATTR_FATTR_V4_SECURITY_LABEL)
 
 /*
  * Info on the file system
@@ -246,6 +248,7 @@ struct nfs4_layoutget_res {
 struct nfs4_layoutget {
        struct nfs4_layoutget_args args;
        struct nfs4_layoutget_res res;
+       struct rpc_cred *cred;
        gfp_t gfp_flags;
 };
 
@@ -347,6 +350,7 @@ struct nfs_openargs {
        const u32 *             open_bitmap;
        __u32                   claim;
        enum createmode4        createmode;
+       const struct nfs4_label *label;
 };
 
 struct nfs_openres {
@@ -356,6 +360,7 @@ struct nfs_openres {
        struct nfs4_change_info cinfo;
        __u32                   rflags;
        struct nfs_fattr *      f_attr;
+       struct nfs4_label       *f_label;
        struct nfs_seqid *      seqid;
        const struct nfs_server *server;
        fmode_t                 delegation_type;
@@ -598,6 +603,7 @@ struct nfs_entry {
        int                     eof;
        struct nfs_fh *         fh;
        struct nfs_fattr *      fattr;
+       struct nfs4_label  *label;
        unsigned char           d_type;
        struct nfs_server *     server;
 };
@@ -630,6 +636,7 @@ struct nfs_setattrargs {
        struct iattr *                  iap;
        const struct nfs_server *       server; /* Needed for name mapping */
        const u32 *                     bitmask;
+       const struct nfs4_label         *label;
 };
 
 struct nfs_setaclargs {
@@ -665,6 +672,7 @@ struct nfs_getaclres {
 struct nfs_setattrres {
        struct nfs4_sequence_res        seq_res;
        struct nfs_fattr *              fattr;
+       struct nfs4_label               *label;
        const struct nfs_server *       server;
 };
 
@@ -862,6 +870,7 @@ struct nfs4_create_arg {
        const struct iattr *            attrs;
        const struct nfs_fh *           dir_fh;
        const u32 *                     bitmask;
+       const struct nfs4_label         *label;
 };
 
 struct nfs4_create_res {
@@ -869,6 +878,7 @@ struct nfs4_create_res {
        const struct nfs_server *       server;
        struct nfs_fh *                 fh;
        struct nfs_fattr *              fattr;
+       struct nfs4_label               *label;
        struct nfs4_change_info         dir_cinfo;
 };
 
@@ -893,6 +903,7 @@ struct nfs4_getattr_res {
        struct nfs4_sequence_res        seq_res;
        const struct nfs_server *       server;
        struct nfs_fattr *              fattr;
+       struct nfs4_label               *label;
 };
 
 struct nfs4_link_arg {
@@ -907,6 +918,7 @@ struct nfs4_link_res {
        struct nfs4_sequence_res        seq_res;
        const struct nfs_server *       server;
        struct nfs_fattr *              fattr;
+       struct nfs4_label               *label;
        struct nfs4_change_info         cinfo;
        struct nfs_fattr *              dir_attr;
 };
@@ -924,6 +936,7 @@ struct nfs4_lookup_res {
        const struct nfs_server *       server;
        struct nfs_fattr *              fattr;
        struct nfs_fh *                 fh;
+       struct nfs4_label               *label;
 };
 
 struct nfs4_lookup_root_arg {
@@ -1366,11 +1379,12 @@ struct nfs_rpc_ops {
        struct dentry *(*try_mount) (int, const char *, struct nfs_mount_info *,
                                     struct nfs_subversion *);
        int     (*getattr) (struct nfs_server *, struct nfs_fh *,
-                           struct nfs_fattr *);
+                           struct nfs_fattr *, struct nfs4_label *);
        int     (*setattr) (struct dentry *, struct nfs_fattr *,
                            struct iattr *);
        int     (*lookup)  (struct inode *, struct qstr *,
-                           struct nfs_fh *, struct nfs_fattr *);
+                           struct nfs_fh *, struct nfs_fattr *,
+                           struct nfs4_label *);
        int     (*access)  (struct inode *, struct nfs_access_entry *);
        int     (*readlink)(struct inode *, struct page *, unsigned int,
                            unsigned int);
index 40560f41e3d540d7e4a7259faa7fa0e8a5176c57..7ce53ae1266bf04c0305f8b695dc240818a7d4ca 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/capability.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/string.h>
 
 struct linux_binprm;
 struct cred;
@@ -60,6 +61,9 @@ struct mm_struct;
 #define SECURITY_CAP_NOAUDIT 0
 #define SECURITY_CAP_AUDIT 1
 
+/* LSM Agnostic defines for sb_set_mnt_opts */
+#define SECURITY_LSM_NATIVE_LABELS     1
+
 struct ctl_table;
 struct audit_krule;
 struct user_namespace;
@@ -306,6 +310,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *     Parse a string of security data filling in the opts structure
  *     @options string containing all mount options known by the LSM
  *     @opts binary data structure usable by the LSM
+ * @dentry_init_security:
+ *     Compute a context for a dentry as the inode is not yet available
+ *     since NFSv4 has no label backed by an EA anyway.
+ *     @dentry dentry to use in calculating the context.
+ *     @mode mode used to determine resource type.
+ *     @name name of the last path component used to create file
+ *     @ctx pointer to place the pointer to the resulting context in.
+ *     @ctxlen point to place the length of the resulting context.
+ *
  *
  * Security hooks for inode operations.
  *
@@ -1313,6 +1326,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *     @pages contains the number of pages.
  *     Return 0 if permission is granted.
  *
+ * @ismaclabel:
+ *     Check if the extended attribute specified by @name
+ *     represents a MAC label. Returns 1 if name is a MAC
+ *     attribute otherwise returns 0.
+ *     @name full extended attribute name to check against
+ *     LSM as a MAC label.
+ *
  * @secid_to_secctx:
  *     Convert secid to security context.  If secdata is NULL the length of
  *     the result will be returned in seclen, but no secdata will be returned.
@@ -1440,10 +1460,16 @@ struct security_operations {
        int (*sb_pivotroot) (struct path *old_path,
                             struct path *new_path);
        int (*sb_set_mnt_opts) (struct super_block *sb,
-                               struct security_mnt_opts *opts);
+                               struct security_mnt_opts *opts,
+                               unsigned long kern_flags,
+                               unsigned long *set_kern_flags);
        int (*sb_clone_mnt_opts) (const struct super_block *oldsb,
                                   struct super_block *newsb);
        int (*sb_parse_opts_str) (char *options, struct security_mnt_opts *opts);
+       int (*dentry_init_security) (struct dentry *dentry, int mode,
+                                       struct qstr *name, void **ctx,
+                                       u32 *ctxlen);
+
 
 #ifdef CONFIG_SECURITY_PATH
        int (*path_unlink) (struct path *dir, struct dentry *dentry);
@@ -1591,6 +1617,7 @@ struct security_operations {
 
        int (*getprocattr) (struct task_struct *p, char *name, char **value);
        int (*setprocattr) (struct task_struct *p, char *name, void *value, size_t size);
+       int (*ismaclabel) (const char *name);
        int (*secid_to_secctx) (u32 secid, char **secdata, u32 *seclen);
        int (*secctx_to_secid) (const char *secdata, u32 seclen, u32 *secid);
        void (*release_secctx) (char *secdata, u32 seclen);
@@ -1726,10 +1753,16 @@ int security_sb_mount(const char *dev_name, struct path *path,
                      const char *type, unsigned long flags, void *data);
 int security_sb_umount(struct vfsmount *mnt, int flags);
 int security_sb_pivotroot(struct path *old_path, struct path *new_path);
-int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts);
+int security_sb_set_mnt_opts(struct super_block *sb,
+                               struct security_mnt_opts *opts,
+                               unsigned long kern_flags,
+                               unsigned long *set_kern_flags);
 int security_sb_clone_mnt_opts(const struct super_block *oldsb,
                                struct super_block *newsb);
 int security_sb_parse_opts_str(char *options, struct security_mnt_opts *opts);
+int security_dentry_init_security(struct dentry *dentry, int mode,
+                                       struct qstr *name, void **ctx,
+                                       u32 *ctxlen);
 
 int security_inode_alloc(struct inode *inode);
 void security_inode_free(struct inode *inode);
@@ -1841,6 +1874,7 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode);
 int security_getprocattr(struct task_struct *p, char *name, char **value);
 int security_setprocattr(struct task_struct *p, char *name, void *value, size_t size);
 int security_netlink_send(struct sock *sk, struct sk_buff *skb);
+int security_ismaclabel(const char *name);
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
 int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid);
 void security_release_secctx(char *secdata, u32 seclen);
@@ -2012,7 +2046,9 @@ static inline int security_sb_pivotroot(struct path *old_path,
 }
 
 static inline int security_sb_set_mnt_opts(struct super_block *sb,
-                                          struct security_mnt_opts *opts)
+                                          struct security_mnt_opts *opts,
+                                          unsigned long kern_flags,
+                                          unsigned long *set_kern_flags)
 {
        return 0;
 }
@@ -2036,6 +2072,16 @@ static inline int security_inode_alloc(struct inode *inode)
 static inline void security_inode_free(struct inode *inode)
 { }
 
+static inline int security_dentry_init_security(struct dentry *dentry,
+                                                int mode,
+                                                struct qstr *name,
+                                                void **ctx,
+                                                u32 *ctxlen)
+{
+       return -EOPNOTSUPP;
+}
+
+
 static inline int security_inode_init_security(struct inode *inode,
                                                struct inode *dir,
                                                const struct qstr *qstr,
@@ -2521,6 +2567,11 @@ static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb)
        return cap_netlink_send(sk, skb);
 }
 
+static inline int security_ismaclabel(const char *name)
+{
+       return 0;
+}
+
 static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
        return -EOPNOTSUPP;
index 2da29ac178fc25ccf01fca81552bf9f515e3c945..4e32edc8f506b89f13b0260200498b5cf87b490a 100644 (file)
@@ -173,4 +173,10 @@ extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
 extern struct hlist_node *seq_hlist_next_rcu(void *v,
                                                   struct hlist_head *head,
                                                   loff_t *ppos);
+
+/* Helpers for iterating over per-cpu hlist_head-s in seq_files */
+extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos);
+
+extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos);
+
 #endif
index 84ca436b76c21986436d1a569add79ab1702a1f1..6d870353674ac274a3f27e0f454749f7e5c18485 100644 (file)
@@ -88,15 +88,6 @@ struct rpc_task {
                                tk_rebind_retry : 2;
 };
 
-/* support walking a list of tasks on a wait queue */
-#define        task_for_each(task, pos, head) \
-       list_for_each(pos, head) \
-               if ((task=list_entry(pos, struct rpc_task, u.tk_wait.list)),1)
-
-#define        task_for_first(task, head) \
-       if (!list_empty(head) &&  \
-           ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1))
-
 typedef void                   (*rpc_action)(struct rpc_task *);
 
 struct rpc_call_ops {
@@ -238,7 +229,6 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
                                        bool (*)(struct rpc_task *, void *),
                                        void *);
 void           rpc_wake_up_status(struct rpc_wait_queue *, int);
-int            rpc_queue_empty(struct rpc_wait_queue *);
 void           rpc_delay(struct rpc_task *, unsigned long);
 void *         rpc_malloc(struct rpc_task *, size_t);
 void           rpc_free(void *);
@@ -259,16 +249,6 @@ static inline int rpc_wait_for_completion_task(struct rpc_task *task)
        return __rpc_wait_for_completion_task(task, NULL);
 }
 
-static inline void rpc_task_set_priority(struct rpc_task *task, unsigned char prio)
-{
-       task->tk_priority = prio - RPC_PRIORITY_LOW;
-}
-
-static inline int rpc_task_has_priority(struct rpc_task *task, unsigned char prio)
-{
-       return (task->tk_priority + RPC_PRIORITY_LOW == prio);
-}
-
 #if defined(RPC_DEBUG) || defined (RPC_TRACEPOINTS)
 static inline const char * rpc_qname(const struct rpc_wait_queue *q)
 {
index beeaed8398eca09745746e75171b05c54345ac44..a0666362c111ae1b597ebda14973826f29f33e75 100644 (file)
@@ -143,31 +143,9 @@ TRACE_EVENT(9p_protocol_dump,
                    __entry->tag    =  pdu->tag;
                    memcpy(__entry->line, pdu->sdata, P9_PROTO_DUMP_SZ);
                    ),
-           TP_printk("clnt %lu %s(tag = %d)\n%.3x: "
-                     "%02x %02x %02x %02x %02x %02x %02x %02x "
-                     "%02x %02x %02x %02x %02x %02x %02x %02x\n"
-                     "%.3x: "
-                     "%02x %02x %02x %02x %02x %02x %02x %02x "
-                     "%02x %02x %02x %02x %02x %02x %02x %02x\n",
-                     (long)__entry->clnt, show_9p_op(__entry->type),
-                     __entry->tag, 0,
-                     __entry->line[0],  __entry->line[1],
-                     __entry->line[2],  __entry->line[3],
-                     __entry->line[4],  __entry->line[5],
-                     __entry->line[6],  __entry->line[7],
-                     __entry->line[8],  __entry->line[9],
-                     __entry->line[10], __entry->line[11],
-                     __entry->line[12], __entry->line[13],
-                     __entry->line[14], __entry->line[15],
-                     16,
-                     __entry->line[16], __entry->line[17],
-                     __entry->line[18], __entry->line[19],
-                     __entry->line[20], __entry->line[21],
-                     __entry->line[22], __entry->line[23],
-                     __entry->line[24], __entry->line[25],
-                     __entry->line[26], __entry->line[27],
-                     __entry->line[28], __entry->line[29],
-                     __entry->line[30], __entry->line[31])
+           TP_printk("clnt %lu %s(tag = %d)\n%.3x: %16ph\n%.3x: %16ph\n",
+                     (unsigned long)__entry->clnt, show_9p_op(__entry->type),
+                     __entry->tag, 0, __entry->line, 16, __entry->line + 16)
  );
 
 #endif /* _TRACE_9P_H */
index ea546a4e9609aa0f9c9b56a6f307e06b10602fe6..2902657ba766bb7dc1a9949b1f0436816615484a 100644 (file)
@@ -40,22 +40,25 @@ struct extent_buffer;
                { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR" },      \
                { BTRFS_CSUM_TREE_OBJECTID,     "CSUM_TREE"     },      \
                { BTRFS_TREE_LOG_OBJECTID,      "TREE_LOG"      },      \
+               { BTRFS_QUOTA_TREE_OBJECTID,    "QUOTA_TREE"    },      \
                { BTRFS_TREE_RELOC_OBJECTID,    "TREE_RELOC"    },      \
                { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })
 
 #define show_root_type(obj)                                            \
        obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||                \
              (obj >= BTRFS_ROOT_TREE_OBJECTID &&                       \
-              obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-"
+              obj <= BTRFS_QUOTA_TREE_OBJECTID)) ? __show_root_type(obj) : "-"
 
 #define BTRFS_GROUP_FLAGS      \
-       { BTRFS_BLOCK_GROUP_DATA,       "DATA"}, \
-       { BTRFS_BLOCK_GROUP_SYSTEM,     "SYSTEM"}, \
-       { BTRFS_BLOCK_GROUP_METADATA,   "METADATA"}, \
-       { BTRFS_BLOCK_GROUP_RAID0,      "RAID0"}, \
-       { BTRFS_BLOCK_GROUP_RAID1,      "RAID1"}, \
-       { BTRFS_BLOCK_GROUP_DUP,        "DUP"}, \
-       { BTRFS_BLOCK_GROUP_RAID10,     "RAID10"}
+       { BTRFS_BLOCK_GROUP_DATA,       "DATA"},        \
+       { BTRFS_BLOCK_GROUP_SYSTEM,     "SYSTEM"},      \
+       { BTRFS_BLOCK_GROUP_METADATA,   "METADATA"},    \
+       { BTRFS_BLOCK_GROUP_RAID0,      "RAID0"},       \
+       { BTRFS_BLOCK_GROUP_RAID1,      "RAID1"},       \
+       { BTRFS_BLOCK_GROUP_DUP,        "DUP"},         \
+       { BTRFS_BLOCK_GROUP_RAID10,     "RAID10"},      \
+       { BTRFS_BLOCK_GROUP_RAID5,      "RAID5"},       \
+       { BTRFS_BLOCK_GROUP_RAID6,      "RAID6"}
 
 #define BTRFS_UUID_SIZE 16
 
@@ -154,7 +157,9 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
                { EXTENT_FLAG_PINNED,           "PINNED"        },      \
                { EXTENT_FLAG_COMPRESSED,       "COMPRESSED"    },      \
                { EXTENT_FLAG_VACANCY,          "VACANCY"       },      \
-               { EXTENT_FLAG_PREALLOC,         "PREALLOC"      })
+               { EXTENT_FLAG_PREALLOC,         "PREALLOC"      },      \
+               { EXTENT_FLAG_LOGGING,          "LOGGING"       },      \
+               { EXTENT_FLAG_FILLING,          "FILLING"       })
 
 TRACE_EVENT(btrfs_get_extent,
 
@@ -201,13 +206,17 @@ TRACE_EVENT(btrfs_get_extent,
 );
 
 #define show_ordered_flags(flags)                                      \
-       __print_symbolic(flags,                                 \
+       __print_symbolic(flags,                                         \
                { BTRFS_ORDERED_IO_DONE,        "IO_DONE"       },      \
                { BTRFS_ORDERED_COMPLETE,       "COMPLETE"      },      \
                { BTRFS_ORDERED_NOCOW,          "NOCOW"         },      \
                { BTRFS_ORDERED_COMPRESSED,     "COMPRESSED"    },      \
                { BTRFS_ORDERED_PREALLOC,       "PREALLOC"      },      \
-               { BTRFS_ORDERED_DIRECT,         "DIRECT"        })
+               { BTRFS_ORDERED_DIRECT,         "DIRECT"        },      \
+               { BTRFS_ORDERED_IOERR,          "IOERR"         },      \
+               { BTRFS_ORDERED_UPDATED_ISIZE,  "UPDATED_ISIZE" },      \
+               { BTRFS_ORDERED_LOGGED_CSUM,    "LOGGED_CSUM"   })
+
 
 DECLARE_EVENT_CLASS(btrfs__ordered_extent,
 
@@ -555,7 +564,9 @@ TRACE_EVENT(btrfs_delayed_ref_head,
                { BTRFS_BLOCK_GROUP_RAID0,      "RAID0" },      \
                { BTRFS_BLOCK_GROUP_RAID1,      "RAID1" },      \
                { BTRFS_BLOCK_GROUP_DUP,        "DUP"   },      \
-               { BTRFS_BLOCK_GROUP_RAID10,     "RAID10"})
+               { BTRFS_BLOCK_GROUP_RAID10,     "RAID10"},      \
+               { BTRFS_BLOCK_GROUP_RAID5,      "RAID5" },      \
+               { BTRFS_BLOCK_GROUP_RAID6,      "RAID6" })
 
 DECLARE_EVENT_CLASS(btrfs__chunk,
 
index 5ef0df545a2a17f137ad0d75d13e11f46a967cee..05aed70627e24392d87cb4bc350e16bf32ceca44 100644 (file)
@@ -447,6 +447,46 @@ struct btrfs_ioctl_send_args {
        __u64 reserved[4];              /* in */
 };
 
+/* Error codes as returned by the kernel */
+enum btrfs_err_code {
+       notused,
+       BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+       BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
+       BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
+       BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
+       BTRFS_ERROR_DEV_TGT_REPLACE,
+       BTRFS_ERROR_DEV_MISSING_NOT_FOUND,
+       BTRFS_ERROR_DEV_ONLY_WRITABLE,
+       BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS
+};
+/* An error code to error string mapping for the kernel
+*  error codes
+*/
+static inline char *btrfs_err_str(enum btrfs_err_code err_code)
+{
+       switch (err_code) {
+               case BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET:
+                       return "unable to go below two devices on raid1";
+               case BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET:
+                       return "unable to go below four devices on raid10";
+               case BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET:
+                       return "unable to go below two devices on raid5";
+               case BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET:
+                       return "unable to go below three devices on raid6";
+               case BTRFS_ERROR_DEV_TGT_REPLACE:
+                       return "unable to remove the dev_replace target dev";
+               case BTRFS_ERROR_DEV_MISSING_NOT_FOUND:
+                       return "no missing devices found to remove";
+               case BTRFS_ERROR_DEV_ONLY_WRITABLE:
+                       return "unable to remove the only writeable device";
+               case BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS:
+                       return "add/delete/balance/replace/resize operation "\
+                               "in progress";
+               default:
+                       return NULL;
+       }
+}
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
                                   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -530,6 +570,7 @@ struct btrfs_ioctl_send_args {
                               struct btrfs_ioctl_quota_rescan_args)
 #define BTRFS_IOC_QUOTA_RESCAN_STATUS _IOR(BTRFS_IOCTL_MAGIC, 45, \
                               struct btrfs_ioctl_quota_rescan_args)
+#define BTRFS_IOC_QUOTA_RESCAN_WAIT _IO(BTRFS_IOCTL_MAGIC, 46)
 #define BTRFS_IOC_GET_FSLABEL _IOR(BTRFS_IOCTL_MAGIC, 49, \
                                   char[BTRFS_LABEL_SIZE])
 #define BTRFS_IOC_SET_FSLABEL _IOW(BTRFS_IOCTL_MAGIC, 50, \
@@ -538,5 +579,4 @@ struct btrfs_ioctl_send_args {
                                      struct btrfs_ioctl_get_dev_stats)
 #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
                                    struct btrfs_ioctl_dev_replace_args)
-
 #endif /* _UAPI_LINUX_BTRFS_H */
index addc116cecf0be16210498b6544fa40ad24cc615..01f1779eba805c0e32192280cbd73be6500aab08 100644 (file)
@@ -127,7 +127,7 @@ static int parse_opts(char *opts, struct p9_client *clnt)
        char *s;
        int ret = 0;
 
-       clnt->proto_version = p9_proto_2000u;
+       clnt->proto_version = p9_proto_2000L;
        clnt->msize = 8192;
 
        if (!opts)
@@ -995,6 +995,9 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
        if (err < 0)
                goto destroy_tagpool;
 
+       if (!clnt->trans_mod)
+               clnt->trans_mod = v9fs_get_trans_by_name("virtio");
+
        if (!clnt->trans_mod)
                clnt->trans_mod = v9fs_get_default_trans();
 
index 925ca583c09c8eae2fbaebbd73603e9194132f97..8c93fa8d81bc45b2d4bf1e9e1fd1eb3b96e3de53 100644 (file)
@@ -39,6 +39,11 @@ static int should_authenticate(struct ceph_auth_client *ac)
        return xi->starting;
 }
 
+static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
+{
+       return 0;
+}
+
 /*
  * the generic auth code decode the global_id, and we carry no actual
  * authenticate state, so nothing happens here.
@@ -106,6 +111,7 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = {
        .destroy = destroy,
        .is_authenticated = is_authenticated,
        .should_authenticate = should_authenticate,
+       .build_request = build_request,
        .handle_reply = handle_reply,
        .create_authorizer = ceph_auth_none_create_authorizer,
        .destroy_authorizer = ceph_auth_none_destroy_authorizer,
index 3a246a6cab473496e58275cbb367485acbbda4d8..dd47889adc4aec94941d6f17105878ebe235db8f 100644 (file)
@@ -733,12 +733,14 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
        object_size = le32_to_cpu(layout->fl_object_size);
        object_base = off - objoff;
-       if (truncate_size <= object_base) {
-               truncate_size = 0;
-       } else {
-               truncate_size -= object_base;
-               if (truncate_size > object_size)
-                       truncate_size = object_size;
+       if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
+               if (truncate_size <= object_base) {
+                       truncate_size = 0;
+               } else {
+                       truncate_size -= object_base;
+                       if (truncate_size > object_size)
+                               truncate_size = object_size;
+               }
        }
 
        osd_req_op_extent_init(req, 0, opcode, objoff, objlen,
@@ -1174,6 +1176,7 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
                                    struct ceph_osd_request *req)
 {
        dout("__register_linger_request %p\n", req);
+       ceph_osdc_get_request(req);
        list_add_tail(&req->r_linger_item, &osdc->req_linger);
        if (req->r_osd)
                list_add_tail(&req->r_linger_osd,
@@ -1196,6 +1199,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
                if (list_empty(&req->r_osd_item))
                        req->r_osd = NULL;
        }
+       ceph_osdc_put_request(req);
 }
 
 void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
@@ -1203,9 +1207,8 @@ void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
 {
        mutex_lock(&osdc->request_mutex);
        if (req->r_linger) {
-               __unregister_linger_request(osdc, req);
                req->r_linger = 0;
-               ceph_osdc_put_request(req);
+               __unregister_linger_request(osdc, req);
        }
        mutex_unlock(&osdc->request_mutex);
 }
@@ -1217,11 +1220,6 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
        if (!req->r_linger) {
                dout("set_request_linger %p\n", req);
                req->r_linger = 1;
-               /*
-                * caller is now responsible for calling
-                * unregister_linger_request
-                */
-               ceph_osdc_get_request(req);
        }
 }
 EXPORT_SYMBOL(ceph_osdc_set_request_linger);
@@ -1339,10 +1337,6 @@ static void __send_request(struct ceph_osd_client *osdc,
 
        ceph_msg_get(req->r_request); /* send consumes a ref */
 
-       /* Mark the request unsafe if this is the first timet's being sent. */
-
-       if (!req->r_sent && req->r_unsafe_callback)
-               req->r_unsafe_callback(req, true);
        req->r_sent = req->r_osd->o_incarnation;
 
        ceph_con_send(&req->r_osd->o_con, req->r_request);
@@ -1433,8 +1427,6 @@ static void handle_osds_timeout(struct work_struct *work)
 
 static void complete_request(struct ceph_osd_request *req)
 {
-       if (req->r_unsafe_callback)
-               req->r_unsafe_callback(req, false);
        complete_all(&req->r_safe_completion);  /* fsync waiter */
 }
 
@@ -1526,6 +1518,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
        for (i = 0; i < numops; i++)
                req->r_reply_op_result[i] = ceph_decode_32(&p);
 
+       already_completed = req->r_got_reply;
+
        if (!req->r_got_reply) {
 
                req->r_result = result;
@@ -1556,19 +1550,23 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
            ((flags & CEPH_OSD_FLAG_WRITE) == 0))
                __unregister_request(osdc, req);
 
-       already_completed = req->r_completed;
-       req->r_completed = 1;
        mutex_unlock(&osdc->request_mutex);
-       if (already_completed)
-               goto done;
 
-       if (req->r_callback)
-               req->r_callback(req, msg);
-       else
-               complete_all(&req->r_completion);
+       if (!already_completed) {
+               if (req->r_unsafe_callback &&
+                   result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
+                       req->r_unsafe_callback(req, true);
+               if (req->r_callback)
+                       req->r_callback(req, msg);
+               else
+                       complete_all(&req->r_completion);
+       }
 
-       if (flags & CEPH_OSD_FLAG_ONDISK)
+       if (flags & CEPH_OSD_FLAG_ONDISK) {
+               if (req->r_unsafe_callback && already_completed)
+                       req->r_unsafe_callback(req, false);
                complete_request(req);
+       }
 
 done:
        dout("req=%p req->r_linger=%d\n", req, req->r_linger);
@@ -1633,8 +1631,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
                        dout("%p tid %llu restart on osd%d\n",
                             req, req->r_tid,
                             req->r_osd ? req->r_osd->o_osd : -1);
+                       ceph_osdc_get_request(req);
                        __unregister_request(osdc, req);
                        __register_linger_request(osdc, req);
+                       ceph_osdc_put_request(req);
                        continue;
                }
 
@@ -2123,7 +2123,6 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
        __register_request(osdc, req);
        req->r_sent = 0;
        req->r_got_reply = 0;
-       req->r_completed = 0;
        rc = __map_request(osdc, req, 0);
        if (rc < 0) {
                if (nofail) {
@@ -2456,8 +2455,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        ceph_msg_revoke_incoming(req->r_reply);
 
        if (front > req->r_reply->front.iov_len) {
-               pr_warning("get_reply front %d > preallocated %d\n",
-                          front, (int)req->r_reply->front.iov_len);
+               pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n",
+                          front, (int)req->r_reply->front.iov_len,
+                          (unsigned int)con->peer_name.type,
+                          le64_to_cpu(con->peer_name.num));
                m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false);
                if (!m)
                        goto out;
index 5a750b9c36404b34a3b41bd0e2d38628f881a5f2..f0339ae9bf37efe9bf30acecd3639ae092c27fd0 100644 (file)
@@ -157,20 +157,15 @@ static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
 }
 
 static int
-rpc_setup_pipedir(struct rpc_clnt *clnt, const char *dir_name)
+rpc_setup_pipedir(struct rpc_clnt *clnt, const char *dir_name,
+                 struct super_block *pipefs_sb)
 {
-       struct net *net = rpc_net_ns(clnt);
-       struct super_block *pipefs_sb;
        struct dentry *dentry;
 
        clnt->cl_dentry = NULL;
        if (dir_name == NULL)
                return 0;
-       pipefs_sb = rpc_get_sb_net(net);
-       if (!pipefs_sb)
-               return 0;
        dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt, dir_name);
-       rpc_put_sb_net(net);
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
        clnt->cl_dentry = dentry;
@@ -182,6 +177,8 @@ static inline int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event
        if (((event == RPC_PIPEFS_MOUNT) && clnt->cl_dentry) ||
            ((event == RPC_PIPEFS_UMOUNT) && !clnt->cl_dentry))
                return 1;
+       if ((event == RPC_PIPEFS_MOUNT) && atomic_read(&clnt->cl_count) == 0)
+               return 1;
        return 0;
 }
 
@@ -241,8 +238,6 @@ static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event)
                        continue;
                if (rpc_clnt_skip_event(clnt, event))
                        continue;
-               if (atomic_inc_not_zero(&clnt->cl_count) == 0)
-                       continue;
                spin_unlock(&sn->rpc_client_lock);
                return clnt;
        }
@@ -259,7 +254,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
 
        while ((clnt = rpc_get_client_for_event(sb->s_fs_info, event))) {
                error = __rpc_pipefs_event(clnt, event, sb);
-               rpc_release_client(clnt);
                if (error)
                        break;
        }
@@ -289,12 +283,46 @@ static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename)
        memcpy(clnt->cl_nodename, nodename, clnt->cl_nodelen);
 }
 
+static int rpc_client_register(const struct rpc_create_args *args,
+                              struct rpc_clnt *clnt)
+{
+       const struct rpc_program *program = args->program;
+       struct rpc_auth *auth;
+       struct net *net = rpc_net_ns(clnt);
+       struct super_block *pipefs_sb;
+       int err = 0;
+
+       pipefs_sb = rpc_get_sb_net(net);
+       if (pipefs_sb) {
+               err = rpc_setup_pipedir(clnt, program->pipe_dir_name, pipefs_sb);
+               if (err)
+                       goto out;
+       }
+
+       auth = rpcauth_create(args->authflavor, clnt);
+       if (IS_ERR(auth)) {
+               dprintk("RPC:       Couldn't create auth handle (flavor %u)\n",
+                               args->authflavor);
+               err = PTR_ERR(auth);
+               goto err_auth;
+       }
+
+       rpc_register_client(clnt);
+out:
+       if (pipefs_sb)
+               rpc_put_sb_net(net);
+       return err;
+
+err_auth:
+       __rpc_clnt_remove_pipedir(clnt);
+       goto out;
+}
+
 static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt *xprt)
 {
        const struct rpc_program *program = args->program;
        const struct rpc_version *version;
        struct rpc_clnt         *clnt = NULL;
-       struct rpc_auth         *auth;
        int err;
 
        /* sanity check the name before trying to print it */
@@ -354,25 +382,14 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
 
        atomic_set(&clnt->cl_count, 1);
 
-       err = rpc_setup_pipedir(clnt, program->pipe_dir_name);
-       if (err < 0)
-               goto out_no_path;
-
-       auth = rpcauth_create(args->authflavor, clnt);
-       if (IS_ERR(auth)) {
-               dprintk("RPC:       Couldn't create auth handle (flavor %u)\n",
-                               args->authflavor);
-               err = PTR_ERR(auth);
-               goto out_no_auth;
-       }
-
        /* save the nodename */
        rpc_clnt_set_nodename(clnt, utsname()->nodename);
-       rpc_register_client(clnt);
+
+       err = rpc_client_register(args, clnt);
+       if (err)
+               goto out_no_path;
        return clnt;
 
-out_no_auth:
-       rpc_clnt_remove_pipedir(clnt);
 out_no_path:
        kfree(clnt->cl_principal);
 out_no_principal:
@@ -637,8 +654,8 @@ rpc_free_client(struct rpc_clnt *clnt)
                        rcu_dereference(clnt->cl_xprt)->servername);
        if (clnt->cl_parent != clnt)
                rpc_release_client(clnt->cl_parent);
-       rpc_unregister_client(clnt);
        rpc_clnt_remove_pipedir(clnt);
+       rpc_unregister_client(clnt);
        rpc_free_iostats(clnt->cl_metrics);
        kfree(clnt->cl_principal);
        clnt->cl_metrics = NULL;
index e7ce4b3eb0bdde4f209ba2cdf6cd1e3dbcca03f9..4679df5a6d50f00efaac83cefbad73c43f0890e8 100644 (file)
@@ -667,7 +667,8 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
                        return ERR_PTR(-ENOMEM);
        }
        if (dentry->d_inode == NULL) {
-               d_set_d_op(dentry, &rpc_dentry_operations);
+               if (!dentry->d_op)
+                       d_set_d_op(dentry, &rpc_dentry_operations);
                return dentry;
        }
        dput(dentry);
@@ -1126,6 +1127,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
                return -ENOMEM;
        dprintk("RPC:       sending pipefs MOUNT notification for net %p%s\n",
                net, NET_NAME(net));
+       mutex_lock(&sn->pipefs_sb_lock);
        sn->pipefs_sb = sb;
        err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
                                           RPC_PIPEFS_MOUNT,
@@ -1133,6 +1135,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
        if (err)
                goto err_depopulate;
        sb->s_fs_info = get_net(net);
+       mutex_unlock(&sn->pipefs_sb_lock);
        return 0;
 
 err_depopulate:
@@ -1141,6 +1144,7 @@ err_depopulate:
                                           sb);
        sn->pipefs_sb = NULL;
        __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF);
+       mutex_unlock(&sn->pipefs_sb_lock);
        return err;
 }
 
@@ -1162,12 +1166,12 @@ static void rpc_kill_sb(struct super_block *sb)
                goto out;
        }
        sn->pipefs_sb = NULL;
-       mutex_unlock(&sn->pipefs_sb_lock);
        dprintk("RPC:       sending pipefs UMOUNT notification for net %p%s\n",
                net, NET_NAME(net));
        blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
                                           RPC_PIPEFS_UMOUNT,
                                           sb);
+       mutex_unlock(&sn->pipefs_sb_lock);
        put_net(net);
 out:
        kill_litter_super(sb);
index 77d251e0259315eeef4acd4de50def4382bba5e1..93a7a4e94d80abcd423215e2102d510af7331fe9 100644 (file)
@@ -445,20 +445,6 @@ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct r
        }
 }
 
-/*
- * Tests whether rpc queue is empty
- */
-int rpc_queue_empty(struct rpc_wait_queue *queue)
-{
-       int res;
-
-       spin_lock_bh(&queue->lock);
-       res = queue->qlen;
-       spin_unlock_bh(&queue->lock);
-       return res == 0;
-}
-EXPORT_SYMBOL_GPL(rpc_queue_empty);
-
 /*
  * Wake up a task on a specific queue
  */
@@ -804,7 +790,6 @@ static void __rpc_execute(struct rpc_task *task)
                        task->tk_flags |= RPC_TASK_KILLED;
                        rpc_exit(task, -ERESTARTSYS);
                }
-               rpc_set_running(task);
                dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
        }
 
@@ -825,9 +810,11 @@ static void __rpc_execute(struct rpc_task *task)
  */
 void rpc_execute(struct rpc_task *task)
 {
+       bool is_async = RPC_IS_ASYNC(task);
+
        rpc_set_active(task);
        rpc_make_runnable(task);
-       if (!RPC_IS_ASYNC(task))
+       if (!is_async)
                __rpc_execute(task);
 }
 
index 1728d4e375db509c4e192e0e69fb1a0ee020bf49..d32e16e3c6ae661163359f70f1fcf6c66c49dae3 100644 (file)
@@ -91,7 +91,10 @@ static int cap_sb_pivotroot(struct path *old_path, struct path *new_path)
 }
 
 static int cap_sb_set_mnt_opts(struct super_block *sb,
-                              struct security_mnt_opts *opts)
+                              struct security_mnt_opts *opts,
+                              unsigned long kern_flags,
+                              unsigned long *set_kern_flags)
+
 {
        if (unlikely(opts->num_mnt_opts))
                return -EOPNOTSUPP;
@@ -109,6 +112,13 @@ static int cap_sb_parse_opts_str(char *options, struct security_mnt_opts *opts)
        return 0;
 }
 
+static int cap_dentry_init_security(struct dentry *dentry, int mode,
+                                       struct qstr *name, void **ctx,
+                                       u32 *ctxlen)
+{
+       return 0;
+}
+
 static int cap_inode_alloc_security(struct inode *inode)
 {
        return 0;
@@ -816,6 +826,11 @@ static int cap_setprocattr(struct task_struct *p, char *name, void *value,
        return -EINVAL;
 }
 
+static int cap_ismaclabel(const char *name)
+{
+       return 0;
+}
+
 static int cap_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
        return -EOPNOTSUPP;
@@ -931,6 +946,7 @@ void __init security_fixup_ops(struct security_operations *ops)
        set_to_cap_if_null(ops, sb_set_mnt_opts);
        set_to_cap_if_null(ops, sb_clone_mnt_opts);
        set_to_cap_if_null(ops, sb_parse_opts_str);
+       set_to_cap_if_null(ops, dentry_init_security);
        set_to_cap_if_null(ops, inode_alloc_security);
        set_to_cap_if_null(ops, inode_free_security);
        set_to_cap_if_null(ops, inode_init_security);
@@ -1034,6 +1050,7 @@ void __init security_fixup_ops(struct security_operations *ops)
        set_to_cap_if_null(ops, d_instantiate);
        set_to_cap_if_null(ops, getprocattr);
        set_to_cap_if_null(ops, setprocattr);
+       set_to_cap_if_null(ops, ismaclabel);
        set_to_cap_if_null(ops, secid_to_secctx);
        set_to_cap_if_null(ops, secctx_to_secid);
        set_to_cap_if_null(ops, release_secctx);
index a3dce87d1aeffccb92181631ef2687baec41d36c..94b35aef6871a9978cf21799cfec3385502bc555 100644 (file)
@@ -12,6 +12,7 @@
  */
 
 #include <linux/capability.h>
+#include <linux/dcache.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -293,9 +294,12 @@ int security_sb_pivotroot(struct path *old_path, struct path *new_path)
 }
 
 int security_sb_set_mnt_opts(struct super_block *sb,
-                               struct security_mnt_opts *opts)
+                               struct security_mnt_opts *opts,
+                               unsigned long kern_flags,
+                               unsigned long *set_kern_flags)
 {
-       return security_ops->sb_set_mnt_opts(sb, opts);
+       return security_ops->sb_set_mnt_opts(sb, opts, kern_flags,
+                                               set_kern_flags);
 }
 EXPORT_SYMBOL(security_sb_set_mnt_opts);
 
@@ -324,6 +328,15 @@ void security_inode_free(struct inode *inode)
        security_ops->inode_free_security(inode);
 }
 
+int security_dentry_init_security(struct dentry *dentry, int mode,
+                                       struct qstr *name, void **ctx,
+                                       u32 *ctxlen)
+{
+       return security_ops->dentry_init_security(dentry, mode, name,
+                                                       ctx, ctxlen);
+}
+EXPORT_SYMBOL(security_dentry_init_security);
+
 int security_inode_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr,
                                 const initxattrs initxattrs, void *fs_data)
@@ -647,6 +660,7 @@ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer
                return 0;
        return security_ops->inode_listsecurity(inode, buffer, buffer_size);
 }
+EXPORT_SYMBOL(security_inode_listsecurity);
 
 void security_inode_getsecid(const struct inode *inode, u32 *secid)
 {
@@ -1047,6 +1061,12 @@ int security_netlink_send(struct sock *sk, struct sk_buff *skb)
        return security_ops->netlink_send(sk, skb);
 }
 
+int security_ismaclabel(const char *name)
+{
+       return security_ops->ismaclabel(name);
+}
+EXPORT_SYMBOL(security_ismaclabel);
+
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
        return security_ops->secid_to_secctx(secid, secdata, seclen);
index db1fca990a2468a4ac292f50daba582df4dc113e..c956390a9136b75a7fb8ed17ded49c69310b31cf 100644 (file)
@@ -81,6 +81,7 @@
 #include <linux/syslog.h>
 #include <linux/user_namespace.h>
 #include <linux/export.h>
+#include <linux/security.h>
 #include <linux/msg.h>
 #include <linux/shm.h>
 
@@ -284,13 +285,14 @@ static void superblock_free_security(struct super_block *sb)
 
 /* The file system's label must be initialized prior to use. */
 
-static const char *labeling_behaviors[6] = {
+static const char *labeling_behaviors[7] = {
        "uses xattr",
        "uses transition SIDs",
        "uses task SIDs",
        "uses genfs_contexts",
        "not configured for labeling",
        "uses mountpoint labeling",
+       "uses native labeling",
 };
 
 static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry);
@@ -552,7 +554,9 @@ static int bad_option(struct superblock_security_struct *sbsec, char flag,
  * labeling information.
  */
 static int selinux_set_mnt_opts(struct super_block *sb,
-                               struct security_mnt_opts *opts)
+                               struct security_mnt_opts *opts,
+                               unsigned long kern_flags,
+                               unsigned long *set_kern_flags)
 {
        const struct cred *cred = current_cred();
        int rc = 0, i;
@@ -580,6 +584,12 @@ static int selinux_set_mnt_opts(struct super_block *sb,
                        "before the security server is initialized\n");
                goto out;
        }
+       if (kern_flags && !set_kern_flags) {
+               /* Specifying internal flags without providing a place to
+                * place the results is not allowed */
+               rc = -EINVAL;
+               goto out;
+       }
 
        /*
         * Binary mount data FS will come through this function twice.  Once
@@ -670,14 +680,21 @@ static int selinux_set_mnt_opts(struct super_block *sb,
        if (strcmp(sb->s_type->name, "proc") == 0)
                sbsec->flags |= SE_SBPROC;
 
-       /* Determine the labeling behavior to use for this filesystem type. */
-       rc = security_fs_use((sbsec->flags & SE_SBPROC) ? "proc" : sb->s_type->name, &sbsec->behavior, &sbsec->sid);
-       if (rc) {
-               printk(KERN_WARNING "%s: security_fs_use(%s) returned %d\n",
-                      __func__, sb->s_type->name, rc);
-               goto out;
+       if (!sbsec->behavior) {
+               /*
+                * Determine the labeling behavior to use for this
+                * filesystem type.
+                */
+               rc = security_fs_use((sbsec->flags & SE_SBPROC) ?
+                                       "proc" : sb->s_type->name,
+                                       &sbsec->behavior, &sbsec->sid);
+               if (rc) {
+                       printk(KERN_WARNING
+                               "%s: security_fs_use(%s) returned %d\n",
+                                       __func__, sb->s_type->name, rc);
+                       goto out;
+               }
        }
-
        /* sets the context of the superblock for the fs being mounted. */
        if (fscontext_sid) {
                rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred);
@@ -692,6 +709,11 @@ static int selinux_set_mnt_opts(struct super_block *sb,
         * sets the label used on all file below the mountpoint, and will set
         * the superblock context if not already set.
         */
+       if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) {
+               sbsec->behavior = SECURITY_FS_USE_NATIVE;
+               *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
+       }
+
        if (context_sid) {
                if (!fscontext_sid) {
                        rc = may_context_mount_sb_relabel(context_sid, sbsec,
@@ -723,7 +745,8 @@ static int selinux_set_mnt_opts(struct super_block *sb,
        }
 
        if (defcontext_sid) {
-               if (sbsec->behavior != SECURITY_FS_USE_XATTR) {
+               if (sbsec->behavior != SECURITY_FS_USE_XATTR &&
+                       sbsec->behavior != SECURITY_FS_USE_NATIVE) {
                        rc = -EINVAL;
                        printk(KERN_WARNING "SELinux: defcontext option is "
                               "invalid for this filesystem type\n");
@@ -980,7 +1003,7 @@ static int superblock_doinit(struct super_block *sb, void *data)
                goto out_err;
 
 out:
-       rc = selinux_set_mnt_opts(sb, &opts);
+       rc = selinux_set_mnt_opts(sb, &opts, 0, NULL);
 
 out_err:
        security_free_mnt_opts(&opts);
@@ -1222,6 +1245,8 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
        }
 
        switch (sbsec->behavior) {
+       case SECURITY_FS_USE_NATIVE:
+               break;
        case SECURITY_FS_USE_XATTR:
                if (!inode->i_op->getxattr) {
                        isec->sid = sbsec->def_sid;
@@ -2527,6 +2552,40 @@ static void selinux_inode_free_security(struct inode *inode)
        inode_free_security(inode);
 }
 
+static int selinux_dentry_init_security(struct dentry *dentry, int mode,
+                                       struct qstr *name, void **ctx,
+                                       u32 *ctxlen)
+{
+       const struct cred *cred = current_cred();
+       struct task_security_struct *tsec;
+       struct inode_security_struct *dsec;
+       struct superblock_security_struct *sbsec;
+       struct inode *dir = dentry->d_parent->d_inode;
+       u32 newsid;
+       int rc;
+
+       tsec = cred->security;
+       dsec = dir->i_security;
+       sbsec = dir->i_sb->s_security;
+
+       if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) {
+               newsid = tsec->create_sid;
+       } else {
+               rc = security_transition_sid(tsec->sid, dsec->sid,
+                                            inode_mode_to_security_class(mode),
+                                            name,
+                                            &newsid);
+               if (rc) {
+                       printk(KERN_WARNING
+                               "%s: security_transition_sid failed, rc=%d\n",
+                              __func__, -rc);
+                       return rc;
+               }
+       }
+
+       return security_sid_to_context(newsid, (char **)ctx, ctxlen);
+}
+
 static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
                                       const struct qstr *qstr, char **name,
                                       void **value, size_t *len)
@@ -2861,7 +2920,10 @@ static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name,
                return;
        }
 
+       isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = newsid;
+       isec->initialized = 1;
+
        return;
 }
 
@@ -2949,6 +3011,7 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name,
        if (rc)
                return rc;
 
+       isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = newsid;
        isec->initialized = 1;
        return 0;
@@ -5432,6 +5495,11 @@ abort_change:
        return error;
 }
 
+static int selinux_ismaclabel(const char *name)
+{
+       return (strcmp(name, XATTR_SELINUX_SUFFIX) == 0);
+}
+
 static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
        return security_sid_to_context(secid, secdata, seclen);
@@ -5574,6 +5642,7 @@ static struct security_operations selinux_ops = {
        .sb_clone_mnt_opts =            selinux_sb_clone_mnt_opts,
        .sb_parse_opts_str =            selinux_parse_opts_str,
 
+       .dentry_init_security =         selinux_dentry_init_security,
 
        .inode_alloc_security =         selinux_inode_alloc_security,
        .inode_free_security =          selinux_inode_free_security,
@@ -5669,6 +5738,7 @@ static struct security_operations selinux_ops = {
        .getprocattr =                  selinux_getprocattr,
        .setprocattr =                  selinux_setprocattr,
 
+       .ismaclabel =                   selinux_ismaclabel,
        .secid_to_secctx =              selinux_secid_to_secctx,
        .secctx_to_secid =              selinux_secctx_to_secid,
        .release_secctx =               selinux_release_secctx,
index 6d3885165d143a27fb218e785aa4a13ac7937045..8fd8e18ea34019c863d91ba88268b8c4018f3410 100644 (file)
@@ -169,6 +169,8 @@ int security_get_allow_unknown(void);
 #define SECURITY_FS_USE_GENFS          4 /* use the genfs support */
 #define SECURITY_FS_USE_NONE           5 /* no labeling support */
 #define SECURITY_FS_USE_MNTPOINT       6 /* use mountpoint labeling */
+#define SECURITY_FS_USE_NATIVE         7 /* use native label support */
+#define SECURITY_FS_USE_MAX            7 /* Highest SECURITY_FS_USE_XXX */
 
 int security_fs_use(const char *fstype, unsigned int *behavior,
        u32 *sid);
index 9cd9b7c661ec16bd821f45a9e1a40ad69320f20c..c8adde3aff8fdbe93fb2f867e55f71b9879685a5 100644 (file)
@@ -2168,7 +2168,10 @@ static int ocontext_read(struct policydb *p, struct policydb_compat_info *info,
 
                                rc = -EINVAL;
                                c->v.behavior = le32_to_cpu(buf[0]);
-                               if (c->v.behavior > SECURITY_FS_USE_NONE)
+                               /* Determined at runtime, not in policy DB. */
+                               if (c->v.behavior == SECURITY_FS_USE_MNTPOINT)
+                                       goto out;
+                               if (c->v.behavior > SECURITY_FS_USE_MAX)
                                        goto out;
 
                                rc = -ENOMEM;
index 6a083303501dd8cf702c26ea0af8df72e7b89650..3f7682a387b730b9c75fc6e547de98d512b46dbb 100644 (file)
@@ -3639,6 +3639,16 @@ static void smack_audit_rule_free(void *vrule)
 
 #endif /* CONFIG_AUDIT */
 
+/**
+ * smack_ismaclabel - check if xattr @name references a smack MAC label
+ * @name: Full xattr name to check.
+ */
+static int smack_ismaclabel(const char *name)
+{
+       return (strcmp(name, XATTR_SMACK_SUFFIX) == 0);
+}
+
+
 /**
  * smack_secid_to_secctx - return the smack label for a secid
  * @secid: incoming integer
@@ -3836,6 +3846,7 @@ struct security_operations smack_ops = {
        .audit_rule_free =              smack_audit_rule_free,
 #endif /* CONFIG_AUDIT */
 
+       .ismaclabel =                   smack_ismaclabel,
        .secid_to_secctx =              smack_secid_to_secctx,
        .secctx_to_secid =              smack_secctx_to_secid,
        .release_secctx =               smack_release_secctx,