]> git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - fs/nfs/write.c
Merge branch 'bugfixes' into linux-next
[karo-tx-linux.git] / fs / nfs / write.c
index ffb9459f180bc73f3d7d499c14f5ba4c0b219968..d357728ed8baba3c42ed7229e33bc4dee6188ede 100644 (file)
  * Local function declarations
  */
 static void nfs_redirty_request(struct nfs_page *req);
-static const struct rpc_call_ops nfs_write_common_ops;
 static const struct rpc_call_ops nfs_commit_ops;
 static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
+static const struct nfs_rw_ops nfs_rw_write_ops;
+static void nfs_clear_request_commit(struct nfs_page *req);
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -70,77 +71,20 @@ void nfs_commit_free(struct nfs_commit_data *p)
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
 
-struct nfs_write_header *nfs_writehdr_alloc(void)
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
-       struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
-
-       if (p) {
-               struct nfs_pgio_header *hdr = &p->header;
+       struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
 
+       if (p)
                memset(p, 0, sizeof(*p));
-               INIT_LIST_HEAD(&hdr->pages);
-               INIT_LIST_HEAD(&hdr->rpc_list);
-               spin_lock_init(&hdr->lock);
-               atomic_set(&hdr->refcnt, 0);
-               hdr->verf = &p->verf;
-       }
        return p;
 }
-EXPORT_SYMBOL_GPL(nfs_writehdr_alloc);
 
-static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
-                                                 unsigned int pagecount)
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-       struct nfs_write_data *data, *prealloc;
-
-       prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data;
-       if (prealloc->header == NULL)
-               data = prealloc;
-       else
-               data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               goto out;
-
-       if (nfs_pgarray_set(&data->pages, pagecount)) {
-               data->header = hdr;
-               atomic_inc(&hdr->refcnt);
-       } else {
-               if (data != prealloc)
-                       kfree(data);
-               data = NULL;
-       }
-out:
-       return data;
+       mempool_free(hdr, nfs_wdata_mempool);
 }
 
-void nfs_writehdr_free(struct nfs_pgio_header *hdr)
-{
-       struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
-       mempool_free(whdr, nfs_wdata_mempool);
-}
-EXPORT_SYMBOL_GPL(nfs_writehdr_free);
-
-void nfs_writedata_release(struct nfs_write_data *wdata)
-{
-       struct nfs_pgio_header *hdr = wdata->header;
-       struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
-
-       put_nfs_open_context(wdata->args.context);
-       if (wdata->pages.pagevec != wdata->pages.page_array)
-               kfree(wdata->pages.pagevec);
-       if (wdata == &write_header->rpc_data) {
-               wdata->header = NULL;
-               wdata = NULL;
-       }
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
-       /* Note: we only free the rpc_task after callbacks are done.
-        * See the comment in rpc_free_task() for why
-        */
-       kfree(wdata);
-}
-EXPORT_SYMBOL_GPL(nfs_writedata_release);
-
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 {
        ctx->error = error;
@@ -148,8 +92,15 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
        set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
 
+/*
+ * nfs_page_find_head_request_locked - find head request associated with @page
+ *
+ * must be called while holding the inode lock.
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
 static struct nfs_page *
-nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
+nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
 {
        struct nfs_page *req = NULL;
 
@@ -161,25 +112,33 @@ nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
                /* Linearly search the commit list for the correct req */
                list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
                        if (freq->wb_page == page) {
-                               req = freq;
+                               req = freq->wb_head;
                                break;
                        }
                }
        }
 
-       if (req)
+       if (req) {
+               WARN_ON_ONCE(req->wb_head != req);
+
                kref_get(&req->wb_kref);
+       }
 
        return req;
 }
 
-static struct nfs_page *nfs_page_find_request(struct page *page)
+/*
+ * nfs_page_find_head_request - find head request associated with @page
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
+static struct nfs_page *nfs_page_find_head_request(struct page *page)
 {
        struct inode *inode = page_file_mapping(page)->host;
        struct nfs_page *req = NULL;
 
        spin_lock(&inode->i_lock);
-       req = nfs_page_find_request_locked(NFS_I(inode), page);
+       req = nfs_page_find_head_request_locked(NFS_I(inode), page);
        spin_unlock(&inode->i_lock);
        return req;
 }
@@ -211,18 +170,78 @@ static void nfs_set_pageerror(struct page *page)
        nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
 }
 
+/*
+ * nfs_page_group_search_locked
+ * @head - head request of page group
+ * @page_offset - offset into page
+ *
+ * Search page group with head @head to find a request that contains the
+ * page offset @page_offset.
+ *
+ * Returns a pointer to the first matching nfs request, or NULL if no
+ * match is found.
+ *
+ * Must be called with the page group lock held
+ */
+static struct nfs_page *
+nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
+{
+       struct nfs_page *req;
+
+       WARN_ON_ONCE(head != head->wb_head);
+       WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
+
+       req = head;
+       do {
+               if (page_offset >= req->wb_pgbase &&
+                   page_offset < (req->wb_pgbase + req->wb_bytes))
+                       return req;
+
+               req = req->wb_this_page;
+       } while (req != head);
+
+       return NULL;
+}
+
+/*
+ * nfs_page_group_covers_page
+ * @head - head request of page group
+ *
+ * Return true if the page group with head @head covers the whole page,
+ * returns false otherwise
+ */
+static bool nfs_page_group_covers_page(struct nfs_page *req)
+{
+       struct nfs_page *tmp;
+       unsigned int pos = 0;
+       unsigned int len = nfs_page_length(req->wb_page);
+
+       nfs_page_group_lock(req);
+
+       do {
+               tmp = nfs_page_group_search_locked(req->wb_head, pos);
+               if (tmp) {
+                       /* no way this should happen */
+                       WARN_ON_ONCE(tmp->wb_pgbase != pos);
+                       pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
+               }
+       } while (tmp && pos < len);
+
+       nfs_page_group_unlock(req);
+       WARN_ON_ONCE(pos > len);
+       return pos == len;
+}
+
 /* We can set the PG_uptodate flag if we see that a write request
  * covers the full page.
  */
-static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count)
+static void nfs_mark_uptodate(struct nfs_page *req)
 {
-       if (PageUptodate(page))
-               return;
-       if (base != 0)
+       if (PageUptodate(req->wb_page))
                return;
-       if (count != nfs_page_length(page))
+       if (!nfs_page_group_covers_page(req))
                return;
-       SetPageUptodate(page);
+       SetPageUptodate(req->wb_page);
 }
 
 static int wb_priority(struct writeback_control *wbc)
@@ -258,46 +277,259 @@ static void nfs_set_page_writeback(struct page *page)
        }
 }
 
-static void nfs_end_page_writeback(struct page *page)
+static void nfs_end_page_writeback(struct nfs_page *req)
 {
-       struct inode *inode = page_file_mapping(page)->host;
+       struct inode *inode = page_file_mapping(req->wb_page)->host;
        struct nfs_server *nfss = NFS_SERVER(inode);
 
-       end_page_writeback(page);
+       if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
+               return;
+
+       end_page_writeback(req->wb_page);
        if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
                clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
 
-static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
+
+/* nfs_page_group_clear_bits
+ *   @req - an nfs request
+ * clears all page group related bits from @req
+ */
+static void
+nfs_page_group_clear_bits(struct nfs_page *req)
+{
+       clear_bit(PG_TEARDOWN, &req->wb_flags);
+       clear_bit(PG_UNLOCKPAGE, &req->wb_flags);
+       clear_bit(PG_UPTODATE, &req->wb_flags);
+       clear_bit(PG_WB_END, &req->wb_flags);
+       clear_bit(PG_REMOVE, &req->wb_flags);
+}
+
+
+/*
+ * nfs_unroll_locks_and_wait -  unlock all newly locked reqs and wait on @req
+ *
+ * this is a helper function for nfs_lock_and_join_requests
+ *
+ * @inode - inode associated with request page group, must be holding inode lock
+ * @head  - head request of page group, must be holding head lock
+ * @req   - request that couldn't lock and needs to wait on the req bit lock
+ * @nonblock - if true, don't actually wait
+ *
+ * NOTE: this must be called holding page_group bit lock and inode spin lock
+ *       and BOTH will be released before returning.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+static int
+nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
+                         struct nfs_page *req, bool nonblock)
+       __releases(&inode->i_lock)
+{
+       struct nfs_page *tmp;
+       int ret;
+
+       /* relinquish all the locks successfully grabbed this run */
+       for (tmp = head ; tmp != req; tmp = tmp->wb_this_page)
+               nfs_unlock_request(tmp);
+
+       WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+
+       /* grab a ref on the request that will be waited on */
+       kref_get(&req->wb_kref);
+
+       nfs_page_group_unlock(head);
+       spin_unlock(&inode->i_lock);
+
+       /* release ref from nfs_page_find_head_request_locked */
+       nfs_release_request(head);
+
+       if (!nonblock)
+               ret = nfs_wait_on_request(req);
+       else
+               ret = -EAGAIN;
+       nfs_release_request(req);
+
+       return ret;
+}
+
+/*
+ * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
+ *
+ * @destroy_list - request list (using wb_this_page) terminated by @old_head
+ * @old_head - the old head of the list
+ *
+ * All subrequests must be locked and removed from all lists, so at this point
+ * they are only "active" in this function, and possibly in nfs_wait_on_request
+ * with a reference held by some other context.
+ */
+static void
+nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
+                                struct nfs_page *old_head)
+{
+       while (destroy_list) {
+               struct nfs_page *subreq = destroy_list;
+
+               destroy_list = (subreq->wb_this_page == old_head) ?
+                                  NULL : subreq->wb_this_page;
+
+               WARN_ON_ONCE(old_head != subreq->wb_head);
+
+               /* make sure old group is not used */
+               subreq->wb_head = subreq;
+               subreq->wb_this_page = subreq;
+
+               nfs_clear_request_commit(subreq);
+
+               /* subreq is now totally disconnected from page group or any
+                * write / commit lists. last chance to wake any waiters */
+               nfs_unlock_request(subreq);
+
+               if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+                       /* release ref on old head request */
+                       nfs_release_request(old_head);
+
+                       nfs_page_group_clear_bits(subreq);
+
+                       /* release the PG_INODE_REF reference */
+                       if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags))
+                               nfs_release_request(subreq);
+                       else
+                               WARN_ON_ONCE(1);
+               } else {
+                       WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags));
+                       /* zombie requests have already released the last
+                        * reference and were waiting on the rest of the
+                        * group to complete. Since it's no longer part of a
+                        * group, simply free the request */
+                       nfs_page_group_clear_bits(subreq);
+                       nfs_free_request(subreq);
+               }
+       }
+}
+
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req and return
+ *                              a locked reference, cancelling any pending
+ *                              operations for this page.
+ *
+ * @page - the page used to lookup the "page group" of nfs_page structures
+ * @nonblock - if true, don't block waiting for request locks
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group.  All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @page, or a ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *
+nfs_lock_and_join_requests(struct page *page, bool nonblock)
 {
        struct inode *inode = page_file_mapping(page)->host;
-       struct nfs_page *req;
+       struct nfs_page *head, *subreq;
+       struct nfs_page *destroy_list = NULL;
+       unsigned int total_bytes;
        int ret;
 
+try_again:
+       total_bytes = 0;
+
+       WARN_ON_ONCE(destroy_list);
+
        spin_lock(&inode->i_lock);
-       for (;;) {
-               req = nfs_page_find_request_locked(NFS_I(inode), page);
-               if (req == NULL)
-                       break;
-               if (nfs_lock_request(req))
-                       break;
-               /* Note: If we hold the page lock, as is the case in nfs_writepage,
-                *       then the call to nfs_lock_request() will always
-                *       succeed provided that someone hasn't already marked the
-                *       request as dirty (in which case we don't care).
-                */
+
+       /*
+        * A reference is taken only on the head request which acts as a
+        * reference to the whole page group - the group will not be destroyed
+        * until the head reference is released.
+        */
+       head = nfs_page_find_head_request_locked(NFS_I(inode), page);
+
+       if (!head) {
                spin_unlock(&inode->i_lock);
-               if (!nonblock)
-                       ret = nfs_wait_on_request(req);
-               else
-                       ret = -EAGAIN;
-               nfs_release_request(req);
-               if (ret != 0)
+               return NULL;
+       }
+
+       /* lock each request in the page group */
+       nfs_page_group_lock(head);
+       subreq = head;
+       do {
+               /*
+                * Subrequests are always contiguous, non overlapping
+                * and in order. If not, it's a programming error.
+                */
+               WARN_ON_ONCE(subreq->wb_offset !=
+                    (head->wb_offset + total_bytes));
+
+               /* keep track of how many bytes this group covers */
+               total_bytes += subreq->wb_bytes;
+
+               if (!nfs_lock_request(subreq)) {
+                       /* releases page group bit lock and
+                        * inode spin lock and all references */
+                       ret = nfs_unroll_locks_and_wait(inode, head,
+                               subreq, nonblock);
+
+                       if (ret == 0)
+                               goto try_again;
+
                        return ERR_PTR(ret);
-               spin_lock(&inode->i_lock);
+               }
+
+               subreq = subreq->wb_this_page;
+       } while (subreq != head);
+
+       /* Now that all requests are locked, make sure they aren't on any list.
+        * Commit list removal accounting is done after locks are dropped */
+       subreq = head;
+       do {
+               nfs_list_remove_request(subreq);
+               subreq = subreq->wb_this_page;
+       } while (subreq != head);
+
+       /* unlink subrequests from head, destroy them later */
+       if (head->wb_this_page != head) {
+               /* destroy list will be terminated by head */
+               destroy_list = head->wb_this_page;
+               head->wb_this_page = head;
+
+               /* change head request to cover whole range that
+                * the former page group covered */
+               head->wb_bytes = total_bytes;
        }
+
+       /*
+        * prepare head request to be added to new pgio descriptor
+        */
+       nfs_page_group_clear_bits(head);
+
+       /*
+        * some part of the group was still on the inode list - otherwise
+        * the group wouldn't be involved in async write.
+        * grab a reference for the head request, iff it needs one.
+        */
+       if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags))
+               kref_get(&head->wb_kref);
+
+       nfs_page_group_unlock(head);
+
+       /* drop lock to clear_request_commit the head req and clean up
+        * requests on destroy list */
        spin_unlock(&inode->i_lock);
-       return req;
+
+       nfs_destroy_unlinked_subrequests(destroy_list, head);
+
+       /* clean up commit list state */
+       nfs_clear_request_commit(head);
+
+       /* still holds ref on head from nfs_page_find_head_request_locked
+        * and still has lock on head from lock loop */
+       return head;
 }
 
 /*
@@ -310,7 +542,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
        struct nfs_page *req;
        int ret = 0;
 
-       req = nfs_find_and_lock_request(page, nonblock);
+       req = nfs_lock_and_join_requests(page, nonblock);
        if (!req)
                goto out;
        ret = PTR_ERR(req);
@@ -354,10 +586,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
        struct nfs_pageio_descriptor pgio;
        int err;
 
-       NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio,
-                                                         page->mapping->host,
-                                                         wb_priority(wbc),
-                                                         &nfs_async_write_completion_ops);
+       nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+                               false, &nfs_async_write_completion_ops);
        err = nfs_do_writepage(page, wbc, &pgio);
        nfs_pageio_complete(&pgio);
        if (err < 0)
@@ -400,7 +630,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
-       NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops);
+       nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
+                               &nfs_async_write_completion_ops);
        err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
        nfs_pageio_complete(&pgio);
 
@@ -425,6 +656,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
 
+       WARN_ON_ONCE(req->wb_this_page != req);
+
        /* Lock the request! */
        nfs_lock_request(req);
 
@@ -441,6 +674,9 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
                set_page_private(req->wb_page, (unsigned long)req);
        }
        nfsi->npages++;
+       /* this a head request for a page group - mark it as having an
+        * extra reference so sub groups can follow suit */
+       WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));
        kref_get(&req->wb_kref);
        spin_unlock(&inode->i_lock);
 }
@@ -452,16 +688,23 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 {
        struct inode *inode = req->wb_context->dentry->d_inode;
        struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_page *head;
 
-       spin_lock(&inode->i_lock);
-       if (likely(!PageSwapCache(req->wb_page))) {
-               set_page_private(req->wb_page, 0);
-               ClearPagePrivate(req->wb_page);
-               clear_bit(PG_MAPPED, &req->wb_flags);
+       if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+               head = req->wb_head;
+
+               spin_lock(&inode->i_lock);
+               if (likely(!PageSwapCache(head->wb_page))) {
+                       set_page_private(head->wb_page, 0);
+                       ClearPagePrivate(head->wb_page);
+                       clear_bit(PG_MAPPED, &head->wb_flags);
+               }
+               nfsi->npages--;
+               spin_unlock(&inode->i_lock);
        }
-       nfsi->npages--;
-       spin_unlock(&inode->i_lock);
-       nfs_release_request(req);
+
+       if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+               nfs_release_request(req);
 }
 
 static void
@@ -582,12 +825,11 @@ nfs_clear_request_commit(struct nfs_page *req)
        }
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_write_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
-       if (data->verf.committed == NFS_DATA_SYNC)
-               return data->header->lseg == NULL;
-       return data->verf.committed != NFS_FILE_SYNC;
+       if (hdr->verf.committed == NFS_DATA_SYNC)
+               return hdr->lseg == NULL;
+       return hdr->verf.committed != NFS_FILE_SYNC;
 }
 
 #else
@@ -613,8 +855,7 @@ nfs_clear_request_commit(struct nfs_page *req)
 {
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_write_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
        return 0;
 }
@@ -640,12 +881,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
                        nfs_context_set_write_error(req->wb_context, hdr->error);
                        goto remove_req;
                }
-               if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-                       nfs_mark_request_dirty(req);
-                       goto next;
-               }
-               if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
-                       memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf));
+               if (nfs_write_need_commit(hdr)) {
+                       memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
                        nfs_mark_request_commit(req, hdr->lseg, &cinfo);
                        goto next;
                }
@@ -653,7 +890,7 @@ remove_req:
                nfs_inode_remove_request(req);
 next:
                nfs_unlock_request(req);
-               nfs_end_page_writeback(req->wb_page);
+               nfs_end_page_writeback(req);
                nfs_release_request(req);
        }
 out:
@@ -661,7 +898,7 @@ out:
 }
 
 #if  IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
-static unsigned long
+unsigned long
 nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
        return cinfo->mds->ncommit;
@@ -718,7 +955,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 }
 
 #else
-static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
+unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
        return 0;
 }
@@ -754,10 +991,14 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
        spin_lock(&inode->i_lock);
 
        for (;;) {
-               req = nfs_page_find_request_locked(NFS_I(inode), page);
+               req = nfs_page_find_head_request_locked(NFS_I(inode), page);
                if (req == NULL)
                        goto out_unlock;
 
+               /* should be handled by nfs_flush_incompatible */
+               WARN_ON_ONCE(req->wb_head != req);
+               WARN_ON_ONCE(req->wb_this_page != req);
+
                rqend = req->wb_offset + req->wb_bytes;
                /*
                 * Tell the caller to flush out the request if
@@ -819,7 +1060,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
        req = nfs_try_to_update_request(inode, page, offset, bytes);
        if (req != NULL)
                goto out;
-       req = nfs_create_request(ctx, inode, page, offset, bytes);
+       req = nfs_create_request(ctx, page, NULL, offset, bytes);
        if (IS_ERR(req))
                goto out;
        nfs_inode_add_request(inode, req);
@@ -837,7 +1078,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
                return PTR_ERR(req);
        /* Update file length */
        nfs_grow_file(page, offset, count);
-       nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+       nfs_mark_uptodate(req);
        nfs_mark_request_dirty(req);
        nfs_unlock_and_release_request(req);
        return 0;
@@ -858,11 +1099,13 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
         * dropped page.
         */
        do {
-               req = nfs_page_find_request(page);
+               req = nfs_page_find_head_request(page);
                if (req == NULL)
                        return 0;
                l_ctx = req->wb_lock_context;
                do_flush = req->wb_page != page || req->wb_context != ctx;
+               /* for now, flush if more than 1 request in page_group */
+               do_flush |= req->wb_this_page != req;
                if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
                        do_flush |= l_ctx->lockowner.l_owner != current->files
                                || l_ctx->lockowner.l_pid != current->tgid;
@@ -913,12 +1156,14 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
 
        if (nfs_have_delegated_attributes(inode))
                goto out;
-       if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+       if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
                return false;
        smp_rmb();
        if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
                return false;
 out:
+       if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+               return false;
        return PageUptodate(page) != 0;
 }
 
@@ -990,126 +1235,18 @@ static int flush_task_priority(int how)
        return RPC_PRIORITY_NORMAL;
 }
 
-int nfs_initiate_write(struct rpc_clnt *clnt,
-                      struct nfs_write_data *data,
-                      const struct rpc_call_ops *call_ops,
-                      int how, int flags)
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+                              struct rpc_message *msg,
+                              struct rpc_task_setup *task_setup_data, int how)
 {
-       struct inode *inode = data->header->inode;
+       struct inode *inode = hdr->inode;
        int priority = flush_task_priority(how);
-       struct rpc_task *task;
-       struct rpc_message msg = {
-               .rpc_argp = &data->args,
-               .rpc_resp = &data->res,
-               .rpc_cred = data->header->cred,
-       };
-       struct rpc_task_setup task_setup_data = {
-               .rpc_client = clnt,
-               .task = &data->task,
-               .rpc_message = &msg,
-               .callback_ops = call_ops,
-               .callback_data = data,
-               .workqueue = nfsiod_workqueue,
-               .flags = RPC_TASK_ASYNC | flags,
-               .priority = priority,
-       };
-       int ret = 0;
 
-       /* Set up the initial task struct.  */
-       NFS_PROTO(inode)->write_setup(data, &msg);
-
-       dprintk("NFS: %5u initiated write call "
-               "(req %s/%llu, %u bytes @ offset %llu)\n",
-               data->task.tk_pid,
-               inode->i_sb->s_id,
-               (unsigned long long)NFS_FILEID(inode),
-               data->args.count,
-               (unsigned long long)data->args.offset);
+       task_setup_data->priority = priority;
+       NFS_PROTO(inode)->write_setup(hdr, msg);
 
        nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
-                                &task_setup_data.rpc_client, &msg, data);
-
-       task = rpc_run_task(&task_setup_data);
-       if (IS_ERR(task)) {
-               ret = PTR_ERR(task);
-               goto out;
-       }
-       if (how & FLUSH_SYNC) {
-               ret = rpc_wait_for_completion_task(task);
-               if (ret == 0)
-                       ret = task->tk_status;
-       }
-       rpc_put_task(task);
-out:
-       return ret;
-}
-EXPORT_SYMBOL_GPL(nfs_initiate_write);
-
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static void nfs_write_rpcsetup(struct nfs_write_data *data,
-               unsigned int count, unsigned int offset,
-               int how, struct nfs_commit_info *cinfo)
-{
-       struct nfs_page *req = data->header->req;
-
-       /* Set up the RPC argument and reply structs
-        * NB: take care not to mess about with data->commit et al. */
-
-       data->args.fh     = NFS_FH(data->header->inode);
-       data->args.offset = req_offset(req) + offset;
-       /* pnfs_set_layoutcommit needs this */
-       data->mds_offset = data->args.offset;
-       data->args.pgbase = req->wb_pgbase + offset;
-       data->args.pages  = data->pages.pagevec;
-       data->args.count  = count;
-       data->args.context = get_nfs_open_context(req->wb_context);
-       data->args.lock_context = req->wb_lock_context;
-       data->args.stable  = NFS_UNSTABLE;
-       switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
-       case 0:
-               break;
-       case FLUSH_COND_STABLE:
-               if (nfs_reqs_to_commit(cinfo))
-                       break;
-       default:
-               data->args.stable = NFS_FILE_SYNC;
-       }
-
-       data->res.fattr   = &data->fattr;
-       data->res.count   = count;
-       data->res.verf    = &data->verf;
-       nfs_fattr_init(&data->fattr);
-}
-
-static int nfs_do_write(struct nfs_write_data *data,
-               const struct rpc_call_ops *call_ops,
-               int how)
-{
-       struct inode *inode = data->header->inode;
-
-       return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
-}
-
-static int nfs_do_multiple_writes(struct list_head *head,
-               const struct rpc_call_ops *call_ops,
-               int how)
-{
-       struct nfs_write_data *data;
-       int ret = 0;
-
-       while (!list_empty(head)) {
-               int ret2;
-
-               data = list_first_entry(head, struct nfs_write_data, list);
-               list_del_init(&data->list);
-               
-               ret2 = nfs_do_write(data, call_ops, how);
-                if (ret == 0)
-                        ret = ret2;
-       }
-       return ret;
+                                &task_setup_data->rpc_client, msg, hdr);
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1120,7 +1257,7 @@ static void nfs_redirty_request(struct nfs_page *req)
 {
        nfs_mark_request_dirty(req);
        nfs_unlock_request(req);
-       nfs_end_page_writeback(req->wb_page);
+       nfs_end_page_writeback(req);
        nfs_release_request(req);
 }
 
@@ -1140,173 +1277,30 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
        .completion = nfs_write_completion,
 };
 
-static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
-               struct nfs_pgio_header *hdr)
-{
-       set_bit(NFS_IOHDR_REDO, &hdr->flags);
-       while (!list_empty(&hdr->rpc_list)) {
-               struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
-                               struct nfs_write_data, list);
-               list_del(&data->list);
-               nfs_writedata_release(data);
-       }
-       desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-}
-
-/*
- * Generate multiple small requests to write out a single
- * contiguous dirty area on one page.
- */
-static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
-                          struct nfs_pgio_header *hdr)
-{
-       struct nfs_page *req = hdr->req;
-       struct page *page = req->wb_page;
-       struct nfs_write_data *data;
-       size_t wsize = desc->pg_bsize, nbytes;
-       unsigned int offset;
-       int requests = 0;
-       struct nfs_commit_info cinfo;
-
-       nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-
-       if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
-           (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) ||
-            desc->pg_count > wsize))
-               desc->pg_ioflags &= ~FLUSH_COND_STABLE;
-
-
-       offset = 0;
-       nbytes = desc->pg_count;
-       do {
-               size_t len = min(nbytes, wsize);
-
-               data = nfs_writedata_alloc(hdr, 1);
-               if (!data) {
-                       nfs_flush_error(desc, hdr);
-                       return -ENOMEM;
-               }
-               data->pages.pagevec[0] = page;
-               nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo);
-               list_add(&data->list, &hdr->rpc_list);
-               requests++;
-               nbytes -= len;
-               offset += len;
-       } while (nbytes != 0);
-       nfs_list_remove_request(req);
-       nfs_list_add_request(req, &hdr->pages);
-       desc->pg_rpc_callops = &nfs_write_common_ops;
-       return 0;
-}
-
-/*
- * Create an RPC task for the given write request and kick it.
- * The page must have been locked by the caller.
- *
- * It may happen that the page we're passed is not marked dirty.
- * This is the case if nfs_updatepage detects a conflicting request
- * that has been written but not committed.
- */
-static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
-                        struct nfs_pgio_header *hdr)
-{
-       struct nfs_page         *req;
-       struct page             **pages;
-       struct nfs_write_data   *data;
-       struct list_head *head = &desc->pg_list;
-       struct nfs_commit_info cinfo;
-
-       data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
-                                                          desc->pg_count));
-       if (!data) {
-               nfs_flush_error(desc, hdr);
-               return -ENOMEM;
-       }
-
-       nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-       pages = data->pages.pagevec;
-       while (!list_empty(head)) {
-               req = nfs_list_entry(head->next);
-               nfs_list_remove_request(req);
-               nfs_list_add_request(req, &hdr->pages);
-               *pages++ = req->wb_page;
-       }
-
-       if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
-           (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
-               desc->pg_ioflags &= ~FLUSH_COND_STABLE;
-
-       /* Set up the argument struct */
-       nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
-       list_add(&data->list, &hdr->rpc_list);
-       desc->pg_rpc_callops = &nfs_write_common_ops;
-       return 0;
-}
-
-int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
-                     struct nfs_pgio_header *hdr)
-{
-       if (desc->pg_bsize < PAGE_CACHE_SIZE)
-               return nfs_flush_multi(desc, hdr);
-       return nfs_flush_one(desc, hdr);
-}
-EXPORT_SYMBOL_GPL(nfs_generic_flush);
-
-static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
-{
-       struct nfs_write_header *whdr;
-       struct nfs_pgio_header *hdr;
-       int ret;
-
-       whdr = nfs_writehdr_alloc();
-       if (!whdr) {
-               desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-               return -ENOMEM;
-       }
-       hdr = &whdr->header;
-       nfs_pgheader_init(desc, hdr, nfs_writehdr_free);
-       atomic_inc(&hdr->refcnt);
-       ret = nfs_generic_flush(desc, hdr);
-       if (ret == 0)
-               ret = nfs_do_multiple_writes(&hdr->rpc_list,
-                                            desc->pg_rpc_callops,
-                                            desc->pg_ioflags);
-       if (atomic_dec_and_test(&hdr->refcnt))
-               hdr->completion_ops->completion(hdr);
-       return ret;
-}
-
-static const struct nfs_pageio_ops nfs_pageio_write_ops = {
-       .pg_test = nfs_generic_pg_test,
-       .pg_doio = nfs_generic_pg_writepages,
-};
-
 void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
-                              struct inode *inode, int ioflags,
+                              struct inode *inode, int ioflags, bool force_mds,
                               const struct nfs_pgio_completion_ops *compl_ops)
 {
-       nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops,
-                               NFS_SERVER(inode)->wsize, ioflags);
+       struct nfs_server *server = NFS_SERVER(inode);
+       const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+       if (server->pnfs_curr_ld && !force_mds)
+               pg_ops = server->pnfs_curr_ld->pg_write_ops;
+#endif
+       nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
+                       server->wsize, ioflags);
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
 
 void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
 {
-       pgio->pg_ops = &nfs_pageio_write_ops;
+       pgio->pg_ops = &nfs_pgio_rw_ops;
        pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
 
 
-void nfs_write_prepare(struct rpc_task *task, void *calldata)
-{
-       struct nfs_write_data *data = calldata;
-       int err;
-       err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
-       if (err)
-               rpc_exit(task, err);
-}
-
 void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs_commit_data *data = calldata;
@@ -1314,59 +1308,45 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
        NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-/*
- * Handle a write reply that flushes a whole page.
- *
- * FIXME: There is an inherent race with invalidate_inode_pages and
- *       writebacks since the page->count is kept > 1 for as long
- *       as the page has a write request pending.
- */
-static void nfs_writeback_done_common(struct rpc_task *task, void *calldata)
+static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
 {
-       struct nfs_write_data   *data = calldata;
-
-       nfs_writeback_done(task, data);
+       /* do nothing! */
 }
 
-static void nfs_writeback_release_common(void *calldata)
+/*
+ * Special version of should_remove_suid() that ignores capabilities.
+ */
+static int nfs_should_remove_suid(const struct inode *inode)
 {
-       struct nfs_write_data   *data = calldata;
-       struct nfs_pgio_header *hdr = data->header;
-       int status = data->task.tk_status;
+       umode_t mode = inode->i_mode;
+       int kill = 0;
 
-       if ((status >= 0) && nfs_write_need_commit(data)) {
-               spin_lock(&hdr->lock);
-               if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
-                       ; /* Do nothing */
-               else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
-                       memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf));
-               else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf)))
-                       set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
-               spin_unlock(&hdr->lock);
-       }
-       nfs_writedata_release(data);
-}
+       /* suid always must be killed */
+       if (unlikely(mode & S_ISUID))
+               kill = ATTR_KILL_SUID;
 
-static const struct rpc_call_ops nfs_write_common_ops = {
-       .rpc_call_prepare = nfs_write_prepare,
-       .rpc_call_done = nfs_writeback_done_common,
-       .rpc_release = nfs_writeback_release_common,
-};
+       /*
+        * sgid without any exec bits is just a mandatory locking mark; leave
+        * it alone.  If some exec bits are set, it's a real sgid; kill it.
+        */
+       if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+               kill |= ATTR_KILL_SGID;
 
+       if (unlikely(kill && S_ISREG(mode)))
+               return kill;
+
+       return 0;
+}
 
 /*
  * This function is called when the WRITE call is complete.
  */
-void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs_writeback_done(struct rpc_task *task,
+                             struct nfs_pgio_header *hdr,
+                             struct inode *inode)
 {
-       struct nfs_writeargs    *argp = &data->args;
-       struct nfs_writeres     *resp = &data->res;
-       struct inode            *inode = data->header->inode;
        int status;
 
-       dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
-               task->tk_pid, task->tk_status);
-
        /*
         * ->write_done will attempt to use post-op attributes to detect
         * conflicting writes by other clients.  A strict interpretation
@@ -1374,13 +1354,14 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
         * another writer had changed the file, but some applications
         * depend on tighter cache coherency when writing.
         */
-       status = NFS_PROTO(inode)->write_done(task, data);
+       status = NFS_PROTO(inode)->write_done(task, hdr);
        if (status != 0)
-               return;
-       nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
+               return status;
+       nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 
 #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
-       if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
+       if (hdr->res.verf->committed < hdr->args.stable &&
+           task->tk_status >= 0) {
                /* We tried a write call, but the server did not
                 * commit data to stable storage even though we
                 * requested it.
@@ -1396,18 +1377,32 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
                        dprintk("NFS:       faulty NFS server %s:"
                                " (committed = %d) != (stable = %d)\n",
                                NFS_SERVER(inode)->nfs_client->cl_hostname,
-                               resp->verf->committed, argp->stable);
+                               hdr->res.verf->committed, hdr->args.stable);
                        complain = jiffies + 300 * HZ;
                }
        }
 #endif
-       if (task->tk_status < 0)
-               nfs_set_pgio_error(data->header, task->tk_status, argp->offset);
-       else if (resp->count < argp->count) {
+
+       /* Deal with the suid/sgid bit corner case */
+       if (nfs_should_remove_suid(inode))
+               nfs_mark_for_revalidate(inode);
+       return 0;
+}
+
+/*
+ * This function is called when the WRITE call is complete.
+ */
+static void nfs_writeback_result(struct rpc_task *task,
+                                struct nfs_pgio_header *hdr)
+{
+       struct nfs_pgio_args    *argp = &hdr->args;
+       struct nfs_pgio_res     *resp = &hdr->res;
+
+       if (resp->count < argp->count) {
                static unsigned long    complain;
 
                /* This a short write! */
-               nfs_inc_stats(inode, NFSIOS_SHORTWRITE);
+               nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
 
                /* Has the server at least made some progress? */
                if (resp->count == 0) {
@@ -1417,14 +1412,14 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
                                       argp->count);
                                complain = jiffies + 300 * HZ;
                        }
-                       nfs_set_pgio_error(data->header, -EIO, argp->offset);
+                       nfs_set_pgio_error(hdr, -EIO, argp->offset);
                        task->tk_status = -EIO;
                        return;
                }
                /* Was this an NFSv2 write or an NFSv3 stable write? */
                if (resp->verf->committed != NFS_UNSTABLE) {
                        /* Resend from where the server left off */
-                       data->mds_offset += resp->count;
+                       hdr->mds_offset += resp->count;
                        argp->offset += resp->count;
                        argp->pgbase += resp->count;
                        argp->count -= resp->count;
@@ -1788,27 +1783,28 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
        struct nfs_page *req;
        int ret = 0;
 
-       for (;;) {
-               wait_on_page_writeback(page);
-               req = nfs_page_find_request(page);
-               if (req == NULL)
-                       break;
-               if (nfs_lock_request(req)) {
-                       nfs_clear_request_commit(req);
-                       nfs_inode_remove_request(req);
-                       /*
-                        * In case nfs_inode_remove_request has marked the
-                        * page as being dirty
-                        */
-                       cancel_dirty_page(page, PAGE_CACHE_SIZE);
-                       nfs_unlock_and_release_request(req);
-                       break;
-               }
-               ret = nfs_wait_on_request(req);
-               nfs_release_request(req);
-               if (ret < 0)
-                       break;
+       wait_on_page_writeback(page);
+
+       /* blocking call to cancel all requests and join to a single (head)
+        * request */
+       req = nfs_lock_and_join_requests(page, false);
+
+       if (IS_ERR(req)) {
+               ret = PTR_ERR(req);
+       } else if (req) {
+               /* all requests from this page have been cancelled by
+                * nfs_lock_and_join_requests, so just remove the head
+                * request from the inode / page_private pointer and
+                * release it */
+               nfs_inode_remove_request(req);
+               /*
+                * In case nfs_inode_remove_request has marked the
+                * page as being dirty
+                */
+               cancel_dirty_page(page, PAGE_CACHE_SIZE);
+               nfs_unlock_and_release_request(req);
        }
+
        return ret;
 }
 
@@ -1874,7 +1870,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 int __init nfs_init_writepagecache(void)
 {
        nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
-                                            sizeof(struct nfs_write_header),
+                                            sizeof(struct nfs_pgio_header),
                                             0, SLAB_HWCACHE_ALIGN,
                                             NULL);
        if (nfs_wdata_cachep == NULL)
@@ -1936,3 +1932,12 @@ void nfs_destroy_writepagecache(void)
        kmem_cache_destroy(nfs_wdata_cachep);
 }
 
+static const struct nfs_rw_ops nfs_rw_write_ops = {
+       .rw_mode                = FMODE_WRITE,
+       .rw_alloc_header        = nfs_writehdr_alloc,
+       .rw_free_header         = nfs_writehdr_free,
+       .rw_release             = nfs_writeback_release_common,
+       .rw_done                = nfs_writeback_done,
+       .rw_result              = nfs_writeback_result,
+       .rw_initiate            = nfs_initiate_write,
+};