From: Linus Torvalds
Date: Tue, 28 Feb 2017 23:39:09 +0000 (-0800)
Subject: Merge tag 'nfsd-4.11' of git://linux-nfs.org/~bfields/linux
X-Git-Tag: v4.11-rc1~46
X-Git-Url: https://git.kernelconcepts.de/?a=commitdiff_plain;h=8313064c2e75542201e557e2b496668811c2484a;hp=b2deee2dc06db7cdf99b84346e69bdb9db9baa85;p=karo-tx-linux.git

Merge tag 'nfsd-4.11' of git://linux-nfs.org/~bfields/linux

Pull nfsd updates from Bruce Fields:
 "The nfsd update this round is mainly a lot of miscellaneous cleanups
  and bugfixes.

  A couple of changes could theoretically break working setups on
  upgrade. I don't expect complaints in practice, but they seem worth
  calling out just in case:

   - NFS security labels are now off by default; a new security_label
     export flag re-enables them per export. Having them on by default
     is a disaster, as it generally only makes sense if all your clients
     and servers have similar enough selinux policies. Thanks to Jason
     Tibbitts for pointing this out.

   - NFSv4/UDP support is off. It was never really supported, and the
     spec explicitly forbids it. We only ever left it on out of
     laziness; thanks to Jeff Layton for finally fixing that"

* tag 'nfsd-4.11' of git://linux-nfs.org/~bfields/linux: (34 commits)
  nfsd: Fix display of the version string
  nfsd: fix configuration of supported minor versions
  sunrpc: don't register UDP port with rpcbind when version needs congestion control
  nfs/nfsd/sunrpc: enforce transport requirements for NFSv4
  sunrpc: flag transports as having congestion control
  sunrpc: turn bitfield flags in svc_version into bools
  nfsd: remove superfluous KERN_INFO
  nfsd: special case truncates some more
  nfsd: minor nfsd_setattr cleanup
  NFSD: Reserve adequate space for LOCKT operation
  NFSD: Get response size before operation for all RPCs
  nfsd/callback: Drop a useless data copy when comparing sessionid
  nfsd/callback: skip the callback tag
  nfsd/callback: Cleanup callback cred on shutdown
  nfsd/idmap: return nfserr_inval for 0-length names
  SUNRPC/Cache: Always treat the invalid cache as unexpired
  SUNRPC: Drop all entries from cache_detail when cache_purge()
  svcrdma: Poll CQs in "workqueue" mode
  svcrdma: Combine list fields in struct svc_rdma_op_ctxt
  svcrdma: Remove unused sc_dto_q field
  ...
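For anyone who does rely on NFS security labels, the first change above is easy to undo per export from userspace. What follows is only a sketch under stated assumptions, not part of this series: the kernel side is the new NFSEXP_SECURITY_LABEL flag (0x0080) and the "security_label" name added to the export flags table in fs/nfsd/export.c below, while the /etc/exports syntax assumes an exportfs/nfs-utils recent enough to pass that option through, and "/srv/labeled" is a made-up path:

  # /etc/exports -- re-enable NFSv4 security labels on one export only
  # ("security_label" corresponds to NFSEXP_SECURITY_LABEL (0x0080); the
  #  path and the other options are purely illustrative)
  /srv/labeled  *(rw,sync,no_subtree_check,security_label)

  # apply the updated export table
  exportfs -ra

On the second change, a client that forces NFSv4 over UDP (for example with "-o vers=4,proto=udp") should now get a program-mismatch error from the server rather than a silently unsupported transport; the enforcement is the vs_need_cong_ctrl/XPT_CONG_CTRL check added to svc_process_common() in the sunrpc hunks below.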
--- diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1c13dd80744f..7e4ea3b9f472 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -322,6 +322,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; + if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin6); } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index eb094c6011d8..fd0284c1dc32 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1083,7 +1083,8 @@ struct svc_version nfs4_callback_version1 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, - .vs_hidden = 1, + .vs_hidden = true, + .vs_need_cong_ctrl = true, }; struct svc_version nfs4_callback_version4 = { @@ -1092,5 +1093,6 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, - .vs_hidden = 1, + .vs_hidden = true, + .vs_need_cong_ctrl = true, }; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 43e109cc0ccc..e71f11b1a180 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1102,6 +1102,7 @@ static struct flags { { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, + { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index d08cd88155c7..838f90f3f890 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -376,5 +376,4 @@ struct svc_version nfsd_acl_version2 = { .vs_proc = nfsd_acl_procedures2, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 0c890347cde3..dcb5f79076c0 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -266,6 +266,5 @@ struct svc_version nfsd_acl_version3 = { .vs_proc = nfsd_acl_procedures3, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index d818e4ffd79f..045c9081eabe 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -193,11 +193,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nfserr = nfsd_write(rqstp, &resp->fh, NULL, - argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, - &resp->committed); + nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, resp->committed); resp->count = cnt; RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index eb78109d666c..0274db6e65d0 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -303,6 +303,7 @@ static int decode_cb_compound4res(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, length + 4); if (unlikely(p == NULL)) goto out_overflow; + p += XDR_QUADLEN(length); hdr->nops = be32_to_cpup(p); return 0; out_overflow: @@ -396,13 +397,10 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, struct nfsd4_callback *cb) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; - struct nfs4_sessionid id; - int status; + int status = -ESERVERFAULT; __be32 *p; u32 dummy; - status = -ESERVERFAULT; - /* * If the server returns different values for sessionID, slotID or * sequence number, the server is looney 
tunes. @@ -410,9 +408,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); if (unlikely(p == NULL)) goto out_overflow; - memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); - if (memcmp(id.data, session->se_sessionid.data, - NFS4_MAX_SESSIONID_LEN) != 0) { + + if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s Invalid session id\n", __func__); goto out; } @@ -753,6 +750,14 @@ int set_callback_cred(void) return 0; } +void cleanup_callback_cred(void) +{ + if (callback_cred) { + put_rpccred(callback_cred); + callback_cred = NULL; + } +} + static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b20577dcdd2..6b9b6cca469f 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -628,6 +628,10 @@ nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, { __be32 status; u32 id = -1; + + if (name == NULL || namelen == 0) + return nfserr_inval; + status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); *uid = make_kuid(&init_user_ns, id); if (!uid_valid(*uid)) @@ -641,6 +645,10 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, { __be32 status; u32 id = -1; + + if (name == NULL || namelen == 0) + return nfserr_inval; + status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); *gid = make_kgid(&init_user_ns, id); if (!gid_valid(*gid)) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 74a6e573e061..cbeeda1e94a2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -95,11 +95,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u32 *bmval, u32 *writable) { struct dentry *dentry = cstate->current_fh.fh_dentry; + struct svc_export *exp = cstate->current_fh.fh_export; if (!nfsd_attrs_supported(cstate->minorversion, bmval)) return nfserr_attrnotsupp; if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry))) return nfserr_attrnotsupp; + if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) && + !(exp->ex_flags & NFSEXP_SECURITY_LABEL)) + return nfserr_attrnotsupp; if (writable && !bmval_is_subset(bmval, writable)) return nfserr_inval; if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && @@ -983,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, nvecs, &cnt, - &write->wr_how_written); + write->wr_how_written); fput(filp); write->wr_bytes_written = cnt; @@ -1838,6 +1842,12 @@ static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); } +static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + /* ac_supported, ac_resp_access */ + return (op_encode_hdr_size + 2)* sizeof(__be32); +} + static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); @@ -1892,6 +1902,11 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, return ret; } +static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE; +} + static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { 
return (op_encode_hdr_size + op_encode_change_info_maxsz) @@ -1933,6 +1948,11 @@ static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o XDR_QUADLEN(rlen)) * sizeof(__be32); } +static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE; +} + static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) @@ -1952,11 +1972,23 @@ static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); } +static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids) + * sizeof(__be32); +} + static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR * + (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32); +} + static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * @@ -2011,6 +2043,19 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) } #ifdef CONFIG_NFSD_PNFS +static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount); + + return (op_encode_hdr_size + + 1 /* gd_layout_type*/ + + XDR_QUADLEN(rlen) + + 2 /* gd_notify_types */) * sizeof(__be32); +} + /* * At this stage we don't really know what layout driver will handle the request, * so we need to define an arbitrary upper bound here. 
@@ -2040,10 +2085,17 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_ } #endif /* CONFIG_NFSD_PNFS */ + +static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 3) * sizeof(__be32); +} + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, .op_name = "OP_ACCESS", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_access_rsize, }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, @@ -2081,6 +2133,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_GETFH] = { .op_func = (nfsd4op_func)nfsd4_getfh, .op_name = "OP_GETFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_getfh_rsize, }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, @@ -2099,6 +2152,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, .op_name = "OP_LOCKT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, @@ -2111,15 +2165,18 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_lookup, .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, .op_name = "OP_LOOKUP", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, .op_name = "OP_LOOKUPP", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_NVERIFY] = { .op_func = (nfsd4op_func)nfsd4_nverify, .op_name = "OP_NVERIFY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, @@ -2177,6 +2234,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, .op_name = "OP_READLINK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readlink_rsize, }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, @@ -2215,6 +2273,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_secinfo, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize, }, [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, @@ -2240,6 +2299,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, .op_name = "OP_VERIFY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, @@ -2314,11 +2374,13 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize, }, [OP_TEST_STATEID] = { .op_func = (nfsd4op_func)nfsd4_test_stateid, .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_TEST_STATEID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_test_stateid_rsize, }, [OP_FREE_STATEID] = { .op_func = (nfsd4op_func)nfsd4_free_stateid, @@ -2332,6 +2394,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_GETDEVICEINFO", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_getdeviceinfo_rsize, }, [OP_LAYOUTGET] = { .op_func = (nfsd4op_func)nfsd4_layoutget, @@ -2381,6 +2444,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_seek_rsize, }, }; @@ -2425,14 +2489,11 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) int 
nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) { - struct nfsd4_operation *opdesc; - nfsd4op_rsize estimator; - if (op->opnum == OP_ILLEGAL) return op_encode_hdr_size * sizeof(__be32); - opdesc = OPDESC(op); - estimator = opdesc->op_rsize_bop; - return estimator ? estimator(rqstp, op) : PAGE_SIZE; + + BUG_ON(OPDESC(op)->op_rsize_bop == NULL); + return OPDESC(op)->op_rsize_bop(rqstp, op); } void warn_on_nonidempotent_op(struct nfsd4_op *op) @@ -2476,12 +2537,13 @@ static struct svc_procedure nfsd_procedures4[2] = { }; struct svc_version nfsd_version4 = { - .vs_vers = 4, - .vs_nproc = 2, - .vs_proc = nfsd_procedures4, - .vs_dispatch = nfsd_dispatch, - .vs_xdrsize = NFS4_SVC_XDRSIZE, - .vs_rpcb_optnl = 1, + .vs_vers = 4, + .vs_nproc = 2, + .vs_proc = nfsd_procedures4, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = true, + .vs_need_cong_ctrl = true, }; /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a0dee8ae9f97..e9ef50addddb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2281,7 +2281,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r out_err: conn->cb_addr.ss_family = AF_UNSPEC; conn->cb_addrlen = 0; - dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " + dprintk("NFSD: this client (clientid %08x/%08x) " "will not receive delegations\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -7012,23 +7012,24 @@ nfs4_state_start(void) ret = set_callback_cred(); if (ret) - return -ENOMEM; + return ret; + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) { ret = -ENOMEM; - goto out_recovery; + goto out_cleanup_cred; } ret = nfsd4_create_callback_queue(); if (ret) goto out_free_laundry; set_max_delegations(); - return 0; out_free_laundry: destroy_workqueue(laundry_wq); -out_recovery: +out_cleanup_cred: + cleanup_callback_cred(); return ret; } @@ -7086,6 +7087,7 @@ nfs4_state_shutdown(void) { destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); + cleanup_callback_cred(); } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8fae53ce21d1..382c1fd05b4c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -58,7 +58,7 @@ #define NFSDDBG_FACILITY NFSDDBG_XDR -u32 nfsd_suppattrs[3][3] = { +const u32 nfsd_suppattrs[3][3] = { {NFSD4_SUPPORTED_ATTRS_WORD0, NFSD4_SUPPORTED_ATTRS_WORD1, NFSD4_SUPPORTED_ATTRS_WORD2}, @@ -1250,7 +1250,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) READ_BUF(16); p = xdr_decode_hyper(p, &write->wr_offset); write->wr_stable_how = be32_to_cpup(p++); - if (write->wr_stable_how > 2) + if (write->wr_stable_how > NFS_FILE_SYNC) goto xdr_error; write->wr_buflen = be32_to_cpup(p++); @@ -1941,12 +1941,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } else max_reply += nfsd4_max_reply(argp->rqstp, op); /* - * OP_LOCK may return a conflicting lock. (Special case - * because it will just skip encoding this if it runs - * out of xdr buffer space, and it is the only operation - * that behaves this way.) + * OP_LOCK and OP_LOCKT may return a conflicting lock. + * (Special case because it will just skip encoding this + * if it runs out of xdr buffer space, and it is the only + * operation that behaves this way.) 
*/ - if (op->opnum == OP_LOCK) + if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT) max_reply += NFS4_OPAQUE_LIMIT; if (op->status) { @@ -1966,9 +1966,13 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) DECODE_TAIL; } -static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode) +static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, + struct svc_export *exp) { - if (IS_I_VERSION(inode)) { + if (exp->ex_flags & NFSEXP_V4ROOT) { + *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); + *p++ = 0; + } else if (IS_I_VERSION(inode)) { p = xdr_encode_hyper(p, inode->i_version); } else { *p++ = cpu_to_be32(stat->ctime.tv_sec); @@ -2417,8 +2421,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - err = security_inode_getsecctx(d_inode(dentry), + if (exp->ex_flags & NFSEXP_SECURITY_LABEL) + err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); + else + err = -EOPNOTSUPP; contextsupport = (err == 0); if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) @@ -2490,7 +2497,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; - p = encode_change(p, &stat, d_inode(dentry)); + p = encode_change(p, &stat, d_inode(dentry), exp); } if (bmval0 & FATTR4_WORD0_SIZE) { p = xdr_reserve_space(xdr, 8); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f3b2f34b10a3..73e75ac90525 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -536,6 +536,19 @@ out_free: return rv; } +static ssize_t +nfsd_print_version_support(char *buf, int remaining, const char *sep, + unsigned vers, unsigned minor) +{ + const char *format = (minor == 0) ? "%s%c%u" : "%s%c%u.%u"; + bool supported = !!nfsd_vers(vers, NFSD_TEST); + + if (vers == 4 && !nfsd_minorversion(minor, NFSD_TEST)) + supported = false; + return snprintf(buf, remaining, format, sep, + supported ? '+' : '-', vers, minor); +} + static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; @@ -561,6 +574,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) len = qword_get(&mesg, vers, size); if (len <= 0) return -EINVAL; do { + enum vers_op cmd; sign = *vers; if (sign == '+' || sign == '-') num = simple_strtol((vers+1), &minorp, 0); @@ -569,24 +583,22 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) if (*minorp == '.') { if (num != 4) return -EINVAL; - minor = simple_strtoul(minorp+1, NULL, 0); - if (minor == 0) - return -EINVAL; - if (nfsd_minorversion(minor, sign == '-' ? - NFSD_CLEAR : NFSD_SET) < 0) + if (kstrtouint(minorp+1, 0, &minor) < 0) return -EINVAL; - goto next; - } + } else + minor = 0; + cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { case 2: case 3: - case 4: - nfsd_vers(num, sign == '-' ? 
NFSD_CLEAR : NFSD_SET); + nfsd_vers(num, cmd); break; + case 4: + if (nfsd_minorversion(minor, cmd) >= 0) + break; default: return -EINVAL; } - next: vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); /* If all get turned off, turn them back on, as @@ -599,35 +611,23 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) len = 0; sep = ""; remaining = SIMPLE_TRANSACTION_LIMIT; - for (num=2 ; num <= 4 ; num++) - if (nfsd_vers(num, NFSD_AVAIL)) { - len = snprintf(buf, remaining, "%s%c%d", sep, - nfsd_vers(num, NFSD_TEST)?'+':'-', - num); - sep = " "; - - if (len >= remaining) - break; - remaining -= len; - buf += len; - tlen += len; - } - if (nfsd_vers(4, NFSD_AVAIL)) - for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; - minor++) { - len = snprintf(buf, remaining, " %c4.%u", - (nfsd_vers(4, NFSD_TEST) && - nfsd_minorversion(minor, NFSD_TEST)) ? - '+' : '-', - minor); - + for (num=2 ; num <= 4 ; num++) { + if (!nfsd_vers(num, NFSD_AVAIL)) + continue; + minor = 0; + do { + len = nfsd_print_version_support(buf, remaining, + sep, num, minor); if (len >= remaining) - break; + goto out; remaining -= len; buf += len; tlen += len; - } - + minor++; + sep = " "; + } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION); + } +out: len = snprintf(buf, remaining, "\n"); if (len >= remaining) return -EINVAL; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d74c8c44dc35..d96606801d47 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -362,16 +362,16 @@ void nfsd_lockd_shutdown(void); FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS) -extern u32 nfsd_suppattrs[3][3]; +extern const u32 nfsd_suppattrs[3][3]; -static inline bool bmval_is_subset(u32 *bm1, u32 *bm2) +static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2) { return !((bm1[0] & ~bm2[0]) || (bm1[1] & ~bm2[1]) || (bm1[2] & ~bm2[2])); } -static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) +static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) { return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]); } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 010aff5c5a79..fa82b7707e85 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -204,18 +204,14 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, struct nfsd_attrstat *resp) { __be32 nfserr; - int stable = 1; unsigned long cnt = argp->len; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, - argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, - &stable); + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, + rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e6bfd96734c0..efd66da99201 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -153,6 +153,18 @@ int nfsd_vers(int vers, enum vers_op change) return 0; } +static void +nfsd_adjust_nfsd_versions4(void) +{ + unsigned i; + + for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { + if (nfsd_supported_minorversions[i]) + return; + } + nfsd_vers(4, NFSD_CLEAR); +} + int nfsd_minorversion(u32 minorversion, enum vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) @@ -160,9 +172,11 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change) switch(change) { case NFSD_SET: nfsd_supported_minorversions[minorversion] = true; + nfsd_vers(4, NFSD_SET); break; case NFSD_CLEAR: 
nfsd_supported_minorversions[minorversion] = false; + nfsd_adjust_nfsd_versions4(); break; case NFSD_TEST: return nfsd_supported_minorversions[minorversion]; @@ -354,6 +368,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; + if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4516e8b7d776..005c911b34ac 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -615,6 +615,7 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, extern __be32 nfs4_check_open_reclaim(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn); extern int set_callback_cred(void); +extern void cleanup_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 26c6fdb4bf67..19d50f600e8d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -377,7 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, __be32 err; int host_err; bool get_write_count; - int size_change = 0; + bool size_change = (iap->ia_valid & ATTR_SIZE); if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -390,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Get inode */ err = fh_verify(rqstp, fhp, ftype, accmode); if (err) - goto out; + return err; if (get_write_count) { host_err = fh_want_write(fhp); if (host_err) - return nfserrno(host_err); + goto out; } dentry = fhp->fh_dentry; @@ -405,20 +405,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~ATTR_MODE; if (!iap->ia_valid) - goto out; + return 0; nfsd_sanitize_attrs(inode, iap); + if (check_guard && guardtime != inode->i_ctime.tv_sec) + return nfserr_notsync; + /* * The size case is special, it changes the file in addition to the - * attributes. + * attributes, and file systems don't expect it to be mixed with + * "random" attribute changes. We thus split out the size change + * into a separate call to ->setattr, and do the rest as a separate + * setattr call. 
*/ - if (iap->ia_valid & ATTR_SIZE) { + if (size_change) { err = nfsd_get_write_access(rqstp, fhp, iap); if (err) - goto out; - size_change = 1; + return err; + } + fh_lock(fhp); + if (size_change) { /* * RFC5661, Section 18.30.4: * Changing the size of a file with SETATTR indirectly @@ -426,29 +434,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * * (and similar for the older RFCs) */ - if (iap->ia_size != i_size_read(inode)) - iap->ia_valid |= ATTR_MTIME; - } + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; - iap->ia_valid |= ATTR_CTIME; + host_err = notify_change(dentry, &size_attr, NULL); + if (host_err) + goto out_unlock; + iap->ia_valid &= ~ATTR_SIZE; - if (check_guard && guardtime != inode->i_ctime.tv_sec) { - err = nfserr_notsync; - goto out_put_write_access; + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + goto out_unlock; } - fh_lock(fhp); + iap->ia_valid |= ATTR_CTIME; host_err = notify_change(dentry, iap, NULL); - fh_unlock(fhp); - err = nfserrno(host_err); -out_put_write_access: +out_unlock: + fh_unlock(fhp); if (size_change) put_write_access(inode); - if (!err) - err = nfserrno(commit_metadata(fhp)); out: - return err; + if (!host_err) + host_err = commit_metadata(fhp); + return nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) @@ -940,14 +955,12 @@ static int wait_for_concurrent_writes(struct file *file) __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *cnt, int *stablep) + unsigned long *cnt, int stable) { struct svc_export *exp; - struct inode *inode; mm_segment_t oldfs; __be32 err = 0; int host_err; - int stable = *stablep; int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; @@ -962,13 +975,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, */ current->flags |= PF_LESS_THROTTLE; - inode = file_inode(file); - exp = fhp->fh_export; - + exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); if (!EX_ISSYNC(exp)) - stable = 0; + stable = NFS_UNSTABLE; if (stable && !use_wgather) flags |= RWF_SYNC; @@ -1035,35 +1046,22 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, * N.B. 
After this call fhp needs an fh_put */ __be32 -nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, - int *stablep) +nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + struct kvec *vec, int vlen, unsigned long *cnt, int stable) { - __be32 err = 0; + struct file *file = NULL; + __be32 err = 0; trace_write_start(rqstp, fhp, offset, vlen); - if (file) { - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - trace_write_opened(rqstp, fhp, offset, vlen); - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, - stablep); - trace_write_io_done(rqstp, fhp, offset, vlen); - } else { - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); - if (err) - goto out; + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) + goto out; - trace_write_opened(rqstp, fhp, offset, vlen); - if (cnt) - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, - cnt, stablep); - trace_write_io_done(rqstp, fhp, offset, vlen); - fput(file); - } + trace_write_opened(rqstp, fhp, offset, vlen); + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); + trace_write_io_done(rqstp, fhp, offset, vlen); + fput(file); out: trace_write_done(rqstp, fhp, offset, vlen); return err; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 0bf9e7bf5800..db98c48c735a 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -83,12 +83,12 @@ __be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); -__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, - loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, + struct kvec *, int, unsigned long *, int); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, - int *stablep); + int stable); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 8a511c0985aa..20d157a518a7 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -204,8 +204,11 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd) kref_put(&h->ref, cd->cache_put); } -static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) +static inline bool cache_is_expired(struct cache_detail *detail, struct cache_head *h) { + if (!test_bit(CACHE_VALID, &h->flags)) + return false; + return (h->expiry_time < seconds_since_boot()) || (detail->flush_time >= h->last_refresh); } @@ -227,6 +230,7 @@ extern void sunrpc_destroy_cache_detail(struct cache_detail *cd); extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *, umode_t, struct cache_detail *); extern void sunrpc_cache_unregister_pipefs(struct cache_detail *); +extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *); /* Must store cache_detail in seq_file->private if using next three functions */ extern void *cache_seq_start(struct seq_file *file, loff_t *pos); diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index cfda6adcf33c..245fc59b7324 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ 
b/include/linux/sunrpc/rpc_rdma.h @@ -109,6 +109,15 @@ struct rpcrdma_msg { } rm_body; }; +/* + * XDR sizes, in quads + */ +enum { + rpcrdma_fixed_maxsz = 4, + rpcrdma_segment_maxsz = 4, + rpcrdma_readchunk_maxsz = 2 + rpcrdma_segment_maxsz, +}; + /* * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 7321ae933867..e770abeed32d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -400,10 +400,14 @@ struct svc_version { struct svc_procedure * vs_proc; /* per-procedure info */ u32 vs_xdrsize; /* xdrsize needed for this version */ - unsigned int vs_hidden : 1, /* Don't register with portmapper. - * Only used for nfsacl so far. */ - vs_rpcb_optnl:1;/* Don't care the result of register. - * Only used for nfsv4. */ + /* Don't register with rpcbind */ + bool vs_hidden; + + /* Don't care if the rpcbind registration fails */ + bool vs_rpcb_optnl; + + /* Need xprt with congestion control */ + bool vs_need_cong_ctrl; /* Override dispatch function (e.g. when caching replies). * A return value of 0 means drop the request. diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 757fb963696c..b105f73e3ca2 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -70,7 +70,7 @@ extern atomic_t rdma_stat_sq_prod; * completes. */ struct svc_rdma_op_ctxt { - struct list_head free; + struct list_head list; struct svc_rdma_op_ctxt *read_hdr; struct svc_rdma_fastreg_mr *frmr; int hdr_count; @@ -78,7 +78,6 @@ struct svc_rdma_op_ctxt { struct ib_cqe cqe; struct ib_cqe reg_cqe; struct ib_cqe inv_cqe; - struct list_head dto_q; u32 byte_len; u32 position; struct svcxprt_rdma *xprt; @@ -141,7 +140,8 @@ struct svcxprt_rdma { atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ unsigned int sc_rq_depth; /* Depth of RQ */ - u32 sc_max_requests; /* Forward credits */ + __be32 sc_fc_credits; /* Forward credits */ + u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ @@ -171,7 +171,6 @@ struct svcxprt_rdma { wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ unsigned long sc_flags; - struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */ struct list_head sc_read_complete_q; struct work_struct sc_work; }; @@ -214,11 +213,7 @@ extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, __be32, __be64, u32); -extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *, - struct rpcrdma_msg *, - struct rpcrdma_msg *, - enum rpcrdma_proc); -extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); +extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 7440290f64ac..ddb7f94a9d06 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -67,6 +67,7 @@ struct svc_xprt { #define XPT_CACHE_AUTH 11 /* cache auth info */ #define XPT_LOCAL 12 /* connection from loopback interface */ #define XPT_KILL_TEMP 13 /* call xpo_kill_temp_xprt before closing */ +#define XPT_CONG_CTRL 14 /* has congestion control */ struct svc_serv *xpt_server; /* 
service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 0df7bd5d2fb1..c3be256107c6 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -32,7 +32,8 @@ #define NFSEXP_ASYNC 0x0010 #define NFSEXP_GATHERED_WRITES 0x0020 #define NFSEXP_NOREADDIRPLUS 0x0040 -/* 80 100 currently unused */ +#define NFSEXP_SECURITY_LABEL 0x0080 +/* 0x100 currently unused */ #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ @@ -53,7 +54,7 @@ #define NFSEXP_PNFS 0x20000 /* All flags that we claim to support. (Note we don't support NOACL.) */ -#define NFSEXP_ALLFLAGS 0x3FE7F +#define NFSEXP_ALLFLAGS 0x3FEFF /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 153082598522..a54a7a3d28f5 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; - rsci->h.expiry_time = seconds_since_boot(); - set_bit(CACHE_NEGATIVE, &rsci->h.flags); + /* Delete the entry from the cache_list and call cache_put */ + sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); if (resv->iov_len + 4 > PAGE_SIZE) goto drop; svc_putnl(resv, RPC_SUCCESS); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index f39e3e11f9aa..d8639da06d9c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) cache_purge(cd); spin_lock(&cache_list_lock); write_lock(&cd->hash_lock); - if (cd->entries) { - write_unlock(&cd->hash_lock); - spin_unlock(&cache_list_lock); - goto out; - } if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); @@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) /* module must be being unloaded so its safe to kill the worker */ cancel_delayed_work_sync(&cache_cleaner); } - return; -out: - printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name); } EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); @@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { - time_t now = seconds_since_boot(); - if (detail->flush_time >= now) - now = detail->flush_time + 1; - /* 'now' is the maximum value any 'last_refresh' can have */ - detail->flush_time = now; - detail->nextcheck = seconds_since_boot(); - cache_flush(); + struct cache_head *ch = NULL; + struct hlist_head *head = NULL; + struct hlist_node *tmp = NULL; + int i = 0; + + write_lock(&detail->hash_lock); + if (!detail->entries) { + write_unlock(&detail->hash_lock); + return; + } + + dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name); + for (i = 0; i < detail->hash_size; i++) { + head = &detail->hash_table[i]; + hlist_for_each_entry_safe(ch, tmp, head, cache_list) { + hlist_del_init(&ch->cache_list); + detail->entries--; + + set_bit(CACHE_CLEANED, &ch->flags); + write_unlock(&detail->hash_lock); + cache_fresh_unlocked(ch, detail); + cache_put(ch, detail); + write_lock(&detail->hash_lock); + } + } + write_unlock(&detail->hash_lock); } EXPORT_SYMBOL_GPL(cache_purge); @@ -1855,3 +1866,15 @@ void 
sunrpc_cache_unregister_pipefs(struct cache_detail *cd) } EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); +void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) +{ + write_lock(&cd->hash_lock); + if (!hlist_unhashed(&h->cache_list)){ + hlist_del_init(&h->cache_list); + cd->entries--; + write_unlock(&cd->hash_lock); + cache_put(h, cd); + } else + write_unlock(&cd->hash_lock); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_unhash); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2e22889a8837..b94efd93d3e4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv) for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; - if (progp->pg_vers[i]->vs_hidden == 0) + if (!progp->pg_vers[i]->vs_hidden) return 1; } } @@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net, if (vers->vs_hidden) continue; + /* + * Don't register a UDP port if we need congestion + * control. + */ + if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP) + continue; + error = __svc_register(net, progp->pg_name, progp->pg_prog, i, family, proto, port); @@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) !(versp = progp->pg_vers[vers])) goto err_bad_vers; + /* + * Some protocol versions (namely NFSv4) require some form of + * congestion control. (See RFC 7530 section 3.1 paragraph 2) + * In other words, UDP is not allowed. We mark those when setting + * up the svc_xprt, and verify that here. + * + * The spec is not very clear about what error should be returned + * when someone tries to access a server that is listening on UDP + * for lower versions. RPC_PROG_MISMATCH seems to be the closest + * fit. + */ + if (versp->vs_need_cong_ctrl && + !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags)) + goto err_bad_vers; + procp = versp->vs_proc + proc; if (proc >= versp->vs_nproc || !procp->pc_func) goto err_bad_proc; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d227d97f7ad4..8931e33b6541 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class, &svsk->sk_xprt, serv); set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index cb1e48e54eb1..ff1df40f0d26 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -201,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + __be32 *p; int rc; /* Space in the send buffer for an RPC/RDMA header is reserved * via xprt->tsh_size. 
*/ - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); - headerp->rm_type = rdma_msg; - headerp->rm_body.rm_chunks[0] = xdr_zero; - headerp->rm_body.rm_chunks[1] = xdr_zero; - headerp->rm_body.rm_chunks[2] = xdr_zero; + p = rqst->rq_buffer; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 0ba9887f3e22..1c4aabf0f657 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016 Oracle. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -47,102 +48,43 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* - * Decodes a read chunk list. The expected format is as follows: - * descrim : xdr_one - * position : __be32 offset into XDR stream - * handle : __be32 RKEY - * . . . - * end-of-list: xdr_zero - */ -static __be32 *decode_read_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_read_list(__be32 *p, __be32 *end) { - struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + __be32 *next; - while (ch->rc_discrim != xdr_zero) { - if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > - (unsigned long)vaend) { - dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + while (*p++ != xdr_zero) { + next = p + rpcrdma_readchunk_maxsz - 1; + if (next > end) return NULL; - } - ch++; + p = next; } - return &ch->rc_position; + return p; } -/* - * Decodes a write chunk list. The expected format is as follows: - * descrim : xdr_one - * nchunks : - * handle : __be32 RKEY ---+ - * length : __be32 | - * offset : remove va + - * . . . 
| - * ---+ - */ -static __be32 *decode_write_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_write_list(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; + __be32 *next; - /* Check for not write-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + while (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - /* - * rs_length is the 2nd 4B field in wc_target and taking its - * address skips the list terminator - */ - return &ary->wc_array[nchunks].wc_target.rs_length; + return p; } -static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; - - /* Check for no reply-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + __be32 *next; + + if (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - return (__be32 *)&ary->wc_array[nchunks]; + return p; } /** @@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) */ int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { - struct rpcrdma_msg *rmsgp; - __be32 *va, *vaend; - unsigned int len; - u32 hdr_len; + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { - dprintk("svcrdma: header too short = %d\n", - rq_arg->len); - return -EINVAL; - } + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; - rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; - if (rmsgp->rm_vers != rpcrdma_version) { - dprintk("%s: bad version %u\n", __func__, - be32_to_cpu(rmsgp->rm_vers)); - return -EPROTONOSUPPORT; - } + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; - switch (be32_to_cpu(rmsgp->rm_type)) { - case RDMA_MSG: - case RDMA_NOMSG: + switch (*(rdma_argp + 3)) { + case rdma_msg: + case rdma_nomsg: break; - case RDMA_DONE: - /* Just drop it */ - dprintk("svcrdma: dropping RDMA_DONE message\n"); - return 0; - - case RDMA_ERROR: - /* Possible if this is a backchannel reply. - * XXX: We should cancel this XID, though. 
- */ - dprintk("svcrdma: dropping RDMA_ERROR message\n"); - return 0; - - case RDMA_MSGP: - /* Pull in the extra for the padded case, bump our pointer */ - rmsgp->rm_body.rm_padded.rm_align = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align); - rmsgp->rm_body.rm_padded.rm_thresh = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); - - va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rq_arg->head[0].iov_base = va; - len = (u32)((unsigned long)va - (unsigned long)rmsgp); - rq_arg->head[0].iov_len -= len; - if (len > rq_arg->len) - return -EINVAL; - return len; - default: - dprintk("svcrdma: bad rdma procedure (%u)\n", - be32_to_cpu(rmsgp->rm_type)); - return -EINVAL; - } + case rdma_done: + goto out_drop; - /* The chunk list may contain either a read chunk list or a write - * chunk list and a reply chunk list. - */ - va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); - va = decode_read_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode read list\n"); - return -EINVAL; - } - va = decode_write_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode write list\n"); - return -EINVAL; - } - va = decode_reply_array(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode reply chunk\n"); - return -EINVAL; + case rdma_error: + goto out_drop; + + default: + goto out_proc; } - rq_arg->head[0].iov_base = va; - hdr_len = (unsigned long)va - (unsigned long)rmsgp; + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; rq_arg->head[0].iov_len -= hdr_len; return hdr_len; + +out_short: + dprintk("svcrdma: header too short = %d\n", rq_arg->len); + return -EINVAL; + +out_version: + dprintk("svcrdma: bad xprt version: %u\n", + be32_to_cpup(rdma_argp + 1)); + return -EPROTONOSUPPORT; + +out_drop: + dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + return 0; + +out_proc: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpup(rdma_argp + 3)); + return -EINVAL; + +out_inval: + dprintk("svcrdma: failed to parse transport header\n"); + return -EINVAL; } int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, @@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, *va++ = rmsgp->rm_xid; *va++ = rmsgp->rm_vers; - *va++ = cpu_to_be32(xprt->sc_max_requests); + *va++ = xprt->sc_fc_credits; *va++ = rdma_error; *va++ = cpu_to_be32(err); if (err == ERR_VERS) { @@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, return (int)((unsigned long)va - (unsigned long)startp); } -int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +/** + * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header + * @rdma_resp: buffer containing Reply transport header + * + * Returns length of transport header, in bytes. + */ +unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) { - struct rpcrdma_write_array *wr_ary; + unsigned int nsegs; + __be32 *p; - /* There is no read-list in a reply */ + p = rdma_resp; - /* skip write list */ - wr_ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. 
- wc_target.rs_length; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - /* skip reply array */ - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - return (unsigned long) wr_ary - (unsigned long) rmsgp; + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; } void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) @@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, seg->rs_offset = rs_offset; seg->rs_length = cpu_to_be32(write_len); } - -void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rdma_argp, - struct rpcrdma_msg *rdma_resp, - enum rpcrdma_proc rdma_type) -{ - rdma_resp->rm_xid = rdma_argp->rm_xid; - rdma_resp->rm_vers = rdma_argp->rm_vers; - rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests); - rdma_resp->rm_type = cpu_to_be32(rdma_type); - - /* Encode chunks lists */ - rdma_resp->rm_body.rm_chunks[0] = xdr_zero; - rdma_resp->rm_body.rm_chunks[1] = xdr_zero; - rdma_resp->rm_body.rm_chunks[2] = xdr_zero; -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 172b537f8cfc..f7b2daf72a86 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dprintk("svcrdma: rqstp=%p\n", rqstp); - spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_lock(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { - ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); rdma_read_complete(rqstp, ctxt); goto complete; } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { - ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); } else { atomic_inc(&rdma_stat_rq_starve); clear_bit(XPT_DATA, &xprt->xpt_flags); ctxt = NULL; } - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); if (!ctxt) { /* This is the EAGAIN path. 
The svc_recv routine will * return -EAGAIN, the nfsd thread will go to call into diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index ad4d286a83c5..515221b16d09 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].length = + svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, ctxt->sge[0].length, DMA_TO_DEVICE); @@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct rpcrdma_msg *rdma_argp; struct rpcrdma_msg *rdma_resp; struct rpcrdma_write_array *wr_ary, *rp_ary; - enum rpcrdma_proc reply_type; int ret; int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; u32 inv_rkey; + __be32 *p; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!res_page) goto err0; rdma_resp = page_address(res_page); - if (rp_ary) - reply_type = RDMA_NOMSG; - else - reply_type = RDMA_MSG; - svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, - rdma_resp, reply_type); + + p = &rdma_resp->rm_xid; + *p++ = rdma_argp->rm_xid; + *p++ = rdma_argp->rm_vers; + *p++ = rdma->sc_fc_credits; + *p++ = rp_ary ? rdma_nomsg : rdma_msg; + + /* Start with empty chunks */ + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; /* Send any write-chunk data and build resp write-list */ if (wr_ary) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 39652d390a9c..c13a5c35ce14 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, ctxt = kmalloc(sizeof(*ctxt), flags); if (ctxt) { ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->free); - INIT_LIST_HEAD(&ctxt->dto_q); + INIT_LIST_HEAD(&ctxt->list); } return ctxt; } @@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) dprintk("svcrdma: No memory for RDMA ctxt\n"); return false; } - list_add(&ctxt->free, &xprt->sc_ctxts); + list_add(&ctxt->list, &xprt->sc_ctxts); } return true; } @@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used++; if (list_empty(&xprt->sc_ctxts)) goto out_empty; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del_init(&ctxt->free); - spin_unlock_bh(&xprt->sc_ctxt_lock); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&xprt->sc_ctxt_lock); out: ctxt->count = 0; @@ -209,15 +208,15 @@ out_empty: /* Either pre-allocation missed the mark, or send * queue accounting is broken. 
*/ - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); ctxt = alloc_ctxt(xprt, GFP_NOIO); if (ctxt) goto out; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); return NULL; } @@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - list_add(&ctxt->free, &xprt->sc_ctxts); - spin_unlock_bh(&xprt->sc_ctxt_lock); + list_add(&ctxt->list, &xprt->sc_ctxts); + spin_unlock(&xprt->sc_ctxt_lock); } static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) @@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del(&ctxt->free); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); kfree(ctxt); } } @@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* All wc fields are now known to be valid */ ctxt->byte_len = wc->byte_len; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q); spin_unlock(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); @@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc) read_hdr = ctxt->read_hdr; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&read_hdr->dto_q, + list_add_tail(&read_hdr->list, &xprt->sc_read_complete_q); spin_unlock(&xprt->sc_rq_dto_lock); @@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return NULL; svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); - INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); @@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_map_lock); + /* + * Note that this implies that the underlying transport support + * has some form of congestion control (see RFC 7530 section 3.1 + * paragraph 2). For now, we assume that all supported RDMA + * transports are suitable here. 
+ */ + set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags); + if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) { struct svc_rdma_fastreg_mr *frmr = NULL; - spin_lock_bh(&rdma->sc_frmr_q_lock); + spin_lock(&rdma->sc_frmr_q_lock); if (!list_empty(&rdma->sc_frmr_q)) { frmr = list_entry(rdma->sc_frmr_q.next, struct svc_rdma_fastreg_mr, frmr_list); list_del_init(&frmr->frmr_list); frmr->sg_nents = 0; } - spin_unlock_bh(&rdma->sc_frmr_q_lock); + spin_unlock(&rdma->sc_frmr_q_lock); if (frmr) return frmr; @@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, if (frmr) { ib_dma_unmap_sg(rdma->sc_cm_id->device, frmr->sg, frmr->sg_nents, frmr->direction); - spin_lock_bh(&rdma->sc_frmr_q_lock); + spin_lock(&rdma->sc_frmr_q_lock); WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); - spin_unlock_bh(&rdma->sc_frmr_q_lock); + spin_unlock(&rdma->sc_frmr_q_lock); } } @@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_requests); + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_bc_requests); newxprt->sc_rq_depth = newxprt->sc_max_requests + @@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_sq_cq)) { dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); goto errout; @@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work) */ while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(rdma->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } /* Destroy queued, but not processed recv completions */ while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(rdma->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); }