]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - drivers/staging/lustre/lustre/llite/namei.c
Merge tag 'trace-v4.12-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt...
[karo-tx-linux.git] / drivers / staging / lustre / lustre / llite / namei.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2015, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  */
32
33 #include <linux/fs.h>
34 #include <linux/sched.h>
35 #include <linux/mm.h>
36 #include <linux/quotaops.h>
37 #include <linux/highmem.h>
38 #include <linux/pagemap.h>
39 #include <linux/security.h>
40
41 #define DEBUG_SUBSYSTEM S_LLITE
42
43 #include "../include/obd_support.h"
44 #include "../include/lustre_fid.h"
45 #include "../include/lustre_dlm.h"
46 #include "../include/lustre_ver.h"
47 #include "llite_internal.h"
48
49 static int ll_create_it(struct inode *dir, struct dentry *dentry,
50                         struct lookup_intent *it);
51
52 /* called from iget5_locked->find_inode() under inode_hash_lock spinlock */
53 static int ll_test_inode(struct inode *inode, void *opaque)
54 {
55         struct ll_inode_info *lli = ll_i2info(inode);
56         struct lustre_md     *md = opaque;
57
58         if (unlikely(!(md->body->mbo_valid & OBD_MD_FLID))) {
59                 CERROR("MDS body missing FID\n");
60                 return 0;
61         }
62
63         if (!lu_fid_eq(&lli->lli_fid, &md->body->mbo_fid1))
64                 return 0;
65
66         return 1;
67 }
68
69 static int ll_set_inode(struct inode *inode, void *opaque)
70 {
71         struct ll_inode_info *lli = ll_i2info(inode);
72         struct mdt_body *body = ((struct lustre_md *)opaque)->body;
73
74         if (unlikely(!(body->mbo_valid & OBD_MD_FLID))) {
75                 CERROR("MDS body missing FID\n");
76                 return -EINVAL;
77         }
78
79         lli->lli_fid = body->mbo_fid1;
80         if (unlikely(!(body->mbo_valid & OBD_MD_FLTYPE))) {
81                 CERROR("Can not initialize inode " DFID
82                        " without object type: valid = %#llx\n",
83                        PFID(&lli->lli_fid), body->mbo_valid);
84                 return -EINVAL;
85         }
86
87         inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mbo_mode & S_IFMT);
88         if (unlikely(inode->i_mode == 0)) {
89                 CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid));
90                 return -EINVAL;
91         }
92
93         ll_lli_init(lli);
94
95         return 0;
96 }
97
98 /**
99  * Get an inode by inode number(@hash), which is already instantiated by
100  * the intent lookup).
101  */
102 struct inode *ll_iget(struct super_block *sb, ino_t hash,
103                       struct lustre_md *md)
104 {
105         struct inode     *inode;
106         int rc = 0;
107
108         LASSERT(hash != 0);
109         inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md);
110         if (!inode)
111                 return ERR_PTR(-ENOMEM);
112
113         if (inode->i_state & I_NEW) {
114                 rc = ll_read_inode2(inode, md);
115                 if (!rc && S_ISREG(inode->i_mode) &&
116                     !ll_i2info(inode)->lli_clob)
117                         rc = cl_file_inode_init(inode, md);
118
119                 if (rc) {
120                         /*
121                          * Let's clear directory lsm here, otherwise
122                          * make_bad_inode() will reset the inode mode
123                          * to regular, then ll_clear_inode will not
124                          * be able to clear lsm_md
125                          */
126                         if (S_ISDIR(inode->i_mode))
127                                 ll_dir_clear_lsm_md(inode);
128                         make_bad_inode(inode);
129                         unlock_new_inode(inode);
130                         iput(inode);
131                         inode = ERR_PTR(rc);
132                 } else {
133                         unlock_new_inode(inode);
134                 }
135         } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
136                 rc = ll_update_inode(inode, md);
137                 CDEBUG(D_VFSTRACE, "got inode: "DFID"(%p): rc = %d\n",
138                        PFID(&md->body->mbo_fid1), inode, rc);
139                 if (rc) {
140                         if (S_ISDIR(inode->i_mode))
141                                 ll_dir_clear_lsm_md(inode);
142                         iput(inode);
143                         inode = ERR_PTR(rc);
144                 }
145         }
146         return inode;
147 }
148
149 static void ll_invalidate_negative_children(struct inode *dir)
150 {
151         struct dentry *dentry, *tmp_subdir;
152
153         spin_lock(&dir->i_lock);
154         hlist_for_each_entry(dentry, &dir->i_dentry, d_u.d_alias) {
155                 spin_lock(&dentry->d_lock);
156                 if (!list_empty(&dentry->d_subdirs)) {
157                         struct dentry *child;
158
159                         list_for_each_entry_safe(child, tmp_subdir,
160                                                  &dentry->d_subdirs,
161                                                  d_child) {
162                                 if (d_really_is_negative(child))
163                                         d_lustre_invalidate(child, 1);
164                         }
165                 }
166                 spin_unlock(&dentry->d_lock);
167         }
168         spin_unlock(&dir->i_lock);
169 }
170
171 int ll_test_inode_by_fid(struct inode *inode, void *opaque)
172 {
173         return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque);
174 }
175
176 int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
177                        void *data, int flag)
178 {
179         struct lustre_handle lockh;
180         int rc;
181
182         switch (flag) {
183         case LDLM_CB_BLOCKING:
184                 ldlm_lock2handle(lock, &lockh);
185                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
186                 if (rc < 0) {
187                         CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc);
188                         return rc;
189                 }
190                 break;
191         case LDLM_CB_CANCELING: {
192                 struct inode *inode = ll_inode_from_resource_lock(lock);
193                 __u64 bits = lock->l_policy_data.l_inodebits.bits;
194
195                 /* Inode is set to lock->l_resource->lr_lvb_inode
196                  * for mdc - bug 24555
197                  */
198                 LASSERT(!lock->l_ast_data);
199
200                 if (!inode)
201                         break;
202
203                 /* Invalidate all dentries associated with this inode */
204                 LASSERT(ldlm_is_canceling(lock));
205
206                 if (!fid_res_name_eq(ll_inode2fid(inode),
207                                      &lock->l_resource->lr_name)) {
208                         LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)",
209                                    PFID(ll_inode2fid(inode)), inode);
210                         LBUG();
211                 }
212
213                 if (bits & MDS_INODELOCK_XATTR) {
214                         if (S_ISDIR(inode->i_mode))
215                                 ll_i2info(inode)->lli_def_stripe_offset = -1;
216                         ll_xattr_cache_destroy(inode);
217                         bits &= ~MDS_INODELOCK_XATTR;
218                 }
219
220                 /* For OPEN locks we differentiate between lock modes
221                  * LCK_CR, LCK_CW, LCK_PR - bug 22891
222                  */
223                 if (bits & MDS_INODELOCK_OPEN)
224                         ll_have_md_lock(inode, &bits, lock->l_req_mode);
225
226                 if (bits & MDS_INODELOCK_OPEN) {
227                         fmode_t fmode;
228
229                         switch (lock->l_req_mode) {
230                         case LCK_CW:
231                                 fmode = FMODE_WRITE;
232                                 break;
233                         case LCK_PR:
234                                 fmode = FMODE_EXEC;
235                                 break;
236                         case LCK_CR:
237                                 fmode = FMODE_READ;
238                                 break;
239                         default:
240                                 LDLM_ERROR(lock, "bad lock mode for OPEN lock");
241                                 LBUG();
242                         }
243
244                         ll_md_real_close(inode, fmode);
245                 }
246
247                 if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
248                             MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
249                         ll_have_md_lock(inode, &bits, LCK_MINMODE);
250
251                 if (bits & MDS_INODELOCK_LAYOUT) {
252                         struct cl_object_conf conf = {
253                                 .coc_opc = OBJECT_CONF_INVALIDATE,
254                                 .coc_inode = inode,
255                         };
256
257                         rc = ll_layout_conf(inode, &conf);
258                         if (rc < 0)
259                                 CDEBUG(D_INODE, "cannot invalidate layout of "
260                                        DFID": rc = %d\n",
261                                        PFID(ll_inode2fid(inode)), rc);
262                 }
263
264                 if (bits & MDS_INODELOCK_UPDATE) {
265                         struct ll_inode_info *lli = ll_i2info(inode);
266
267                         spin_lock(&lli->lli_lock);
268                         LTIME_S(inode->i_mtime) = 0;
269                         LTIME_S(inode->i_atime) = 0;
270                         LTIME_S(inode->i_ctime) = 0;
271                         spin_unlock(&lli->lli_lock);
272                 }
273
274                 if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) {
275                         struct ll_inode_info *lli = ll_i2info(inode);
276
277                         CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, pfid  = "DFID"\n",
278                                PFID(ll_inode2fid(inode)), lli,
279                                PFID(&lli->lli_pfid));
280
281                         truncate_inode_pages(inode->i_mapping, 0);
282
283                         if (unlikely(!fid_is_zero(&lli->lli_pfid))) {
284                                 struct inode *master_inode = NULL;
285                                 unsigned long hash;
286
287                                 /*
288                                  * This is slave inode, since all of the child
289                                  * dentry is connected on the master inode, so
290                                  * we have to invalidate the negative children
291                                  * on master inode
292                                  */
293                                 CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n",
294                                        PFID(ll_inode2fid(inode)),
295                                        PFID(&lli->lli_pfid));
296
297                                 hash = cl_fid_build_ino(&lli->lli_pfid,
298                                                         ll_need_32bit_api(ll_i2sbi(inode)));
299                                 /*
300                                  * Do not lookup the inode with ilookup5,
301                                  * otherwise it will cause dead lock,
302                                  *
303                                  * 1. Client1 send chmod req to the MDT0, then
304                                  * on MDT0, it enqueues master and all of its
305                                  * slaves lock, (mdt_attr_set() ->
306                                  * mdt_lock_slaves()), after gets master and
307                                  * stripe0 lock, it will send the enqueue req
308                                  * (for stripe1) to MDT1, then MDT1 finds the
309                                  * lock has been granted to client2. Then MDT1
310                                  * sends blocking ast to client2.
311                                  *
312                                  * 2. At the same time, client2 tries to unlink
313                                  * the striped dir (rm -rf striped_dir), and
314                                  * during lookup, it will hold the master inode
315                                  * of the striped directory, whose inode state
316                                  * is NEW, then tries to revalidate all of its
317                                  * slaves, (ll_prep_inode()->ll_iget()->
318                                  * ll_read_inode2()-> ll_update_inode().). And
319                                  * it will be blocked on the server side because
320                                  * of 1.
321                                  *
322                                  * 3. Then the client get the blocking_ast req,
323                                  * cancel the lock, but being blocked if using
324                                  * ->ilookup5()), because master inode state is
325                                  *  NEW.
326                                  */
327                                 master_inode = ilookup5_nowait(inode->i_sb,
328                                                                hash,
329                                                                ll_test_inode_by_fid,
330                                                                (void *)&lli->lli_pfid);
331                                 if (master_inode) {
332                                         ll_invalidate_negative_children(master_inode);
333                                         iput(master_inode);
334                                 }
335                         } else {
336                                 ll_invalidate_negative_children(inode);
337                         }
338                 }
339
340                 if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) &&
341                     inode->i_sb->s_root &&
342                     !is_root_inode(inode))
343                         ll_invalidate_aliases(inode);
344
345                 iput(inode);
346                 break;
347         }
348         default:
349                 LBUG();
350         }
351
352         return 0;
353 }
354
355 __u32 ll_i2suppgid(struct inode *i)
356 {
357         if (in_group_p(i->i_gid))
358                 return (__u32)from_kgid(&init_user_ns, i->i_gid);
359         else
360                 return (__u32)(-1);
361 }
362
363 /* Pack the required supplementary groups into the supplied groups array.
364  * If we don't need to use the groups from the target inode(s) then we
365  * instead pack one or more groups from the user's supplementary group
366  * array in case it might be useful.  Not needed if doing an MDS-side upcall.
367  */
368 void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2)
369 {
370         LASSERT(i1);
371
372         suppgids[0] = ll_i2suppgid(i1);
373
374         if (i2)
375                 suppgids[1] = ll_i2suppgid(i2);
376                 else
377                         suppgids[1] = -1;
378 }
379
380 /*
381  * try to reuse three types of dentry:
382  * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid
383  *    by concurrent .revalidate).
384  * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may
385  *    be cleared by others calling d_lustre_revalidate).
386  * 3. DISCONNECTED alias.
387  */
388 static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry)
389 {
390         struct dentry *alias, *discon_alias, *invalid_alias;
391
392         if (hlist_empty(&inode->i_dentry))
393                 return NULL;
394
395         discon_alias = NULL;
396         invalid_alias = NULL;
397
398         spin_lock(&inode->i_lock);
399         hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
400                 LASSERT(alias != dentry);
401
402                 spin_lock(&alias->d_lock);
403                 if ((alias->d_flags & DCACHE_DISCONNECTED) &&
404                     S_ISDIR(inode->i_mode))
405                         /* LASSERT(last_discon == NULL); LU-405, bz 20055 */
406                         discon_alias = alias;
407                 else if (alias->d_parent == dentry->d_parent         &&
408                          alias->d_name.hash == dentry->d_name.hash       &&
409                          alias->d_name.len == dentry->d_name.len         &&
410                          memcmp(alias->d_name.name, dentry->d_name.name,
411                                 dentry->d_name.len) == 0)
412                         invalid_alias = alias;
413                 spin_unlock(&alias->d_lock);
414
415                 if (invalid_alias)
416                         break;
417         }
418         alias = invalid_alias ?: discon_alias ?: NULL;
419         if (alias) {
420                 spin_lock(&alias->d_lock);
421                 dget_dlock(alias);
422                 spin_unlock(&alias->d_lock);
423         }
424         spin_unlock(&inode->i_lock);
425
426         return alias;
427 }
428
429 /*
430  * Similar to d_splice_alias(), but lustre treats invalid alias
431  * similar to DCACHE_DISCONNECTED, and tries to use it anyway.
432  */
433 struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de)
434 {
435         if (inode) {
436                 struct dentry *new = ll_find_alias(inode, de);
437
438                 if (new) {
439                         d_move(new, de);
440                         iput(inode);
441                         CDEBUG(D_DENTRY,
442                                "Reuse dentry %p inode %p refc %d flags %#x\n",
443                               new, d_inode(new), d_count(new), new->d_flags);
444                         return new;
445                 }
446         }
447         d_add(de, inode);
448         CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n",
449                de, d_inode(de), d_count(de), de->d_flags);
450         return de;
451 }
452
453 static int ll_lookup_it_finish(struct ptlrpc_request *request,
454                                struct lookup_intent *it,
455                                struct inode *parent, struct dentry **de)
456 {
457         struct inode *inode = NULL;
458         __u64 bits = 0;
459         int rc = 0;
460         struct dentry *alias;
461
462         /* NB 1 request reference will be taken away by ll_intent_lock()
463          * when I return
464          */
465         CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it,
466                it->it_disposition);
467         if (!it_disposition(it, DISP_LOOKUP_NEG)) {
468                 rc = ll_prep_inode(&inode, request, (*de)->d_sb, it);
469                 if (rc)
470                         return rc;
471
472                 ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits);
473
474                 /* We used to query real size from OSTs here, but actually
475                  * this is not needed. For stat() calls size would be updated
476                  * from subsequent do_revalidate()->ll_inode_revalidate_it() in
477                  * 2.4 and
478                  * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6
479                  * Everybody else who needs correct file size would call
480                  * ll_glimpse_size or some equivalent themselves anyway.
481                  * Also see bug 7198.
482                  */
483         }
484
485         alias = ll_splice_alias(inode, *de);
486         if (IS_ERR(alias)) {
487                 rc = PTR_ERR(alias);
488                 goto out;
489         }
490         *de = alias;
491
492         if (!it_disposition(it, DISP_LOOKUP_NEG)) {
493                 /* we have lookup look - unhide dentry */
494                 if (bits & MDS_INODELOCK_LOOKUP)
495                         d_lustre_revalidate(*de);
496         } else if (!it_disposition(it, DISP_OPEN_CREATE)) {
497                 /* If file created on server, don't depend on parent UPDATE
498                  * lock to unhide it. It is left hidden and next lookup can
499                  * find it in ll_splice_alias.
500                  */
501                 /* Check that parent has UPDATE lock. */
502                 struct lookup_intent parent_it = {
503                                         .it_op = IT_GETATTR,
504                                         .it_lock_handle = 0 };
505                 struct lu_fid fid = ll_i2info(parent)->lli_fid;
506
507                 /* If it is striped directory, get the real stripe parent */
508                 if (unlikely(ll_i2info(parent)->lli_lsm_md)) {
509                         rc = md_get_fid_from_lsm(ll_i2mdexp(parent),
510                                                  ll_i2info(parent)->lli_lsm_md,
511                                                  (*de)->d_name.name,
512                                                  (*de)->d_name.len, &fid);
513                         if (rc)
514                                 return rc;
515                 }
516
517                 if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, &fid,
518                                        NULL)) {
519                         d_lustre_revalidate(*de);
520                         ll_intent_release(&parent_it);
521                 }
522         }
523
524 out:
525         if (rc != 0 && it->it_op & IT_OPEN)
526                 ll_open_cleanup((*de)->d_sb, request);
527
528         return rc;
529 }
530
531 static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
532                                    struct lookup_intent *it)
533 {
534         struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
535         struct dentry *save = dentry, *retval;
536         struct ptlrpc_request *req = NULL;
537         struct md_op_data *op_data = NULL;
538         struct inode *inode;
539         __u32 opc;
540         int rc;
541
542         if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen)
543                 return ERR_PTR(-ENAMETOOLONG);
544
545         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p),intent=%s\n",
546                dentry, PFID(ll_inode2fid(parent)), parent, LL_IT2STR(it));
547
548         if (d_mountpoint(dentry))
549                 CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
550
551         if (!it || it->it_op == IT_GETXATTR)
552                 it = &lookup_it;
553
554         if (it->it_op == IT_GETATTR && dentry_may_statahead(parent, dentry)) {
555                 rc = ll_statahead(parent, &dentry, 0);
556                 if (rc == 1) {
557                         if (dentry == save)
558                                 retval = NULL;
559                         else
560                                 retval = dentry;
561                         goto out;
562                 }
563         }
564
565         if (it->it_op & IT_OPEN && it->it_flags & FMODE_WRITE &&
566             dentry->d_sb->s_flags & MS_RDONLY)
567                 return ERR_PTR(-EROFS);
568
569         if (it->it_op & IT_CREAT)
570                 opc = LUSTRE_OPC_CREATE;
571         else
572                 opc = LUSTRE_OPC_ANY;
573
574         op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name,
575                                      dentry->d_name.len, 0, opc, NULL);
576         if (IS_ERR(op_data))
577                 return (void *)op_data;
578
579         /* enforce umask if acl disabled or MDS doesn't support umask */
580         if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
581                 it->it_create_mode &= ~current_umask();
582
583         rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req,
584                             &ll_md_blocking_ast, 0);
585         /*
586          * If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the
587          * client does not know which suppgid should be sent to the MDS, or
588          * some other(s) changed the target file's GID after this RPC sent
589          * to the MDS with the suppgid as the original GID, then we should
590          * try again with right suppgid.
591          */
592         if (rc == -EACCES && it->it_op & IT_OPEN &&
593             it_disposition(it, DISP_OPEN_DENY)) {
594                 struct mdt_body *body;
595
596                 LASSERT(req);
597
598                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
599                 if (op_data->op_suppgids[0] == body->mbo_gid ||
600                     op_data->op_suppgids[1] == body->mbo_gid ||
601                     !in_group_p(make_kgid(&init_user_ns, body->mbo_gid))) {
602                         retval = ERR_PTR(-EACCES);
603                         goto out;
604                 }
605
606                 fid_zero(&op_data->op_fid2);
607                 op_data->op_suppgids[1] = body->mbo_gid;
608                 ptlrpc_req_finished(req);
609                 req = NULL;
610                 ll_intent_release(it);
611                 rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req,
612                                     ll_md_blocking_ast, 0);
613         }
614
615         if (rc < 0) {
616                 retval = ERR_PTR(rc);
617                 goto out;
618         }
619
620         rc = ll_lookup_it_finish(req, it, parent, &dentry);
621         if (rc != 0) {
622                 ll_intent_release(it);
623                 retval = ERR_PTR(rc);
624                 goto out;
625         }
626
627         inode = d_inode(dentry);
628         if ((it->it_op & IT_OPEN) && inode &&
629             !S_ISREG(inode->i_mode) &&
630             !S_ISDIR(inode->i_mode)) {
631                 ll_release_openhandle(inode, it);
632         }
633         ll_lookup_finish_locks(it, inode);
634
635         if (dentry == save)
636                 retval = NULL;
637         else
638                 retval = dentry;
639 out:
640         if (op_data && !IS_ERR(op_data))
641                 ll_finish_md_op_data(op_data);
642
643         ptlrpc_req_finished(req);
644         return retval;
645 }
646
647 static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
648                                    unsigned int flags)
649 {
650         struct lookup_intent *itp, it = { .it_op = IT_GETATTR };
651         struct dentry *de;
652
653         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p),flags=%u\n",
654                dentry, PFID(ll_inode2fid(parent)), parent, flags);
655
656         /* Optimize away (CREATE && !OPEN). Let .create handle the race.
657          * but only if we have write permissions there, otherwise we need
658          * to proceed with lookup. LU-4185
659          */
660         if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) &&
661             (inode_permission(parent, MAY_WRITE | MAY_EXEC) == 0))
662                 return NULL;
663
664         if (flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE))
665                 itp = NULL;
666         else
667                 itp = &it;
668         de = ll_lookup_it(parent, dentry, itp);
669
670         if (itp)
671                 ll_intent_release(itp);
672
673         return de;
674 }
675
676 /*
677  * For cached negative dentry and new dentry, handle lookup/create/open
678  * together.
679  */
680 static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
681                           struct file *file, unsigned open_flags,
682                           umode_t mode, int *opened)
683 {
684         struct lookup_intent *it;
685         struct dentry *de;
686         int rc = 0;
687
688         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p),file %p,open_flags %x,mode %x opened %d\n",
689                dentry, PFID(ll_inode2fid(dir)), dir, file, open_flags, mode,
690                *opened);
691
692         /* Only negative dentries enter here */
693         LASSERT(!d_inode(dentry));
694
695         if (!d_in_lookup(dentry)) {
696                 /* A valid negative dentry that just passed revalidation,
697                  * there's little point to try and open it server-side,
698                  * even though there's a minuscle chance it might succeed.
699                  * Either way it's a valid race to just return -ENOENT here.
700                  */
701                 if (!(open_flags & O_CREAT))
702                         return -ENOENT;
703
704                 /* Otherwise we just unhash it to be rehashed afresh via
705                  * lookup if necessary
706                  */
707                 d_drop(dentry);
708         }
709
710         it = kzalloc(sizeof(*it), GFP_NOFS);
711         if (!it)
712                 return -ENOMEM;
713
714         it->it_op = IT_OPEN;
715         if (open_flags & O_CREAT)
716                 it->it_op |= IT_CREAT;
717         it->it_create_mode = (mode & S_IALLUGO) | S_IFREG;
718         it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags);
719         it->it_flags &= ~MDS_OPEN_FL_INTERNAL;
720
721         /* Dentry added to dcache tree in ll_lookup_it */
722         de = ll_lookup_it(dir, dentry, it);
723         if (IS_ERR(de))
724                 rc = PTR_ERR(de);
725         else if (de)
726                 dentry = de;
727
728         if (!rc) {
729                 if (it_disposition(it, DISP_OPEN_CREATE)) {
730                         /* Dentry instantiated in ll_create_it. */
731                         rc = ll_create_it(dir, dentry, it);
732                         if (rc) {
733                                 /* We dget in ll_splice_alias. */
734                                 if (de)
735                                         dput(de);
736                                 goto out_release;
737                         }
738
739                         *opened |= FILE_CREATED;
740                 }
741                 if (d_really_is_positive(dentry) && it_disposition(it, DISP_OPEN_OPEN)) {
742                         /* Open dentry. */
743                         if (S_ISFIFO(d_inode(dentry)->i_mode)) {
744                                 /* We cannot call open here as it might
745                                  * deadlock. This case is unreachable in
746                                  * practice because of OBD_CONNECT_NODEVOH.
747                                  */
748                                 rc = finish_no_open(file, de);
749                         } else {
750                                 file->private_data = it;
751                                 rc = finish_open(file, dentry, NULL, opened);
752                                 /* We dget in ll_splice_alias. finish_open takes
753                                  * care of dget for fd open.
754                                  */
755                                 if (de)
756                                         dput(de);
757                         }
758                 } else {
759                         rc = finish_no_open(file, de);
760                 }
761         }
762
763 out_release:
764         ll_intent_release(it);
765         kfree(it);
766
767         return rc;
768 }
769
770 /* We depend on "mode" being set with the proper file type/umask by now */
771 static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it)
772 {
773         struct inode *inode = NULL;
774         struct ptlrpc_request *request = NULL;
775         struct ll_sb_info *sbi = ll_i2sbi(dir);
776         int rc;
777
778         LASSERT(it && it->it_disposition);
779
780         LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF));
781         request = it->it_request;
782         it_clear_disposition(it, DISP_ENQ_CREATE_REF);
783         rc = ll_prep_inode(&inode, request, dir->i_sb, it);
784         if (rc) {
785                 inode = ERR_PTR(rc);
786                 goto out;
787         }
788
789         LASSERT(hlist_empty(&inode->i_dentry));
790
791         /* We asked for a lock on the directory, but were granted a
792          * lock on the inode.  Since we finally have an inode pointer,
793          * stuff it in the lock.
794          */
795         CDEBUG(D_DLMTRACE, "setting l_ast_data to inode "DFID"(%p)\n",
796                PFID(ll_inode2fid(dir)), inode);
797         ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
798  out:
799         ptlrpc_req_finished(request);
800         return inode;
801 }
802
803 /*
804  * By the time this is called, we already have created the directory cache
805  * entry for the new file, but it is so far negative - it has no inode.
806  *
807  * We defer creating the OBD object(s) until open, to keep the intent and
808  * non-intent code paths similar, and also because we do not have the MDS
809  * inode number before calling ll_create_node() (which is needed for LOV),
810  * so we would need to do yet another RPC to the MDS to store the LOV EA
811  * data on the MDS.  If needed, we would pass the PACKED lmm as data and
812  * lmm_size in datalen (the MDS still has code which will handle that).
813  *
814  * If the create succeeds, we fill in the inode information
815  * with d_instantiate().
816  */
817 static int ll_create_it(struct inode *dir, struct dentry *dentry,
818                         struct lookup_intent *it)
819 {
820         struct inode *inode;
821         int rc = 0;
822
823         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), intent=%s\n",
824                dentry, PFID(ll_inode2fid(dir)), dir, LL_IT2STR(it));
825
826         rc = it_open_error(DISP_OPEN_CREATE, it);
827         if (rc)
828                 return rc;
829
830         inode = ll_create_node(dir, it);
831         if (IS_ERR(inode))
832                 return PTR_ERR(inode);
833
834         d_instantiate(dentry, inode);
835
836         return ll_init_security(dentry, inode, dir);
837 }
838
839 void ll_update_times(struct ptlrpc_request *request, struct inode *inode)
840 {
841         struct mdt_body *body = req_capsule_server_get(&request->rq_pill,
842                                                        &RMF_MDT_BODY);
843
844         LASSERT(body);
845         if (body->mbo_valid & OBD_MD_FLMTIME &&
846             body->mbo_mtime > LTIME_S(inode->i_mtime)) {
847                 CDEBUG(D_INODE, "setting fid "DFID" mtime from %lu to %llu\n",
848                        PFID(ll_inode2fid(inode)), LTIME_S(inode->i_mtime),
849                        body->mbo_mtime);
850                 LTIME_S(inode->i_mtime) = body->mbo_mtime;
851         }
852         if (body->mbo_valid & OBD_MD_FLCTIME &&
853             body->mbo_ctime > LTIME_S(inode->i_ctime))
854                 LTIME_S(inode->i_ctime) = body->mbo_ctime;
855 }
856
857 static int ll_new_node(struct inode *dir, struct dentry *dentry,
858                        const char *tgt, umode_t mode, int rdev,
859                        __u32 opc)
860 {
861         struct ptlrpc_request *request = NULL;
862         struct md_op_data *op_data;
863         struct inode *inode = NULL;
864         struct ll_sb_info *sbi = ll_i2sbi(dir);
865         int tgt_len = 0;
866         int err;
867
868         if (unlikely(tgt))
869                 tgt_len = strlen(tgt) + 1;
870 again:
871         op_data = ll_prep_md_op_data(NULL, dir, NULL,
872                                      dentry->d_name.name,
873                                      dentry->d_name.len,
874                                      0, opc, NULL);
875         if (IS_ERR(op_data)) {
876                 err = PTR_ERR(op_data);
877                 goto err_exit;
878         }
879
880         err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode,
881                         from_kuid(&init_user_ns, current_fsuid()),
882                         from_kgid(&init_user_ns, current_fsgid()),
883                         cfs_curproc_cap_pack(), rdev, &request);
884         ll_finish_md_op_data(op_data);
885         if (err < 0 && err != -EREMOTE)
886                 goto err_exit;
887
888         /*
889          * If the client doesn't know where to create a subdirectory (or
890          * in case of a race that sends the RPC to the wrong MDS), the
891          * MDS will return -EREMOTE and the client will fetch the layout
892          * of the directory, then create the directory on the right MDT.
893          */
894         if (unlikely(err == -EREMOTE)) {
895                 struct ll_inode_info *lli = ll_i2info(dir);
896                 struct lmv_user_md *lum;
897                 int lumsize, err2;
898
899                 ptlrpc_req_finished(request);
900                 request = NULL;
901
902                 err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request,
903                                         OBD_MD_DEFAULT_MEA);
904                 if (!err2) {
905                         /* Update stripe_offset and retry */
906                         lli->lli_def_stripe_offset = lum->lum_stripe_offset;
907                 } else if (err2 == -ENODATA &&
908                            lli->lli_def_stripe_offset != -1) {
909                         /*
910                          * If there are no default stripe EA on the MDT, but the
911                          * client has default stripe, then it probably means
912                          * default stripe EA has just been deleted.
913                          */
914                         lli->lli_def_stripe_offset = -1;
915                 } else {
916                         goto err_exit;
917                 }
918
919                 ptlrpc_req_finished(request);
920                 request = NULL;
921                 goto again;
922         }
923
924         ll_update_times(request, dir);
925
926         err = ll_prep_inode(&inode, request, dir->i_sb, NULL);
927         if (err)
928                 goto err_exit;
929
930         d_instantiate(dentry, inode);
931
932         err = ll_init_security(dentry, inode, dir);
933 err_exit:
934         if (request)
935                 ptlrpc_req_finished(request);
936
937         return err;
938 }
939
940 static int ll_mknod(struct inode *dir, struct dentry *dchild,
941                     umode_t mode, dev_t rdev)
942 {
943         int err;
944
945         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p) mode %o dev %x\n",
946                dchild, PFID(ll_inode2fid(dir)), dir, mode,
947                old_encode_dev(rdev));
948
949         if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
950                 mode &= ~current_umask();
951
952         switch (mode & S_IFMT) {
953         case 0:
954                 mode |= S_IFREG; /* for mode = 0 case, fallthrough */
955         case S_IFREG:
956         case S_IFCHR:
957         case S_IFBLK:
958         case S_IFIFO:
959         case S_IFSOCK:
960                 err = ll_new_node(dir, dchild, NULL, mode,
961                                   old_encode_dev(rdev),
962                                   LUSTRE_OPC_MKNOD);
963                 break;
964         case S_IFDIR:
965                 err = -EPERM;
966                 break;
967         default:
968                 err = -EINVAL;
969         }
970
971         if (!err)
972                 ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1);
973
974         return err;
975 }
976
977 /*
978  * Plain create. Intent create is handled in atomic_open.
979  */
980 static int ll_create_nd(struct inode *dir, struct dentry *dentry,
981                         umode_t mode, bool want_excl)
982 {
983         int rc;
984
985         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p), flags=%u, excl=%d\n",
986                dentry, PFID(ll_inode2fid(dir)), dir, mode, want_excl);
987
988         rc = ll_mknod(dir, dentry, mode, 0);
989
990         ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1);
991
992         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, unhashed %d\n",
993                dentry, d_unhashed(dentry));
994
995         return rc;
996 }
997
998 static int ll_unlink(struct inode *dir, struct dentry *dchild)
999 {
1000         struct ptlrpc_request *request = NULL;
1001         struct md_op_data *op_data;
1002         int rc;
1003
1004         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n",
1005                dchild, dir->i_ino, dir->i_generation, dir);
1006
1007         op_data = ll_prep_md_op_data(NULL, dir, NULL,
1008                                      dchild->d_name.name,
1009                                      dchild->d_name.len,
1010                                      0, LUSTRE_OPC_ANY, NULL);
1011         if (IS_ERR(op_data))
1012                 return PTR_ERR(op_data);
1013
1014         if (dchild->d_inode)
1015                 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
1016
1017         op_data->op_fid2 = op_data->op_fid3;
1018         rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
1019         ll_finish_md_op_data(op_data);
1020         if (rc)
1021                 goto out;
1022
1023         ll_update_times(request, dir);
1024         ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
1025
1026  out:
1027         ptlrpc_req_finished(request);
1028         return rc;
1029 }
1030
1031 static int ll_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1032 {
1033         int err;
1034
1035         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir"DFID"(%p)\n",
1036                dentry, PFID(ll_inode2fid(dir)), dir);
1037
1038         if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
1039                 mode &= ~current_umask();
1040         mode = (mode & (0777 | S_ISVTX)) | S_IFDIR;
1041
1042         err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR);
1043         if (!err)
1044                 ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1);
1045
1046         return err;
1047 }
1048
1049 static int ll_rmdir(struct inode *dir, struct dentry *dchild)
1050 {
1051         struct ptlrpc_request *request = NULL;
1052         struct md_op_data *op_data;
1053         int rc;
1054
1055         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p)\n",
1056                dchild, PFID(ll_inode2fid(dir)), dir);
1057
1058         op_data = ll_prep_md_op_data(NULL, dir, NULL,
1059                                      dchild->d_name.name,
1060                                      dchild->d_name.len,
1061                                      S_IFDIR, LUSTRE_OPC_ANY, NULL);
1062         if (IS_ERR(op_data))
1063                 return PTR_ERR(op_data);
1064
1065         if (dchild->d_inode)
1066                 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
1067
1068         op_data->op_fid2 = op_data->op_fid3;
1069         rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
1070         ll_finish_md_op_data(op_data);
1071         if (rc == 0) {
1072                 ll_update_times(request, dir);
1073                 ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
1074         }
1075
1076         ptlrpc_req_finished(request);
1077         return rc;
1078 }
1079
1080 static int ll_symlink(struct inode *dir, struct dentry *dentry,
1081                       const char *oldname)
1082 {
1083         int err;
1084
1085         CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p),target=%.*s\n",
1086                dentry, PFID(ll_inode2fid(dir)), dir, 3000, oldname);
1087
1088         err = ll_new_node(dir, dentry, oldname, S_IFLNK | 0777,
1089                           0, LUSTRE_OPC_SYMLINK);
1090
1091         if (!err)
1092                 ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1);
1093
1094         return err;
1095 }
1096
1097 static int ll_link(struct dentry *old_dentry, struct inode *dir,
1098                    struct dentry *new_dentry)
1099 {
1100         struct inode *src = d_inode(old_dentry);
1101         struct ll_sb_info *sbi = ll_i2sbi(dir);
1102         struct ptlrpc_request *request = NULL;
1103         struct md_op_data *op_data;
1104         int err;
1105
1106         CDEBUG(D_VFSTRACE, "VFS Op: inode="DFID"(%p), dir="DFID"(%p), target=%pd\n",
1107                PFID(ll_inode2fid(src)), src, PFID(ll_inode2fid(dir)), dir,
1108                new_dentry);
1109
1110         op_data = ll_prep_md_op_data(NULL, src, dir, new_dentry->d_name.name,
1111                                      new_dentry->d_name.len,
1112                                      0, LUSTRE_OPC_ANY, NULL);
1113         if (IS_ERR(op_data))
1114                 return PTR_ERR(op_data);
1115
1116         err = md_link(sbi->ll_md_exp, op_data, &request);
1117         ll_finish_md_op_data(op_data);
1118         if (err)
1119                 goto out;
1120
1121         ll_update_times(request, dir);
1122         ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1);
1123 out:
1124         ptlrpc_req_finished(request);
1125         return err;
1126 }
1127
1128 static int ll_rename(struct inode *src, struct dentry *src_dchild,
1129                      struct inode *tgt, struct dentry *tgt_dchild,
1130                      unsigned int flags)
1131 {
1132         struct ptlrpc_request *request = NULL;
1133         struct ll_sb_info *sbi = ll_i2sbi(src);
1134         struct md_op_data *op_data;
1135         int err;
1136
1137         if (flags)
1138                 return -EINVAL;
1139
1140         CDEBUG(D_VFSTRACE,
1141                "VFS Op:oldname=%pd, src_dir="DFID"(%p), newname=%pd, tgt_dir="DFID"(%p)\n",
1142                src_dchild, PFID(ll_inode2fid(src)), src,
1143                tgt_dchild, PFID(ll_inode2fid(tgt)), tgt);
1144
1145         op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0,
1146                                      LUSTRE_OPC_ANY, NULL);
1147         if (IS_ERR(op_data))
1148                 return PTR_ERR(op_data);
1149
1150         if (src_dchild->d_inode)
1151                 op_data->op_fid3 = *ll_inode2fid(src_dchild->d_inode);
1152         if (tgt_dchild->d_inode)
1153                 op_data->op_fid4 = *ll_inode2fid(tgt_dchild->d_inode);
1154
1155         err = md_rename(sbi->ll_md_exp, op_data,
1156                         src_dchild->d_name.name,
1157                         src_dchild->d_name.len,
1158                         tgt_dchild->d_name.name,
1159                         tgt_dchild->d_name.len, &request);
1160         ll_finish_md_op_data(op_data);
1161         if (!err) {
1162                 ll_update_times(request, src);
1163                 ll_update_times(request, tgt);
1164                 ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1);
1165         }
1166
1167         ptlrpc_req_finished(request);
1168         if (!err)
1169                 d_move(src_dchild, tgt_dchild);
1170         return err;
1171 }
1172
1173 const struct inode_operations ll_dir_inode_operations = {
1174         .mknod        = ll_mknod,
1175         .atomic_open        = ll_atomic_open,
1176         .lookup      = ll_lookup_nd,
1177         .create      = ll_create_nd,
1178         /* We need all these non-raw things for NFSD, to not patch it. */
1179         .unlink      = ll_unlink,
1180         .mkdir        = ll_mkdir,
1181         .rmdir        = ll_rmdir,
1182         .symlink            = ll_symlink,
1183         .link          = ll_link,
1184         .rename         = ll_rename,
1185         .setattr            = ll_setattr,
1186         .getattr            = ll_getattr,
1187         .permission      = ll_inode_permission,
1188         .listxattr        = ll_listxattr,
1189         .get_acl            = ll_get_acl,
1190 };
1191
1192 const struct inode_operations ll_special_inode_operations = {
1193         .setattr        = ll_setattr,
1194         .getattr        = ll_getattr,
1195         .permission     = ll_inode_permission,
1196         .listxattr      = ll_listxattr,
1197         .get_acl            = ll_get_acl,
1198 };