fs/xfs/xfs_vnodeops.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_types.h"
22 #include "xfs_bit.h"
23 #include "xfs_log.h"
24 #include "xfs_inum.h"
25 #include "xfs_trans.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_mount.h"
30 #include "xfs_da_btree.h"
31 #include "xfs_bmap_btree.h"
32 #include "xfs_ialloc_btree.h"
33 #include "xfs_dinode.h"
34 #include "xfs_inode.h"
35 #include "xfs_inode_item.h"
36 #include "xfs_itable.h"
37 #include "xfs_ialloc.h"
38 #include "xfs_alloc.h"
39 #include "xfs_bmap.h"
40 #include "xfs_acl.h"
41 #include "xfs_attr.h"
42 #include "xfs_rw.h"
43 #include "xfs_error.h"
44 #include "xfs_quota.h"
45 #include "xfs_utils.h"
46 #include "xfs_rtalloc.h"
47 #include "xfs_trans_space.h"
48 #include "xfs_log_priv.h"
49 #include "xfs_filestream.h"
50 #include "xfs_vnodeops.h"
51 #include "xfs_trace.h"
52
53 /*
54  * The maximum pathlen is 1024 bytes. Since the minimum file system
55  * blocksize is 512 bytes, we can get a max of 2 extents back from
56  * bmapi.
57  */
58 #define SYMLINK_MAPS 2
59
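/*
 * Read the symlink target stored in remote extents into the caller's
 * buffer, one block mapping at a time, and NUL-terminate the result.
 */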
60 STATIC int
61 xfs_readlink_bmap(
62         xfs_inode_t     *ip,
63         char            *link)
64 {
65         xfs_mount_t     *mp = ip->i_mount;
66         int             pathlen = ip->i_d.di_size;
67         int             nmaps = SYMLINK_MAPS;
68         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
69         xfs_daddr_t     d;
70         int             byte_cnt;
71         int             n;
72         xfs_buf_t       *bp;
73         int             error = 0;
74
75         error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
76                                0);
77         if (error)
78                 goto out;
79
80         for (n = 0; n < nmaps; n++) {
81                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
82                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
83
84                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
85                                   XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
86                 if (!bp)
87                         return XFS_ERROR(ENOMEM);
88                 error = bp->b_error;
89                 if (error) {
90                         xfs_buf_ioerror_alert(bp, __func__);
91                         xfs_buf_relse(bp);
92                         goto out;
93                 }
94                 if (pathlen < byte_cnt)
95                         byte_cnt = pathlen;
96                 pathlen -= byte_cnt;
97
98                 memcpy(link, bp->b_addr, byte_cnt);
99                 xfs_buf_relse(bp);
100         }
101
102         link[ip->i_d.di_size] = '\0';
103         error = 0;
104
105  out:
106         return error;
107 }
108
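/*
 * Return the target of a symbolic link in the caller-supplied buffer,
 * handling both inline (local) and out-of-line (remote) symlinks.
 */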
109 int
110 xfs_readlink(
111         xfs_inode_t     *ip,
112         char            *link)
113 {
114         xfs_mount_t     *mp = ip->i_mount;
115         xfs_fsize_t     pathlen;
116         int             error = 0;
117
118         trace_xfs_readlink(ip);
119
120         if (XFS_FORCED_SHUTDOWN(mp))
121                 return XFS_ERROR(EIO);
122
123         xfs_ilock(ip, XFS_ILOCK_SHARED);
124
125         pathlen = ip->i_d.di_size;
126         if (!pathlen)
127                 goto out;
128
129         if (pathlen < 0 || pathlen > MAXPATHLEN) {
130                 xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
131                          __func__, (unsigned long long) ip->i_ino,
132                          (long long) pathlen);
133                 ASSERT(0);
134                 error = XFS_ERROR(EFSCORRUPTED);
135                 goto out;
136         }
137
138
139         if (ip->i_df.if_flags & XFS_IFINLINE) {
140                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
141                 link[pathlen] = '\0';
142         } else {
143                 error = xfs_readlink_bmap(ip, link);
144         }
145
146  out:
147         xfs_iunlock(ip, XFS_ILOCK_SHARED);
148         return error;
149 }
150
151 /*
152  * Flags for xfs_free_eofblocks
153  */
154 #define XFS_FREE_EOF_TRYLOCK    (1<<0)
155
156 /*
157  * This is called by xfs_release and xfs_inactive to free any blocks
158  * beyond eof when the link count isn't zero, by truncating the file
159  * back to its current size.
160  */
161 STATIC int
162 xfs_free_eofblocks(
163         xfs_mount_t     *mp,
164         xfs_inode_t     *ip,
165         int             flags)
166 {
167         xfs_trans_t     *tp;
168         int             error;
169         xfs_fileoff_t   end_fsb;
170         xfs_fileoff_t   last_fsb;
171         xfs_filblks_t   map_len;
172         int             nimaps;
173         xfs_bmbt_irec_t imap;
174
175         /*
176          * Figure out if there are any blocks beyond the end
177          * of the file.  If not, then there is nothing to do.
178          */
179         end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
180         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
181         if (last_fsb <= end_fsb)
182                 return 0;
183         map_len = last_fsb - end_fsb;
184
185         nimaps = 1;
186         xfs_ilock(ip, XFS_ILOCK_SHARED);
187         error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
188         xfs_iunlock(ip, XFS_ILOCK_SHARED);
189
190         if (!error && (nimaps != 0) &&
191             (imap.br_startblock != HOLESTARTBLOCK ||
192              ip->i_delayed_blks)) {
193                 /*
194                  * Attach the dquots to the inode up front.
195                  */
196                 error = xfs_qm_dqattach(ip, 0);
197                 if (error)
198                         return error;
199
200                 /*
201                  * There are blocks after the end of file.
202                  * Free them up now by truncating the file to
203                  * its current size.
204                  */
205                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
206
207                 if (flags & XFS_FREE_EOF_TRYLOCK) {
208                         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
209                                 xfs_trans_cancel(tp, 0);
210                                 return 0;
211                         }
212                 } else {
213                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
214                 }
215
216                 error = xfs_trans_reserve(tp, 0,
217                                           XFS_ITRUNCATE_LOG_RES(mp),
218                                           0, XFS_TRANS_PERM_LOG_RES,
219                                           XFS_ITRUNCATE_LOG_COUNT);
220                 if (error) {
221                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
222                         xfs_trans_cancel(tp, 0);
223                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
224                         return error;
225                 }
226
227                 xfs_ilock(ip, XFS_ILOCK_EXCL);
228                 xfs_trans_ijoin(tp, ip, 0);
229
230                 /*
231                  * Do not update the on-disk file size.  If we update the
232                  * on-disk file size and then the system crashes before the
233                  * contents of the file are flushed to disk then the files
234                  * may be full of holes (ie NULL files bug).
235                  */
236                 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
237                                               XFS_ISIZE(ip));
238                 if (error) {
239                         /*
240                          * If we get an error at this point we simply don't
241                          * bother truncating the file.
242                          */
243                         xfs_trans_cancel(tp,
244                                          (XFS_TRANS_RELEASE_LOG_RES |
245                                           XFS_TRANS_ABORT));
246                 } else {
247                         error = xfs_trans_commit(tp,
248                                                 XFS_TRANS_RELEASE_LOG_RES);
249                 }
250                 xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
251         }
252         return error;
253 }
254
255 /*
256  * Free a symlink that has blocks associated with it.
257  */
258 STATIC int
259 xfs_inactive_symlink_rmt(
260         xfs_inode_t     *ip,
261         xfs_trans_t     **tpp)
262 {
263         xfs_buf_t       *bp;
264         int             committed;
265         int             done;
266         int             error;
267         xfs_fsblock_t   first_block;
268         xfs_bmap_free_t free_list;
269         int             i;
270         xfs_mount_t     *mp;
271         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
272         int             nmaps;
273         xfs_trans_t     *ntp;
274         int             size;
275         xfs_trans_t     *tp;
276
277         tp = *tpp;
278         mp = ip->i_mount;
279         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
280         /*
281          * We're freeing a symlink that has some
282          * blocks allocated to it.  Free the
283          * blocks here.  We know that we've got
284          * either 1 or 2 extents and that we can
285          * free them all in one bunmapi call.
286          */
287         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
288         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
289                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
290                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
291                 xfs_trans_cancel(tp, 0);
292                 *tpp = NULL;
293                 return error;
294         }
295         /*
296          * Lock the inode, fix the size, and join it to the transaction.
297          * Hold it so in the normal path, we still have it locked for
298          * the second transaction.  In the error paths we need it
299          * held so the cancel won't rele it, see below.
300          */
301         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
302         size = (int)ip->i_d.di_size;
303         ip->i_d.di_size = 0;
304         xfs_trans_ijoin(tp, ip, 0);
305         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
306         /*
307          * Find the block(s) so we can inval and unmap them.
308          */
309         done = 0;
310         xfs_bmap_init(&free_list, &first_block);
311         nmaps = ARRAY_SIZE(mval);
312         error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
313                                 mval, &nmaps, 0);
314         if (error)
315                 goto error0;
316         /*
317          * Invalidate the block(s).
318          */
319         for (i = 0; i < nmaps; i++) {
320                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
321                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
322                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
323                 if (!bp) {
324                         error = ENOMEM;
325                         goto error1;
326                 }
327                 xfs_trans_binval(tp, bp);
328         }
329         /*
330          * Unmap the dead block(s) to the free_list.
331          */
332         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
333                         &first_block, &free_list, &done)))
334                 goto error1;
335         ASSERT(done);
336         /*
337          * Commit the first transaction.  This logs the EFI and the inode.
338          */
339         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
340                 goto error1;
341         /*
342          * The transaction must have been committed, since there were
343          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
344          * The new tp has the extent freeing and EFDs.
345          */
346         ASSERT(committed);
347         /*
348          * The first xact was committed, so add the inode to the new one.
349          * Mark it dirty so it will be logged and moved forward in the log as
350          * part of every commit.
351          */
352         xfs_trans_ijoin(tp, ip, 0);
353         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
354         /*
355          * Get a new, empty transaction to return to our caller.
356          */
357         ntp = xfs_trans_dup(tp);
358         /*
359          * Commit the transaction containing extent freeing and EFDs.
360          * If we get an error on the commit here or on the reserve below,
361          * we need to unlock the inode since the new transaction doesn't
362          * have the inode attached.
363          */
364         error = xfs_trans_commit(tp, 0);
365         tp = ntp;
366         if (error) {
367                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
368                 goto error0;
369         }
370         /*
371          * transaction commit worked ok so we can drop the extra ticket
372          * reference that we gained in xfs_trans_dup()
373          */
374         xfs_log_ticket_put(tp->t_ticket);
375
376         /*
377          * Remove the memory for extent descriptions (just bookkeeping).
378          */
379         if (ip->i_df.if_bytes)
380                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
381         ASSERT(ip->i_df.if_bytes == 0);
382         /*
383          * Put an itruncate log reservation in the new transaction
384          * for our caller.
385          */
386         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
387                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
388                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
389                 goto error0;
390         }
391         /*
392          * Return with the inode locked but not joined to the transaction.
393          */
394         *tpp = tp;
395         return 0;
396
397  error1:
398         xfs_bmap_cancel(&free_list);
399  error0:
400         /*
401          * Have to come here with the inode locked and either
402          * (held and in the transaction) or (not in the transaction).
403          * If the inode isn't held then cancel would iput it, but
404          * that's wrong since this is inactive and the vnode ref
405          * count is 0 already.
406          * Cancel won't do anything to the inode if held, but it still
407          * needs to be locked until the cancel is done, if it was
408          * joined to the transaction.
409          */
410         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
411         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
412         *tpp = NULL;
413         return error;
414
415 }
416
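/*
 * Free a symlink whose target is stored inline in the inode data fork.
 */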
417 STATIC int
418 xfs_inactive_symlink_local(
419         xfs_inode_t     *ip,
420         xfs_trans_t     **tpp)
421 {
422         int             error;
423
424         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
425         /*
426          * We're freeing a symlink which fit into
427          * the inode.  Just free the memory used
428          * to hold the old symlink.
429          */
430         error = xfs_trans_reserve(*tpp, 0,
431                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
432                                   0, XFS_TRANS_PERM_LOG_RES,
433                                   XFS_ITRUNCATE_LOG_COUNT);
434
435         if (error) {
436                 xfs_trans_cancel(*tpp, 0);
437                 *tpp = NULL;
438                 return error;
439         }
440         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
441
442         /*
443          * Zero length symlinks _can_ exist.
444          */
445         if (ip->i_df.if_bytes > 0) {
446                 xfs_idata_realloc(ip,
447                                   -(ip->i_df.if_bytes),
448                                   XFS_DATA_FORK);
449                 ASSERT(ip->i_df.if_bytes == 0);
450         }
451         return 0;
452 }
453
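/*
 * Tear down the attribute fork of an inode that is being inactivated:
 * commit the caller's transaction, remove the attribute fork, and hand
 * back a new transaction reserved for freeing the inode.
 */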
454 STATIC int
455 xfs_inactive_attrs(
456         xfs_inode_t     *ip,
457         xfs_trans_t     **tpp)
458 {
459         xfs_trans_t     *tp;
460         int             error;
461         xfs_mount_t     *mp;
462
463         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
464         tp = *tpp;
465         mp = ip->i_mount;
466         ASSERT(ip->i_d.di_forkoff != 0);
467         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
468         xfs_iunlock(ip, XFS_ILOCK_EXCL);
469         if (error)
470                 goto error_unlock;
471
472         error = xfs_attr_inactive(ip);
473         if (error)
474                 goto error_unlock;
475
476         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
477         error = xfs_trans_reserve(tp, 0,
478                                   XFS_IFREE_LOG_RES(mp),
479                                   0, XFS_TRANS_PERM_LOG_RES,
480                                   XFS_INACTIVE_LOG_COUNT);
481         if (error)
482                 goto error_cancel;
483
484         xfs_ilock(ip, XFS_ILOCK_EXCL);
485         xfs_trans_ijoin(tp, ip, 0);
486         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
487
488         ASSERT(ip->i_d.di_anextents == 0);
489
490         *tpp = tp;
491         return 0;
492
493 error_cancel:
494         ASSERT(XFS_FORCED_SHUTDOWN(mp));
495         xfs_trans_cancel(tp, 0);
496 error_unlock:
497         *tpp = NULL;
498         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
499         return error;
500 }
501
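/*
 * Start writeback of a recently truncated file on its last close to
 * narrow the window for the NULL files problem, and free speculative
 * preallocation beyond EOF when it is safe to do so.
 */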
502 int
503 xfs_release(
504         xfs_inode_t     *ip)
505 {
506         xfs_mount_t     *mp = ip->i_mount;
507         int             error;
508
509         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
510                 return 0;
511
512         /* If this is a read-only mount, don't do this (would generate I/O) */
513         if (mp->m_flags & XFS_MOUNT_RDONLY)
514                 return 0;
515
516         if (!XFS_FORCED_SHUTDOWN(mp)) {
517                 int truncated;
518
519                 /*
520                  * If we are using filestreams, and we have an unlinked
521                  * file that we are processing the last close on, then nothing
522                  * will be able to reopen and write to this file. Purge this
523                  * inode from the filestreams cache so that it doesn't delay
524                  * teardown of the inode.
525                  */
526                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
527                         xfs_filestream_deassociate(ip);
528
529                 /*
530                  * If we previously truncated this file and removed old data
531                  * in the process, we want to initiate "early" writeout on
532                  * the last close.  This is an attempt to combat the notorious
533                  * NULL files problem which is particularly noticeable from a
534                  * truncate down, buffered (re-)write (delalloc), followed by
535                  * a crash.  What we are effectively doing here is
536                  * significantly reducing the time window where we'd otherwise
537                  * be exposed to that problem.
538                  */
539                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
540                 if (truncated) {
541                         xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
542                         if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
543                                 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
544                 }
545         }
546
547         if (ip->i_d.di_nlink == 0)
548                 return 0;
549
550         if ((S_ISREG(ip->i_d.di_mode) &&
551              (VFS_I(ip)->i_size > 0 ||
552               (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
553              (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
554             (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
555
556                 /*
557                  * If we can't get the iolock just skip truncating the blocks
558                  * past EOF because we could deadlock with the mmap_sem
559                  * otherwise.  We'll get another chance to drop them once the
560                  * last reference to the inode is dropped, so we'll never leak
561                  * blocks permanently.
562                  *
563          * Further, if the inode is being opened, written and closed
564          * frequently and we have delayed allocation blocks
565          * outstanding (e.g. streaming writes from the NFS server),
566          * then truncating the blocks past EOF will cause
567          * fragmentation to occur.
568                  *
569                  * In this case don't do the truncation, either, but we have to
570                  * be careful how we detect this case. Blocks beyond EOF show
571                  * up as i_delayed_blks even when the inode is clean, so we
572                  * need to truncate them away first before checking for a dirty
573                  * release. Hence on the first dirty close we will still remove
574                  * the speculative allocation, but after that we will leave it
575                  * in place.
576                  */
577                 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
578                         return 0;
579
580                 error = xfs_free_eofblocks(mp, ip,
581                                            XFS_FREE_EOF_TRYLOCK);
582                 if (error)
583                         return error;
584
585                 /* delalloc blocks after truncation means it really is dirty */
586                 if (ip->i_delayed_blks)
587                         xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
588         }
589         return 0;
590 }
591
592 /*
593  * xfs_inactive
594  *
595  * This is called when the vnode reference count for the vnode
596  * goes to zero.  If the file has been unlinked, then it must
597  * now be truncated.  Also, we clear all of the read-ahead state
598  * kept for the inode here since the file is now closed.
599  */
600 int
601 xfs_inactive(
602         xfs_inode_t     *ip)
603 {
604         xfs_bmap_free_t free_list;
605         xfs_fsblock_t   first_block;
606         int             committed;
607         xfs_trans_t     *tp;
608         xfs_mount_t     *mp;
609         int             error;
610         int             truncate;
611
612         /*
613          * If the inode is already free, then there can be nothing
614          * to clean up here.
615          */
616         if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
617                 ASSERT(ip->i_df.if_real_bytes == 0);
618                 ASSERT(ip->i_df.if_broot_bytes == 0);
619                 return VN_INACTIVE_CACHE;
620         }
621
622         /*
623          * Only do a truncate if it's a regular file with
624          * some actual space in it.  It's OK to look at the
625          * inode's fields without the lock because we're the
626          * only one with a reference to the inode.
627          */
628         truncate = ((ip->i_d.di_nlink == 0) &&
629             ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
630              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
631             S_ISREG(ip->i_d.di_mode));
632
633         mp = ip->i_mount;
634
635         error = 0;
636
637         /* If this is a read-only mount, don't do this (would generate I/O) */
638         if (mp->m_flags & XFS_MOUNT_RDONLY)
639                 goto out;
640
641         if (ip->i_d.di_nlink != 0) {
642                 if ((S_ISREG(ip->i_d.di_mode) &&
643                     (VFS_I(ip)->i_size > 0 ||
644                      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
645                     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
646                     (!(ip->i_d.di_flags &
647                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
648                      ip->i_delayed_blks != 0))) {
649                         error = xfs_free_eofblocks(mp, ip, 0);
650                         if (error)
651                                 return VN_INACTIVE_CACHE;
652                 }
653                 goto out;
654         }
655
656         ASSERT(ip->i_d.di_nlink == 0);
657
658         error = xfs_qm_dqattach(ip, 0);
659         if (error)
660                 return VN_INACTIVE_CACHE;
661
662         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
663         if (truncate) {
664                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
665
666                 error = xfs_trans_reserve(tp, 0,
667                                           XFS_ITRUNCATE_LOG_RES(mp),
668                                           0, XFS_TRANS_PERM_LOG_RES,
669                                           XFS_ITRUNCATE_LOG_COUNT);
670                 if (error) {
671                         /* Don't call itruncate_cleanup */
672                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
673                         xfs_trans_cancel(tp, 0);
674                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
675                         return VN_INACTIVE_CACHE;
676                 }
677
678                 xfs_ilock(ip, XFS_ILOCK_EXCL);
679                 xfs_trans_ijoin(tp, ip, 0);
680
681                 ip->i_d.di_size = 0;
682                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
683
684                 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
685                 if (error) {
686                         xfs_trans_cancel(tp,
687                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
688                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
689                         return VN_INACTIVE_CACHE;
690                 }
691
692                 ASSERT(ip->i_d.di_nextents == 0);
693         } else if (S_ISLNK(ip->i_d.di_mode)) {
694
695                 /*
696                  * If we get an error while cleaning up a
697                  * symlink we bail out.
698                  */
699                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
700                         xfs_inactive_symlink_rmt(ip, &tp) :
701                         xfs_inactive_symlink_local(ip, &tp);
702
703                 if (error) {
704                         ASSERT(tp == NULL);
705                         return VN_INACTIVE_CACHE;
706                 }
707
708                 xfs_trans_ijoin(tp, ip, 0);
709         } else {
710                 error = xfs_trans_reserve(tp, 0,
711                                           XFS_IFREE_LOG_RES(mp),
712                                           0, XFS_TRANS_PERM_LOG_RES,
713                                           XFS_INACTIVE_LOG_COUNT);
714                 if (error) {
715                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
716                         xfs_trans_cancel(tp, 0);
717                         return VN_INACTIVE_CACHE;
718                 }
719
720                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
721                 xfs_trans_ijoin(tp, ip, 0);
722         }
723
724         /*
725          * If there are attributes associated with the file
726          * then blow them away now.  The code calls a routine
727          * that recursively deconstructs the attribute fork.
728          * We need to just commit the current transaction
729          * because we can't use it for xfs_attr_inactive().
730          */
731         if (ip->i_d.di_anextents > 0) {
732                 error = xfs_inactive_attrs(ip, &tp);
733                 /*
734                  * If we got an error, the transaction is already
735                  * cancelled, and the inode is unlocked. Just get out.
736                  */
737                  if (error)
738                          return VN_INACTIVE_CACHE;
739         } else if (ip->i_afp) {
740                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
741         }
742
743         /*
744          * Free the inode.
745          */
746         xfs_bmap_init(&free_list, &first_block);
747         error = xfs_ifree(tp, ip, &free_list);
748         if (error) {
749                 /*
750                  * If we fail to free the inode, shut down.  The cancel
751                  * might do that, we need to make sure.  Otherwise the
752                  * inode might be lost for a long time or forever.
753                  */
754                 if (!XFS_FORCED_SHUTDOWN(mp)) {
755                         xfs_notice(mp, "%s: xfs_ifree returned error %d",
756                                 __func__, error);
757                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
758                 }
759                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
760         } else {
761                 /*
762                  * Credit the quota account(s). The inode is gone.
763                  */
764                 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
765
766                 /*
767                  * Just ignore errors at this point.  There is nothing we can
768                  * do except to try to keep going. Make sure it's not a silent
769                  * error.
770                  */
771                 error = xfs_bmap_finish(&tp,  &free_list, &committed);
772                 if (error)
773                         xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
774                                 __func__, error);
775                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
776                 if (error)
777                         xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
778                                 __func__, error);
779         }
780
781         /*
782          * Release the dquots held by inode, if any.
783          */
784         xfs_qm_dqdetach(ip);
785         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
786
787  out:
788         return VN_INACTIVE_CACHE;
789 }
790
791 /*
792  * Looks up an inode from "name". If ci_name is not NULL, then a CI match
793  * is allowed, otherwise it has to be an exact match. If a CI match is found,
794  * ci_name->name will point to the actual name (caller must free) or
795  * will be set to NULL if an exact match is found.
796  */
797 int
798 xfs_lookup(
799         xfs_inode_t             *dp,
800         struct xfs_name         *name,
801         xfs_inode_t             **ipp,
802         struct xfs_name         *ci_name)
803 {
804         xfs_ino_t               inum;
805         int                     error;
806         uint                    lock_mode;
807
808         trace_xfs_lookup(dp, name);
809
810         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
811                 return XFS_ERROR(EIO);
812
813         lock_mode = xfs_ilock_map_shared(dp);
814         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
815         xfs_iunlock_map_shared(dp, lock_mode);
816
817         if (error)
818                 goto out;
819
820         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
821         if (error)
822                 goto out_free_name;
823
824         return 0;
825
826 out_free_name:
827         if (ci_name)
828                 kmem_free(ci_name->name);
829 out:
830         *ipp = NULL;
831         return error;
832 }
833
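/*
 * Create a new regular file, directory, or special file named 'name'
 * in the directory 'dp' and return the new inode in 'ipp'.
 */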
834 int
835 xfs_create(
836         xfs_inode_t             *dp,
837         struct xfs_name         *name,
838         umode_t                 mode,
839         xfs_dev_t               rdev,
840         xfs_inode_t             **ipp)
841 {
842         int                     is_dir = S_ISDIR(mode);
843         struct xfs_mount        *mp = dp->i_mount;
844         struct xfs_inode        *ip = NULL;
845         struct xfs_trans        *tp = NULL;
846         int                     error;
847         xfs_bmap_free_t         free_list;
848         xfs_fsblock_t           first_block;
849         boolean_t               unlock_dp_on_error = B_FALSE;
850         uint                    cancel_flags;
851         int                     committed;
852         prid_t                  prid;
853         struct xfs_dquot        *udqp = NULL;
854         struct xfs_dquot        *gdqp = NULL;
855         uint                    resblks;
856         uint                    log_res;
857         uint                    log_count;
858
859         trace_xfs_create(dp, name);
860
861         if (XFS_FORCED_SHUTDOWN(mp))
862                 return XFS_ERROR(EIO);
863
864         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
865                 prid = xfs_get_projid(dp);
866         else
867                 prid = XFS_PROJID_DEFAULT;
868
869         /*
870          * Make sure that we have allocated dquot(s) on disk.
871          */
872         error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
873                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
874         if (error)
875                 return error;
876
877         if (is_dir) {
878                 rdev = 0;
879                 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
880                 log_res = XFS_MKDIR_LOG_RES(mp);
881                 log_count = XFS_MKDIR_LOG_COUNT;
882                 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
883         } else {
884                 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
885                 log_res = XFS_CREATE_LOG_RES(mp);
886                 log_count = XFS_CREATE_LOG_COUNT;
887                 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
888         }
889
890         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
891
892         /*
893          * Initially assume that the file does not exist and
894          * reserve the resources for that case.  If that is not
895          * the case we'll drop the one we have and get a more
896          * appropriate transaction later.
897          */
898         error = xfs_trans_reserve(tp, resblks, log_res, 0,
899                         XFS_TRANS_PERM_LOG_RES, log_count);
900         if (error == ENOSPC) {
901                 /* flush outstanding delalloc blocks and retry */
902                 xfs_flush_inodes(dp);
903                 error = xfs_trans_reserve(tp, resblks, log_res, 0,
904                                 XFS_TRANS_PERM_LOG_RES, log_count);
905         }
906         if (error == ENOSPC) {
907                 /* No space at all so try a "no-allocation" reservation */
908                 resblks = 0;
909                 error = xfs_trans_reserve(tp, 0, log_res, 0,
910                                 XFS_TRANS_PERM_LOG_RES, log_count);
911         }
912         if (error) {
913                 cancel_flags = 0;
914                 goto out_trans_cancel;
915         }
916
917         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
918         unlock_dp_on_error = B_TRUE;
919
920         xfs_bmap_init(&free_list, &first_block);
921
922         /*
923          * Reserve disk quota and the inode.
924          */
925         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
926         if (error)
927                 goto out_trans_cancel;
928
929         error = xfs_dir_canenter(tp, dp, name, resblks);
930         if (error)
931                 goto out_trans_cancel;
932
933         /*
934          * A newly created regular or special file just has one directory
935          * entry pointing to it, but a directory also has the "." entry
936          * pointing to itself.
937          */
938         error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
939                                prid, resblks > 0, &ip, &committed);
940         if (error) {
941                 if (error == ENOSPC)
942                         goto out_trans_cancel;
943                 goto out_trans_abort;
944         }
945
946         /*
947          * Now we join the directory inode to the transaction.  We do not do it
948          * earlier because xfs_dir_ialloc might commit the previous transaction
949          * (and release all the locks).  An error from here on will result in
950          * the transaction cancel unlocking dp so don't do it explicitly in the
951          * error path.
952          */
953         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
954         unlock_dp_on_error = B_FALSE;
955
956         error = xfs_dir_createname(tp, dp, name, ip->i_ino,
957                                         &first_block, &free_list, resblks ?
958                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
959         if (error) {
960                 ASSERT(error != ENOSPC);
961                 goto out_trans_abort;
962         }
963         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
964         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
965
966         if (is_dir) {
967                 error = xfs_dir_init(tp, ip, dp);
968                 if (error)
969                         goto out_bmap_cancel;
970
971                 error = xfs_bumplink(tp, dp);
972                 if (error)
973                         goto out_bmap_cancel;
974         }
975
976         /*
977          * If this is a synchronous mount, make sure that the
978          * create transaction goes to disk before returning to
979          * the user.
980          */
981         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
982                 xfs_trans_set_sync(tp);
983
984         /*
985          * Attach the dquot(s) to the inodes and modify them incore.
986          * The ids of the inode couldn't have changed since the new
987          * inode has been locked ever since it was created.
988          */
989         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
990
991         error = xfs_bmap_finish(&tp, &free_list, &committed);
992         if (error)
993                 goto out_bmap_cancel;
994
995         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
996         if (error)
997                 goto out_release_inode;
998
999         xfs_qm_dqrele(udqp);
1000         xfs_qm_dqrele(gdqp);
1001
1002         *ipp = ip;
1003         return 0;
1004
1005  out_bmap_cancel:
1006         xfs_bmap_cancel(&free_list);
1007  out_trans_abort:
1008         cancel_flags |= XFS_TRANS_ABORT;
1009  out_trans_cancel:
1010         xfs_trans_cancel(tp, cancel_flags);
1011  out_release_inode:
1012         /*
1013          * Wait until after the current transaction is aborted to
1014          * release the inode.  This prevents recursive transactions
1015          * and deadlocks from xfs_inactive.
1016          */
1017         if (ip)
1018                 IRELE(ip);
1019
1020         xfs_qm_dqrele(udqp);
1021         xfs_qm_dqrele(gdqp);
1022
1023         if (unlock_dp_on_error)
1024                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1025         return error;
1026 }
1027
1028 #ifdef DEBUG
1029 int xfs_locked_n;
1030 int xfs_small_retries;
1031 int xfs_middle_retries;
1032 int xfs_lots_retries;
1033 int xfs_lock_delays;
1034 #endif
1035
1036 /*
1037  * Bump the subclass so xfs_lock_inodes() acquires each lock with
1038  * a different value
1039  */
1040 static inline int
1041 xfs_lock_inumorder(int lock_mode, int subclass)
1042 {
1043         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1044                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
1045         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
1046                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
1047
1048         return lock_mode;
1049 }
1050
1051 /*
1052  * The following routine will lock n inodes in exclusive mode.
1053  * We assume the caller calls us with the inodes in i_ino order.
1054  *
1055  * We need to detect deadlock where an inode that we lock
1056  * is in the AIL and we start waiting for another inode that is locked
1057  * by a thread in a long running transaction (such as truncate). This can
1058  * result in deadlock since the long running trans might need to wait
1059  * for the inode we just locked in order to push the tail and free space
1060  * in the log.
1061  */
1062 void
1063 xfs_lock_inodes(
1064         xfs_inode_t     **ips,
1065         int             inodes,
1066         uint            lock_mode)
1067 {
1068         int             attempts = 0, i, j, try_lock;
1069         xfs_log_item_t  *lp;
1070
1071         ASSERT(ips && (inodes >= 2)); /* we need at least two */
1072
1073         try_lock = 0;
1074         i = 0;
1075
1076 again:
1077         for (; i < inodes; i++) {
1078                 ASSERT(ips[i]);
1079
1080                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
1081                         continue;
1082
1083                 /*
1084                  * If try_lock is not set yet, make sure all locked inodes
1085                  * are not in the AIL.
1086                  * If any are, set try_lock to be used later.
1087                  */
1088
1089                 if (!try_lock) {
1090                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
1091                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
1092                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1093                                         try_lock++;
1094                                 }
1095                         }
1096                 }
1097
1098                 /*
1099                  * If any of the previous locks we have locked is in the AIL,
1100                  * we must TRY to get the second and subsequent locks. If
1101                  * we can't get any, we must release all we have
1102                  * and try again.
1103                  */
1104
1105                 if (try_lock) {
1106                         /* try_lock must be 0 if i is 0. */
1107                         /*
1108                          * try_lock means we have an inode locked
1109                          * that is in the AIL.
1110                          */
1111                         ASSERT(i != 0);
1112                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
1113                                 attempts++;
1114
1115                                 /*
1116                                  * Unlock all previous guys and try again.
1117                                  * xfs_iunlock will try to push the tail
1118                                  * if the inode is in the AIL.
1119                                  */
1120
1121                                 for(j = i - 1; j >= 0; j--) {
1122
1123                                         /*
1124                                          * Check to see if we've already
1125                                          * unlocked this one.
1126                                          * Not the first one going back,
1127                                          * and the inode ptr is the same.
1128                                          */
1129                                         if ((j != (i - 1)) && ips[j] ==
1130                                                                 ips[j+1])
1131                                                 continue;
1132
1133                                         xfs_iunlock(ips[j], lock_mode);
1134                                 }
1135
1136                                 if ((attempts % 5) == 0) {
1137                                         delay(1); /* Don't just spin the CPU */
1138 #ifdef DEBUG
1139                                         xfs_lock_delays++;
1140 #endif
1141                                 }
1142                                 i = 0;
1143                                 try_lock = 0;
1144                                 goto again;
1145                         }
1146                 } else {
1147                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
1148                 }
1149         }
1150
1151 #ifdef DEBUG
1152         if (attempts) {
1153                 if (attempts < 5) xfs_small_retries++;
1154                 else if (attempts < 100) xfs_middle_retries++;
1155                 else xfs_lots_retries++;
1156         } else {
1157                 xfs_locked_n++;
1158         }
1159 #endif
1160 }
1161
1162 /*
1163  * xfs_lock_two_inodes() can only be used to lock one type of lock
1164  * at a time - the iolock or the ilock, but not both at once. If
1165  * we lock both at once, lockdep will report false positives saying
1166  * we have violated locking orders.
1167  */
1168 void
1169 xfs_lock_two_inodes(
1170         xfs_inode_t             *ip0,
1171         xfs_inode_t             *ip1,
1172         uint                    lock_mode)
1173 {
1174         xfs_inode_t             *temp;
1175         int                     attempts = 0;
1176         xfs_log_item_t          *lp;
1177
1178         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1179                 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
1180         ASSERT(ip0->i_ino != ip1->i_ino);
1181
1182         if (ip0->i_ino > ip1->i_ino) {
1183                 temp = ip0;
1184                 ip0 = ip1;
1185                 ip1 = temp;
1186         }
1187
1188  again:
1189         xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
1190
1191         /*
1192          * If the first lock we have locked is in the AIL, we must TRY to get
1193          * the second lock. If we can't get it, we must release the first one
1194          * and try again.
1195          */
1196         lp = (xfs_log_item_t *)ip0->i_itemp;
1197         if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1198                 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
1199                         xfs_iunlock(ip0, lock_mode);
1200                         if ((++attempts % 5) == 0)
1201                                 delay(1); /* Don't just spin the CPU */
1202                         goto again;
1203                 }
1204         } else {
1205                 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
1206         }
1207 }
1208
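/*
 * Remove the entry 'name' for inode 'ip' from the directory 'dp' and
 * drop the corresponding link counts.  Directories must be empty.
 */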
1209 int
1210 xfs_remove(
1211         xfs_inode_t             *dp,
1212         struct xfs_name         *name,
1213         xfs_inode_t             *ip)
1214 {
1215         xfs_mount_t             *mp = dp->i_mount;
1216         xfs_trans_t             *tp = NULL;
1217         int                     is_dir = S_ISDIR(ip->i_d.di_mode);
1218         int                     error = 0;
1219         xfs_bmap_free_t         free_list;
1220         xfs_fsblock_t           first_block;
1221         int                     cancel_flags;
1222         int                     committed;
1223         int                     link_zero;
1224         uint                    resblks;
1225         uint                    log_count;
1226
1227         trace_xfs_remove(dp, name);
1228
1229         if (XFS_FORCED_SHUTDOWN(mp))
1230                 return XFS_ERROR(EIO);
1231
1232         error = xfs_qm_dqattach(dp, 0);
1233         if (error)
1234                 goto std_return;
1235
1236         error = xfs_qm_dqattach(ip, 0);
1237         if (error)
1238                 goto std_return;
1239
1240         if (is_dir) {
1241                 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
1242                 log_count = XFS_DEFAULT_LOG_COUNT;
1243         } else {
1244                 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
1245                 log_count = XFS_REMOVE_LOG_COUNT;
1246         }
1247         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1248
1249         /*
1250          * We try to get the real space reservation first,
1251          * allowing for directory btree deletion(s) implying
1252          * possible bmap insert(s).  If we can't get the space
1253          * reservation then we use 0 instead, and avoid the bmap
1254          * btree insert(s) in the directory code by, if the bmap
1255          * insert tries to happen, instead trimming the LAST
1256          * block from the directory.
1257          */
1258         resblks = XFS_REMOVE_SPACE_RES(mp);
1259         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
1260                                   XFS_TRANS_PERM_LOG_RES, log_count);
1261         if (error == ENOSPC) {
1262                 resblks = 0;
1263                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
1264                                           XFS_TRANS_PERM_LOG_RES, log_count);
1265         }
1266         if (error) {
1267                 ASSERT(error != ENOSPC);
1268                 cancel_flags = 0;
1269                 goto out_trans_cancel;
1270         }
1271
1272         xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
1273
1274         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1275         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1276
1277         /*
1278          * If we're removing a directory perform some additional validation.
1279          */
1280         if (is_dir) {
1281                 ASSERT(ip->i_d.di_nlink >= 2);
1282                 if (ip->i_d.di_nlink != 2) {
1283                         error = XFS_ERROR(ENOTEMPTY);
1284                         goto out_trans_cancel;
1285                 }
1286                 if (!xfs_dir_isempty(ip)) {
1287                         error = XFS_ERROR(ENOTEMPTY);
1288                         goto out_trans_cancel;
1289                 }
1290         }
1291
1292         xfs_bmap_init(&free_list, &first_block);
1293         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
1294                                         &first_block, &free_list, resblks);
1295         if (error) {
1296                 ASSERT(error != ENOENT);
1297                 goto out_bmap_cancel;
1298         }
1299         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1300
1301         if (is_dir) {
1302                 /*
1303                  * Drop the link from ip's "..".
1304                  */
1305                 error = xfs_droplink(tp, dp);
1306                 if (error)
1307                         goto out_bmap_cancel;
1308
1309                 /*
1310                  * Drop the "." link from ip to self.
1311                  */
1312                 error = xfs_droplink(tp, ip);
1313                 if (error)
1314                         goto out_bmap_cancel;
1315         } else {
1316                 /*
1317                  * When removing a non-directory we need to log the parent
1318                  * inode here.  For a directory this is done implicitly
1319                  * by the xfs_droplink call for the ".." entry.
1320                  */
1321                 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1322         }
1323
1324         /*
1325          * Drop the link from dp to ip.
1326          */
1327         error = xfs_droplink(tp, ip);
1328         if (error)
1329                 goto out_bmap_cancel;
1330
1331         /*
1332          * Determine if this is the last link while
1333          * we are in the transaction.
1334          */
1335         link_zero = (ip->i_d.di_nlink == 0);
1336
1337         /*
1338          * If this is a synchronous mount, make sure that the
1339          * remove transaction goes to disk before returning to
1340          * the user.
1341          */
1342         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1343                 xfs_trans_set_sync(tp);
1344
1345         error = xfs_bmap_finish(&tp, &free_list, &committed);
1346         if (error)
1347                 goto out_bmap_cancel;
1348
1349         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1350         if (error)
1351                 goto std_return;
1352
1353         /*
1354          * If we are using filestreams, kill the stream association.
1355          * If the file is still open it may get a new one but that
1356          * will get killed on last close in xfs_release() so we don't
1357          * have to worry about that.
1358          */
1359         if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
1360                 xfs_filestream_deassociate(ip);
1361
1362         return 0;
1363
1364  out_bmap_cancel:
1365         xfs_bmap_cancel(&free_list);
1366         cancel_flags |= XFS_TRANS_ABORT;
1367  out_trans_cancel:
1368         xfs_trans_cancel(tp, cancel_flags);
1369  std_return:
1370         return error;
1371 }
1372
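/*
 * Create a new hard link to 'sip' named 'target_name' in the directory
 * 'tdp'.
 */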
1373 int
1374 xfs_link(
1375         xfs_inode_t             *tdp,
1376         xfs_inode_t             *sip,
1377         struct xfs_name         *target_name)
1378 {
1379         xfs_mount_t             *mp = tdp->i_mount;
1380         xfs_trans_t             *tp;
1381         int                     error;
1382         xfs_bmap_free_t         free_list;
1383         xfs_fsblock_t           first_block;
1384         int                     cancel_flags;
1385         int                     committed;
1386         int                     resblks;
1387
1388         trace_xfs_link(tdp, target_name);
1389
1390         ASSERT(!S_ISDIR(sip->i_d.di_mode));
1391
1392         if (XFS_FORCED_SHUTDOWN(mp))
1393                 return XFS_ERROR(EIO);
1394
1395         error = xfs_qm_dqattach(sip, 0);
1396         if (error)
1397                 goto std_return;
1398
1399         error = xfs_qm_dqattach(tdp, 0);
1400         if (error)
1401                 goto std_return;
1402
1403         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1404         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1405         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1406         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
1407                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1408         if (error == ENOSPC) {
1409                 resblks = 0;
1410                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
1411                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1412         }
1413         if (error) {
1414                 cancel_flags = 0;
1415                 goto error_return;
1416         }
1417
1418         xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1419
1420         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1421         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1422
1423         /*
1424          * If we are using project inheritance, we only allow hard link
1425          * creation in our tree when the project IDs are the same; else
1426          * the tree quota mechanism could be circumvented.
1427          */
1428         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1429                      (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1430                 error = XFS_ERROR(EXDEV);
1431                 goto error_return;
1432         }
1433
1434         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1435         if (error)
1436                 goto error_return;
1437
1438         xfs_bmap_init(&free_list, &first_block);
1439
1440         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1441                                         &first_block, &free_list, resblks);
1442         if (error)
1443                 goto abort_return;
1444         xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1445         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1446
1447         error = xfs_bumplink(tp, sip);
1448         if (error)
1449                 goto abort_return;
1450
1451         /*
1452          * If this is a synchronous mount, make sure that the
1453          * link transaction goes to disk before returning to
1454          * the user.
1455          */
1456         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1457                 xfs_trans_set_sync(tp);
1458         }
1459
1460         error = xfs_bmap_finish (&tp, &free_list, &committed);
1461         if (error) {
1462                 xfs_bmap_cancel(&free_list);
1463                 goto abort_return;
1464         }
1465
1466         return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1467
1468  abort_return:
1469         cancel_flags |= XFS_TRANS_ABORT;
1470  error_return:
1471         xfs_trans_cancel(tp, cancel_flags);
1472  std_return:
1473         return error;
1474 }
1475
1476 int
1477 xfs_symlink(
1478         xfs_inode_t             *dp,
1479         struct xfs_name         *link_name,
1480         const char              *target_path,
1481         umode_t                 mode,
1482         xfs_inode_t             **ipp)
1483 {
1484         xfs_mount_t             *mp = dp->i_mount;
1485         xfs_trans_t             *tp;
1486         xfs_inode_t             *ip;
1487         int                     error;
1488         int                     pathlen;
1489         xfs_bmap_free_t         free_list;
1490         xfs_fsblock_t           first_block;
1491         boolean_t               unlock_dp_on_error = B_FALSE;
1492         uint                    cancel_flags;
1493         int                     committed;
1494         xfs_fileoff_t           first_fsb;
1495         xfs_filblks_t           fs_blocks;
1496         int                     nmaps;
1497         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
1498         xfs_daddr_t             d;
1499         const char              *cur_chunk;
1500         int                     byte_cnt;
1501         int                     n;
1502         xfs_buf_t               *bp;
1503         prid_t                  prid;
1504         struct xfs_dquot        *udqp, *gdqp;
1505         uint                    resblks;
1506
1507         *ipp = NULL;
1508         error = 0;
1509         ip = NULL;
1510         tp = NULL;
1511
1512         trace_xfs_symlink(dp, link_name);
1513
1514         if (XFS_FORCED_SHUTDOWN(mp))
1515                 return XFS_ERROR(EIO);
1516
1517         /*
1518          * Check the total length of the target path name.
1519          */
1520         pathlen = strlen(target_path);
1521         if (pathlen >= MAXPATHLEN)      /* total string too long */
1522                 return XFS_ERROR(ENAMETOOLONG);
1523
1524         udqp = gdqp = NULL;
1525         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1526                 prid = xfs_get_projid(dp);
1527         else
1528                 prid = XFS_PROJID_DEFAULT;
1529
1530         /*
1531          * Make sure that we have allocated dquot(s) on disk.
1532          */
1533         error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1534                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1535         if (error)
1536                 goto std_return;
1537
1538         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
1539         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1540         /*
1541          * Will the symlink fit in the inode data fork?  There can't be
1542          * any attributes yet, so the whole variable part is available.
1543          */
1544         if (pathlen <= XFS_LITINO(mp))
1545                 fs_blocks = 0;
1546         else
1547                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
1548         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
1549         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
1550                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
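        /*
         * If the worst-case reservation fails and the symlink target fits
         * inline (fs_blocks == 0), retry without a block reservation; we
         * may still succeed if the directory entry fits in existing blocks.
         */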
1551         if (error == ENOSPC && fs_blocks == 0) {
1552                 resblks = 0;
1553                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
1554                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
1555         }
1556         if (error) {
1557                 cancel_flags = 0;
1558                 goto error_return;
1559         }
1560
1561         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1562         unlock_dp_on_error = B_TRUE;
1563
1564         /*
1565          * Check whether the directory allows new symlinks or not.
1566          */
1567         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
1568                 error = XFS_ERROR(EPERM);
1569                 goto error_return;
1570         }
1571
1572         /*
1573          * Reserve disk quota: blocks and inode.
1574          */
1575         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
1576         if (error)
1577                 goto error_return;
1578
1579         /*
1580          * Check for ability to enter directory entry, if no space reserved.
1581          */
1582         error = xfs_dir_canenter(tp, dp, link_name, resblks);
1583         if (error)
1584                 goto error_return;
1585         /*
1586          * Initialize the bmap freelist prior to calling either
1587          * bmapi or the directory create code.
1588          */
1589         xfs_bmap_init(&free_list, &first_block);
1590
1591         /*
1592          * Allocate an inode for the symlink.
1593          */
1594         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
1595                                prid, resblks > 0, &ip, NULL);
1596         if (error) {
1597                 if (error == ENOSPC)
1598                         goto error_return;
1599                 goto error1;
1600         }
1601
1602         /*
1603          * An error after we've joined dp to the transaction will result in the
1604          * transaction cancel unlocking dp so don't do it explicitly in the
1605          * error path.
1606          */
1607         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1608         unlock_dp_on_error = B_FALSE;
1609
1610         /*
1611          * Also attach the dquot(s) to it, if applicable.
1612          */
1613         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1614
1615         if (resblks)
1616                 resblks -= XFS_IALLOC_SPACE_RES(mp);
1617         /*
1618          * If the symlink will fit into the inode, write it inline.
1619          */
1620         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
1621                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
1622                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
1623                 ip->i_d.di_size = pathlen;
1624
1625                 /*
1626                  * The inode was initially created in extent format.
1627                  */
1628                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
1629                 ip->i_df.if_flags |= XFS_IFINLINE;
1630
1631                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
1632                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
1633
1634         } else {
1635                 first_fsb = 0;
1636                 nmaps = SYMLINK_MAPS;
1637
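                /*
                 * The target is too long to store inline, so allocate up to
                 * SYMLINK_MAPS extents to hold it outside the inode.
                 */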
1638                 error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
1639                                   XFS_BMAPI_METADATA, &first_block, resblks,
1640                                   mval, &nmaps, &free_list);
1641                 if (error)
1642                         goto error2;
1643
1644                 if (resblks)
1645                         resblks -= fs_blocks;
1646                 ip->i_d.di_size = pathlen;
1647                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1648
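                /*
                 * Copy the target path into the newly allocated blocks,
                 * logging each buffer as it is filled.
                 */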
1649                 cur_chunk = target_path;
1650                 for (n = 0; n < nmaps; n++) {
1651                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1652                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1653                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
1654                                                BTOBB(byte_cnt), 0);
1655                         if (!bp) {
1656                                 error = ENOMEM;
1657                                 goto error2;
1658                         }
1659                         if (pathlen < byte_cnt) {
1660                                 byte_cnt = pathlen;
1661                         }
1662                         pathlen -= byte_cnt;
1663
1664                         memcpy(bp->b_addr, cur_chunk, byte_cnt);
1665                         cur_chunk += byte_cnt;
1666
1667                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
1668                 }
1669         }
1670
1671         /*
1672          * Create the directory entry for the symlink.
1673          */
1674         error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
1675                                         &first_block, &free_list, resblks);
1676         if (error)
1677                 goto error2;
1678         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1679         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1680
1681         /*
1682          * If this is a synchronous mount, make sure that the
1683          * symlink transaction goes to disk before returning to
1684          * the user.
1685          */
1686         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1687                 xfs_trans_set_sync(tp);
1688         }
1689
1690         error = xfs_bmap_finish(&tp, &free_list, &committed);
1691         if (error) {
1692                 goto error2;
1693         }
1694         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1695         xfs_qm_dqrele(udqp);
1696         xfs_qm_dqrele(gdqp);
1697
1698         *ipp = ip;
1699         return 0;
1700
1701  error2:
1702         IRELE(ip);
1703  error1:
1704         xfs_bmap_cancel(&free_list);
1705         cancel_flags |= XFS_TRANS_ABORT;
1706  error_return:
1707         xfs_trans_cancel(tp, cancel_flags);
1708         xfs_qm_dqrele(udqp);
1709         xfs_qm_dqrele(gdqp);
1710
1711         if (unlock_dp_on_error)
1712                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1713  std_return:
1714         return error;
1715 }
1716
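/*
 * Update the DMAPI event mask and state fields of an inode in a small
 * transaction.  Used by the DMAPI set-attributes ioctl paths.
 */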
1717 int
1718 xfs_set_dmattrs(
1719         xfs_inode_t     *ip,
1720         u_int           evmask,
1721         u_int16_t       state)
1722 {
1723         xfs_mount_t     *mp = ip->i_mount;
1724         xfs_trans_t     *tp;
1725         int             error;
1726
1727         if (!capable(CAP_SYS_ADMIN))
1728                 return XFS_ERROR(EPERM);
1729
1730         if (XFS_FORCED_SHUTDOWN(mp))
1731                 return XFS_ERROR(EIO);
1732
1733         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
1734         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
1735         if (error) {
1736                 xfs_trans_cancel(tp, 0);
1737                 return error;
1738         }
1739         xfs_ilock(ip, XFS_ILOCK_EXCL);
1740         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1741
1742         ip->i_d.di_dmevmask = evmask;
1743         ip->i_d.di_dmstate  = state;
1744
1745         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1746         error = xfs_trans_commit(tp, 0);
1747
1748         return error;
1749 }
1750
1751 /*
1752  * xfs_alloc_file_space()
1753  *      This routine allocates disk space for the given file.
1754  *
1755  *      If alloc_type == 0, this request is for an ALLOCSP type
1756  *      request which will change the file size.  In this case, no
1757  *      DMAPI event will be generated by the call.  A TRUNCATE event
1758  *      will be generated later by xfs_setattr.
1759  *
1760  *      If alloc_type != 0, this request is for a RESVSP type
1761  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
1762  *      lower block boundary byte address is less than the file's
1763  *      length.
1764  *
1765  * RETURNS:
1766  *       0 on success
1767  *      errno on error
1768  *
1769  */
1770 STATIC int
1771 xfs_alloc_file_space(
1772         xfs_inode_t             *ip,
1773         xfs_off_t               offset,
1774         xfs_off_t               len,
1775         int                     alloc_type,
1776         int                     attr_flags)
1777 {
1778         xfs_mount_t             *mp = ip->i_mount;
1779         xfs_off_t               count;
1780         xfs_filblks_t           allocated_fsb;
1781         xfs_filblks_t           allocatesize_fsb;
1782         xfs_extlen_t            extsz, temp;
1783         xfs_fileoff_t           startoffset_fsb;
1784         xfs_fsblock_t           firstfsb;
1785         int                     nimaps;
1786         int                     quota_flag;
1787         int                     rt;
1788         xfs_trans_t             *tp;
1789         xfs_bmbt_irec_t         imaps[1], *imapp;
1790         xfs_bmap_free_t         free_list;
1791         uint                    qblocks, resblks, resrtextents;
1792         int                     committed;
1793         int                     error;
1794
1795         trace_xfs_alloc_file_space(ip);
1796
1797         if (XFS_FORCED_SHUTDOWN(mp))
1798                 return XFS_ERROR(EIO);
1799
1800         error = xfs_qm_dqattach(ip, 0);
1801         if (error)
1802                 return error;
1803
1804         if (len <= 0)
1805                 return XFS_ERROR(EINVAL);
1806
1807         rt = XFS_IS_REALTIME_INODE(ip);
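        /*
         * The extent size hint, if set, is used below to round the start
         * and length of each allocation so the space reservation covers
         * the aligned extent.
         */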
1808         extsz = xfs_get_extsz_hint(ip);
1809
1810         count = len;
1811         imapp = &imaps[0];
1812         nimaps = 1;
1813         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1814         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1815
1816         /*
1817          * Allocate file space until done or until there is an error
1818          */
1819         while (allocatesize_fsb && !error) {
1820                 xfs_fileoff_t   s, e;
1821
1822                 /*
1823                  * Determine space reservations for data/realtime.
1824                  */
1825                 if (unlikely(extsz)) {
1826                         s = startoffset_fsb;
1827                         do_div(s, extsz);
1828                         s *= extsz;
1829                         e = startoffset_fsb + allocatesize_fsb;
1830                         if ((temp = do_mod(startoffset_fsb, extsz)))
1831                                 e += temp;
1832                         if ((temp = do_mod(e, extsz)))
1833                                 e += extsz - temp;
1834                 } else {
1835                         s = 0;
1836                         e = allocatesize_fsb;
1837                 }
1838
1839                 /*
1840                  * The transaction reservation is limited to a 32-bit block
1841                  * count, hence we need to limit the number of blocks we are
1842                  * trying to reserve to avoid an overflow. We can't allocate
1843                  * more than @nimaps extents, and an extent is limited on disk
1844                  * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1845                  */
1846                 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
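                /*
                 * For realtime files the data comes out of the realtime
                 * device, so reserve realtime extents for it and only a
                 * small amount of data-device space for metadata; otherwise
                 * reserve everything on the data device.
                 */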
1847                 if (unlikely(rt)) {
1848                         resrtextents = qblocks = resblks;
1849                         resrtextents /= mp->m_sb.sb_rextsize;
1850                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1851                         quota_flag = XFS_QMOPT_RES_RTBLKS;
1852                 } else {
1853                         resrtextents = 0;
1854                         resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1855                         quota_flag = XFS_QMOPT_RES_REGBLKS;
1856                 }
1857
1858                 /*
1859                  * Allocate and set up the transaction.
1860                  */
1861                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1862                 error = xfs_trans_reserve(tp, resblks,
1863                                           XFS_WRITE_LOG_RES(mp), resrtextents,
1864                                           XFS_TRANS_PERM_LOG_RES,
1865                                           XFS_WRITE_LOG_COUNT);
1866                 /*
1867                  * Check for running out of space
1868                  */
1869                 if (error) {
1870                         /*
1871                          * Free the transaction structure.
1872                          */
1873                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1874                         xfs_trans_cancel(tp, 0);
1875                         break;
1876                 }
1877                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1878                 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1879                                                       0, quota_flag);
1880                 if (error)
1881                         goto error1;
1882
1883                 xfs_trans_ijoin(tp, ip, 0);
1884
1885                 xfs_bmap_init(&free_list, &firstfsb);
1886                 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1887                                         allocatesize_fsb, alloc_type, &firstfsb,
1888                                         0, imapp, &nimaps, &free_list);
1889                 if (error) {
1890                         goto error0;
1891                 }
1892
1893                 /*
1894                  * Complete the transaction
1895                  */
1896                 error = xfs_bmap_finish(&tp, &free_list, &committed);
1897                 if (error) {
1898                         goto error0;
1899                 }
1900
1901                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1902                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1903                 if (error) {
1904                         break;
1905                 }
1906
1907                 allocated_fsb = imapp->br_blockcount;
1908
1909                 if (nimaps == 0) {
1910                         error = XFS_ERROR(ENOSPC);
1911                         break;
1912                 }
1913
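                /*
                 * xfs_bmapi_write() may allocate less than we asked for, so
                 * advance past what was mapped and loop for the rest.
                 */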
1914                 startoffset_fsb += allocated_fsb;
1915                 allocatesize_fsb -= allocated_fsb;
1916         }
1917
1918         return error;
1919
1920 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1921         xfs_bmap_cancel(&free_list);
1922         xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
1923
1924 error1: /* Just cancel transaction */
1925         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1926         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1927         return error;
1928 }
1929
1930 /*
1931  * Zero file bytes between startoff and endoff inclusive.
1932  * The iolock is held exclusive and no blocks are buffered.
1933  *
1934  * This function is used by xfs_free_file_space() to zero
1935  * partial blocks when the range to free is not block aligned.
1936  * When unreserving space with boundaries that are not block
1937  * aligned we round up the start and round down the end
1938  * boundaries and then use this function to zero the parts of
1939  * the blocks that got dropped during the rounding.
1940  */
1941 STATIC int
1942 xfs_zero_remaining_bytes(
1943         xfs_inode_t             *ip,
1944         xfs_off_t               startoff,
1945         xfs_off_t               endoff)
1946 {
1947         xfs_bmbt_irec_t         imap;
1948         xfs_fileoff_t           offset_fsb;
1949         xfs_off_t               lastoffset;
1950         xfs_off_t               offset;
1951         xfs_buf_t               *bp;
1952         xfs_mount_t             *mp = ip->i_mount;
1953         int                     nimap;
1954         int                     error = 0;
1955
1956         /*
1957          * Avoid doing I/O beyond eof - it's not necessary
1958          * since nothing can read beyond eof.  The space will
1959          * be zeroed when the file is extended anyway.
1960          */
1961         if (startoff >= XFS_ISIZE(ip))
1962                 return 0;
1963
1964         if (endoff > XFS_ISIZE(ip))
1965                 endoff = XFS_ISIZE(ip);
1966
1967         bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1968                                         mp->m_rtdev_targp : mp->m_ddev_targp,
1969                                 mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
1970         if (!bp)
1971                 return XFS_ERROR(ENOMEM);
1972
1973         xfs_buf_unlock(bp);
1974
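        /*
         * Walk the range a block at a time: skip holes and unwritten
         * extents, and for written blocks read the block, zero the
         * relevant bytes and write it straight back out.
         */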
1975         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1976                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1977                 nimap = 1;
1978                 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1979                 if (error || nimap < 1)
1980                         break;
1981                 ASSERT(imap.br_blockcount >= 1);
1982                 ASSERT(imap.br_startoff == offset_fsb);
1983                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1984                 if (lastoffset > endoff)
1985                         lastoffset = endoff;
1986                 if (imap.br_startblock == HOLESTARTBLOCK)
1987                         continue;
1988                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1989                 if (imap.br_state == XFS_EXT_UNWRITTEN)
1990                         continue;
1991                 XFS_BUF_UNDONE(bp);
1992                 XFS_BUF_UNWRITE(bp);
1993                 XFS_BUF_READ(bp);
1994                 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
1995                 xfsbdstrat(mp, bp);
1996                 error = xfs_buf_iowait(bp);
1997                 if (error) {
1998                         xfs_buf_ioerror_alert(bp,
1999                                         "xfs_zero_remaining_bytes(read)");
2000                         break;
2001                 }
2002                 memset(bp->b_addr +
2003                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
2004                       0, lastoffset - offset + 1);
2005                 XFS_BUF_UNDONE(bp);
2006                 XFS_BUF_UNREAD(bp);
2007                 XFS_BUF_WRITE(bp);
2008                 xfsbdstrat(mp, bp);
2009                 error = xfs_buf_iowait(bp);
2010                 if (error) {
2011                         xfs_buf_ioerror_alert(bp,
2012                                         "xfs_zero_remaining_bytes(write)");
2013                         break;
2014                 }
2015         }
2016         xfs_buf_free(bp);
2017         return error;
2018 }
2019
2020 /*
2021  * xfs_free_file_space()
2022  *      This routine frees disk space for the given file.
2023  *
2024  *      This routine is only called by xfs_change_file_space
2025  *      for an UNRESVSP type call.
2026  *
2027  * RETURNS:
2028  *       0 on success
2029  *      errno on error
2030  *
2031  */
2032 STATIC int
2033 xfs_free_file_space(
2034         xfs_inode_t             *ip,
2035         xfs_off_t               offset,
2036         xfs_off_t               len,
2037         int                     attr_flags)
2038 {
2039         int                     committed;
2040         int                     done;
2041         xfs_fileoff_t           endoffset_fsb;
2042         int                     error;
2043         xfs_fsblock_t           firstfsb;
2044         xfs_bmap_free_t         free_list;
2045         xfs_bmbt_irec_t         imap;
2046         xfs_off_t               ioffset;
2047         xfs_extlen_t            mod = 0;
2048         xfs_mount_t             *mp;
2049         int                     nimap;
2050         uint                    resblks;
2051         uint                    rounding;
2052         int                     rt;
2053         xfs_fileoff_t           startoffset_fsb;
2054         xfs_trans_t             *tp;
2055         int                     need_iolock = 1;
2056
2057         mp = ip->i_mount;
2058
2059         trace_xfs_free_file_space(ip);
2060
2061         error = xfs_qm_dqattach(ip, 0);
2062         if (error)
2063                 return error;
2064
2065         error = 0;
2066         if (len <= 0)   /* if nothing being freed */
2067                 return error;
2068         rt = XFS_IS_REALTIME_INODE(ip);
2069         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
2070         endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
2071
2072         if (attr_flags & XFS_ATTR_NOLOCK)
2073                 need_iolock = 0;
2074         if (need_iolock) {
2075                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2076                 /* wait for the completion of any pending DIOs */
2077                 inode_dio_wait(VFS_I(ip));
2078         }
2079
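        /*
         * Flush and invalidate any cached pages from the rounded-down start
         * offset to end of file so stale data is not left in the page cache.
         */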
2080         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
2081         ioffset = offset & ~(rounding - 1);
2082
2083         if (VN_CACHED(VFS_I(ip)) != 0) {
2084                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
2085                 if (error)
2086                         goto out_unlock_iolock;
2087         }
2088
2089         /*
2090          * We need to zero, on disk, the parts of the blocks that we are
2091          * not freeing.  If this is a realtime file and we can't use
2092          * unwritten extents, we have to zero the extent edges ourselves;
2093          * otherwise xfs_bunmapi will take care of it for us.
2094          */
2095         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
2096                 nimap = 1;
2097                 error = xfs_bmapi_read(ip, startoffset_fsb, 1,
2098                                         &imap, &nimap, 0);
2099                 if (error)
2100                         goto out_unlock_iolock;
2101                 ASSERT(nimap == 0 || nimap == 1);
2102                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
2103                         xfs_daddr_t     block;
2104
2105                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2106                         block = imap.br_startblock;
2107                         mod = do_div(block, mp->m_sb.sb_rextsize);
2108                         if (mod)
2109                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
2110                 }
2111                 nimap = 1;
2112                 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
2113                                         &imap, &nimap, 0);
2114                 if (error)
2115                         goto out_unlock_iolock;
2116                 ASSERT(nimap == 0 || nimap == 1);
2117                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
2118                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2119                         mod++;
2120                         if (mod && (mod != mp->m_sb.sb_rextsize))
2121                                 endoffset_fsb -= mod;
2122                 }
2123         }
2124         if ((done = (endoffset_fsb <= startoffset_fsb)))
2125                 /*
2126                  * One contiguous piece to clear
2127                  */
2128                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
2129         else {
2130                 /*
2131                  * Some full blocks, possibly two pieces to clear
2132                  */
2133                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
2134                         error = xfs_zero_remaining_bytes(ip, offset,
2135                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
2136                 if (!error &&
2137                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
2138                         error = xfs_zero_remaining_bytes(ip,
2139                                 XFS_FSB_TO_B(mp, endoffset_fsb),
2140                                 offset + len - 1);
2141         }
2142
2143         /*
2144          * free file space until done or until there is an error
2145          */
2146         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
2147         while (!error && !done) {
2148
2149                 /*
2150                  * Allocate and set up the transaction.  Allow this
2151                  * transaction to dip into the reserve blocks to ensure
2152                  * the freeing of the space succeeds at ENOSPC.
2153                  */
2154                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
2155                 tp->t_flags |= XFS_TRANS_RESERVE;
2156                 error = xfs_trans_reserve(tp,
2157                                           resblks,
2158                                           XFS_WRITE_LOG_RES(mp),
2159                                           0,
2160                                           XFS_TRANS_PERM_LOG_RES,
2161                                           XFS_WRITE_LOG_COUNT);
2162
2163                 /*
2164                  * check for running out of space
2165                  */
2166                 if (error) {
2167                         /*
2168                          * Free the transaction structure.
2169                          */
2170                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
2171                         xfs_trans_cancel(tp, 0);
2172                         break;
2173                 }
2174                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2175                 error = xfs_trans_reserve_quota(tp, mp,
2176                                 ip->i_udquot, ip->i_gdquot,
2177                                 resblks, 0, XFS_QMOPT_RES_REGBLKS);
2178                 if (error)
2179                         goto error1;
2180
2181                 xfs_trans_ijoin(tp, ip, 0);
2182
2183                 /*
2184                  * issue the bunmapi() call to free the blocks
2185                  */
2186                 xfs_bmap_init(&free_list, &firstfsb);
2187                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
2188                                   endoffset_fsb - startoffset_fsb,
2189                                   0, 2, &firstfsb, &free_list, &done);
2190                 if (error) {
2191                         goto error0;
2192                 }
2193
2194                 /*
2195                  * complete the transaction
2196                  */
2197                 error = xfs_bmap_finish(&tp, &free_list, &committed);
2198                 if (error) {
2199                         goto error0;
2200                 }
2201
2202                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2203                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2204         }
2205
2206  out_unlock_iolock:
2207         if (need_iolock)
2208                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2209         return error;
2210
2211  error0:
2212         xfs_bmap_cancel(&free_list);
2213  error1:
2214         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
2215         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
2216                     XFS_ILOCK_EXCL);
2217         return error;
2218 }
2219
2220 /*
2221  * xfs_change_file_space()
2222  *      This routine allocates or frees disk space for the given file.
2223  *      The user specified parameters are checked for alignment and size
2224  *      limitations.
2225  *
2226  * RETURNS:
2227  *       0 on success
2228  *      errno on error
2229  *
2230  */
2231 int
2232 xfs_change_file_space(
2233         xfs_inode_t     *ip,
2234         int             cmd,
2235         xfs_flock64_t   *bf,
2236         xfs_off_t       offset,
2237         int             attr_flags)
2238 {
2239         xfs_mount_t     *mp = ip->i_mount;
2240         int             clrprealloc;
2241         int             error;
2242         xfs_fsize_t     fsize;
2243         int             setprealloc;
2244         xfs_off_t       startoffset;
2245         xfs_off_t       llen;
2246         xfs_trans_t     *tp;
2247         struct iattr    iattr;
2248         int             prealloc_type;
2249
2250         if (!S_ISREG(ip->i_d.di_mode))
2251                 return XFS_ERROR(EINVAL);
2252
2253         switch (bf->l_whence) {
2254         case 0: /*SEEK_SET*/
2255                 break;
2256         case 1: /*SEEK_CUR*/
2257                 bf->l_start += offset;
2258                 break;
2259         case 2: /*SEEK_END*/
2260                 bf->l_start += XFS_ISIZE(ip);
2261                 break;
2262         default:
2263                 return XFS_ERROR(EINVAL);
2264         }
2265
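        /*
         * bf->l_start is now an absolute byte offset; make sure the range
         * it describes lies within the maximum supported file offset.
         */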
2266         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
2267
2268         if (   (bf->l_start < 0)
2269             || (bf->l_start > XFS_MAXIOFFSET(mp))
2270             || (bf->l_start + llen < 0)
2271             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
2272                 return XFS_ERROR(EINVAL);
2273
2274         bf->l_whence = 0;
2275
2276         startoffset = bf->l_start;
2277         fsize = XFS_ISIZE(ip);
2278
2279         /*
2280          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
2281          * file space.
2282          * These calls do NOT zero the data space allocated to the file,
2283          * nor do they change the file size.
2284          *
2285          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
2286          * space.
2287          * These calls cause the new file data to be zeroed and the file
2288          * size to be changed.
2289          */
2290         setprealloc = clrprealloc = 0;
2291         prealloc_type = XFS_BMAPI_PREALLOC;
2292
2293         switch (cmd) {
2294         case XFS_IOC_ZERO_RANGE:
2295                 prealloc_type |= XFS_BMAPI_CONVERT;
2296                 xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
2297                 /* FALLTHRU */
2298         case XFS_IOC_RESVSP:
2299         case XFS_IOC_RESVSP64:
2300                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2301                                                 prealloc_type, attr_flags);
2302                 if (error)
2303                         return error;
2304                 setprealloc = 1;
2305                 break;
2306
2307         case XFS_IOC_UNRESVSP:
2308         case XFS_IOC_UNRESVSP64:
2309                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
2310                                                                 attr_flags)))
2311                         return error;
2312                 break;
2313
2314         case XFS_IOC_ALLOCSP:
2315         case XFS_IOC_ALLOCSP64:
2316         case XFS_IOC_FREESP:
2317         case XFS_IOC_FREESP64:
2318                 if (startoffset > fsize) {
2319                         error = xfs_alloc_file_space(ip, fsize,
2320                                         startoffset - fsize, 0, attr_flags);
2321                         if (error)
2322                                 break;
2323                 }
2324
2325                 iattr.ia_valid = ATTR_SIZE;
2326                 iattr.ia_size = startoffset;
2327
2328                 error = xfs_setattr_size(ip, &iattr, attr_flags);
2329
2330                 if (error)
2331                         return error;
2332
2333                 clrprealloc = 1;
2334                 break;
2335
2336         default:
2337                 ASSERT(0);
2338                 return XFS_ERROR(EINVAL);
2339         }
2340
2341         /*
2342          * update the inode timestamp, mode, and prealloc flag bits
2343          */
2344         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
2345
2346         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
2347                                       0, 0, 0))) {
2348                 /* ASSERT(0); */
2349                 xfs_trans_cancel(tp, 0);
2350                 return error;
2351         }
2352
2353         xfs_ilock(ip, XFS_ILOCK_EXCL);
2354         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2355
2356         if ((attr_flags & XFS_ATTR_DMI) == 0) {
2357                 ip->i_d.di_mode &= ~S_ISUID;
2358
2359                 /*
2360                  * Note that we don't have to worry about mandatory
2361                  * file locking being disabled here because we only
2362                  * clear the S_ISGID bit if the Group execute bit is
2363                  * on, but if it was on then mandatory locking wouldn't
2364                  * have been enabled.
2365                  */
2366                 if (ip->i_d.di_mode & S_IXGRP)
2367                         ip->i_d.di_mode &= ~S_ISGID;
2368
2369                 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2370         }
2371         if (setprealloc)
2372                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
2373         else if (clrprealloc)
2374                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
2375
2376         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2377         if (attr_flags & XFS_ATTR_SYNC)
2378                 xfs_trans_set_sync(tp);
2379         return xfs_trans_commit(tp, 0);
2380 }