1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_types.h"
22 #include "xfs_bit.h"
23 #include "xfs_log.h"
24 #include "xfs_inum.h"
25 #include "xfs_trans.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_mount.h"
30 #include "xfs_da_btree.h"
31 #include "xfs_bmap_btree.h"
32 #include "xfs_ialloc_btree.h"
33 #include "xfs_dinode.h"
34 #include "xfs_inode.h"
35 #include "xfs_inode_item.h"
36 #include "xfs_itable.h"
37 #include "xfs_ialloc.h"
38 #include "xfs_alloc.h"
39 #include "xfs_bmap.h"
40 #include "xfs_acl.h"
41 #include "xfs_attr.h"
42 #include "xfs_rw.h"
43 #include "xfs_error.h"
44 #include "xfs_quota.h"
45 #include "xfs_utils.h"
46 #include "xfs_rtalloc.h"
47 #include "xfs_trans_space.h"
48 #include "xfs_log_priv.h"
49 #include "xfs_filestream.h"
50 #include "xfs_vnodeops.h"
51 #include "xfs_trace.h"
52
53 /*
54  * The maximum pathlen is 1024 bytes. Since the minimum file system
55  * blocksize is 512 bytes, we can get a max of 2 extents back from
56  * bmapi.
57  */
58 #define SYMLINK_MAPS 2
59
60 STATIC int
61 xfs_readlink_bmap(
62         xfs_inode_t     *ip,
63         char            *link)
64 {
65         xfs_mount_t     *mp = ip->i_mount;
66         int             pathlen = ip->i_d.di_size;
67         int             nmaps = SYMLINK_MAPS;
68         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
69         xfs_daddr_t     d;
70         int             byte_cnt;
71         int             n;
72         xfs_buf_t       *bp;
73         int             error = 0;
74
75         error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
76                                0);
77         if (error)
78                 goto out;
79
80         for (n = 0; n < nmaps; n++) {
81                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
82                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
83
84                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
85                                   XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
86                 if (!bp)
87                         return XFS_ERROR(ENOMEM);
88                 error = bp->b_error;
89                 if (error) {
90                         xfs_buf_ioerror_alert(bp, __func__);
91                         xfs_buf_relse(bp);
92                         goto out;
93                 }
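                   /*
                    * The last extent is rounded up to a whole filesystem block,
                    * so it may extend past the end of the link text; copy no
                    * more than the bytes that remain in the path.
                    */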
94                 if (pathlen < byte_cnt)
95                         byte_cnt = pathlen;
96                 pathlen -= byte_cnt;
97
98                 memcpy(link, bp->b_addr, byte_cnt);
99                 xfs_buf_relse(bp);
100         }
101
102         link[ip->i_d.di_size] = '\0';
103         error = 0;
104
105  out:
106         return error;
107 }
108
109 int
110 xfs_readlink(
111         xfs_inode_t     *ip,
112         char            *link)
113 {
114         xfs_mount_t     *mp = ip->i_mount;
115         xfs_fsize_t     pathlen;
116         int             error = 0;
117
118         trace_xfs_readlink(ip);
119
120         if (XFS_FORCED_SHUTDOWN(mp))
121                 return XFS_ERROR(EIO);
122
123         xfs_ilock(ip, XFS_ILOCK_SHARED);
124
125         pathlen = ip->i_d.di_size;
126         if (!pathlen)
127                 goto out;
128
129         if (pathlen < 0 || pathlen > MAXPATHLEN) {
130                 xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
131                          __func__, (unsigned long long) ip->i_ino,
132                          (long long) pathlen);
133                 ASSERT(0);
134                 return XFS_ERROR(EFSCORRUPTED);
135         }
136
137
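            /*
             * Short symlinks are stored inline in the data fork; longer ones
             * have to be read back from the disk blocks mapped by the fork.
             */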
138         if (ip->i_df.if_flags & XFS_IFINLINE) {
139                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
140                 link[pathlen] = '\0';
141         } else {
142                 error = xfs_readlink_bmap(ip, link);
143         }
144
145  out:
146         xfs_iunlock(ip, XFS_ILOCK_SHARED);
147         return error;
148 }
149
150 /*
151  * Flags for xfs_free_eofblocks
152  */
153 #define XFS_FREE_EOF_TRYLOCK    (1<<0)
154
155 /*
156  * This is called by xfs_inactive to free any blocks beyond eof
157  * when the link count isn't zero and by xfs_dm_punch_hole() when
158  * punching a hole to EOF.
159  */
160 STATIC int
161 xfs_free_eofblocks(
162         xfs_mount_t     *mp,
163         xfs_inode_t     *ip,
164         int             flags)
165 {
166         xfs_trans_t     *tp;
167         int             error;
168         xfs_fileoff_t   end_fsb;
169         xfs_fileoff_t   last_fsb;
170         xfs_filblks_t   map_len;
171         int             nimaps;
172         xfs_bmbt_irec_t imap;
173
174         /*
175          * Figure out if there are any blocks beyond the end
176          * of the file.  If not, then there is nothing to do.
177          */
178         end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
179         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
180         if (last_fsb <= end_fsb)
181                 return 0;
182         map_len = last_fsb - end_fsb;
183
184         nimaps = 1;
185         xfs_ilock(ip, XFS_ILOCK_SHARED);
186         error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
187         xfs_iunlock(ip, XFS_ILOCK_SHARED);
188
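            /*
             * Only start a transaction if the range beyond EOF maps to
             * something other than a hole, or the inode has delayed
             * allocation blocks outstanding; otherwise there is nothing
             * to free.
             */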
189         if (!error && (nimaps != 0) &&
190             (imap.br_startblock != HOLESTARTBLOCK ||
191              ip->i_delayed_blks)) {
192                 /*
193                  * Attach the dquots to the inode up front.
194                  */
195                 error = xfs_qm_dqattach(ip, 0);
196                 if (error)
197                         return error;
198
199                 /*
200                  * There are blocks after the end of file.
201                  * Free them up now by truncating the file to
202                  * its current size.
203                  */
204                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
205
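                   /*
                    * Callers such as xfs_release() pass XFS_FREE_EOF_TRYLOCK
                    * so they never block waiting for the iolock here; if the
                    * lock is contended the trim is simply skipped this time.
                    */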
206                 if (flags & XFS_FREE_EOF_TRYLOCK) {
207                         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
208                                 xfs_trans_cancel(tp, 0);
209                                 return 0;
210                         }
211                 } else {
212                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
213                 }
214
215                 error = xfs_trans_reserve(tp, 0,
216                                           XFS_ITRUNCATE_LOG_RES(mp),
217                                           0, XFS_TRANS_PERM_LOG_RES,
218                                           XFS_ITRUNCATE_LOG_COUNT);
219                 if (error) {
220                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
221                         xfs_trans_cancel(tp, 0);
222                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
223                         return error;
224                 }
225
226                 xfs_ilock(ip, XFS_ILOCK_EXCL);
227                 xfs_trans_ijoin(tp, ip, 0);
228
229                 /*
230                  * Do not update the on-disk file size.  If we update the
231                  * on-disk file size and then the system crashes before the
232  * contents of the file are flushed to disk then the file
233  * may be full of holes (i.e. the NULL files bug).
234                  */
235                 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
236                                               XFS_ISIZE(ip));
237                 if (error) {
238                         /*
239                          * If we get an error at this point we simply don't
240                          * bother truncating the file.
241                          */
242                         xfs_trans_cancel(tp,
243                                          (XFS_TRANS_RELEASE_LOG_RES |
244                                           XFS_TRANS_ABORT));
245                 } else {
246                         error = xfs_trans_commit(tp,
247                                                 XFS_TRANS_RELEASE_LOG_RES);
248                 }
249                 xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
250         }
251         return error;
252 }
253
254 /*
255  * Free a symlink that has blocks associated with it.
256  */
257 STATIC int
258 xfs_inactive_symlink_rmt(
259         xfs_inode_t     *ip,
260         xfs_trans_t     **tpp)
261 {
262         xfs_buf_t       *bp;
263         int             committed;
264         int             done;
265         int             error;
266         xfs_fsblock_t   first_block;
267         xfs_bmap_free_t free_list;
268         int             i;
269         xfs_mount_t     *mp;
270         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
271         int             nmaps;
272         xfs_trans_t     *ntp;
273         int             size;
274         xfs_trans_t     *tp;
275
276         tp = *tpp;
277         mp = ip->i_mount;
278         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
279         /*
280          * We're freeing a symlink that has some
281          * blocks allocated to it.  Free the
282          * blocks here.  We know that we've got
283          * either 1 or 2 extents and that we can
284          * free them all in one bunmapi call.
285          */
286         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
287         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
288                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
289                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
290                 xfs_trans_cancel(tp, 0);
291                 *tpp = NULL;
292                 return error;
293         }
294         /*
295          * Lock the inode, fix the size, and join it to the transaction.
296  * Hold it so that, in the normal path, we still have it locked for
297          * the second transaction.  In the error paths we need it
298          * held so the cancel won't rele it, see below.
299          */
300         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
301         size = (int)ip->i_d.di_size;
302         ip->i_d.di_size = 0;
303         xfs_trans_ijoin(tp, ip, 0);
304         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
305         /*
306          * Find the block(s) so we can inval and unmap them.
307          */
308         done = 0;
309         xfs_bmap_init(&free_list, &first_block);
310         nmaps = ARRAY_SIZE(mval);
311         error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
312                                 mval, &nmaps, 0);
313         if (error)
314                 goto error0;
315         /*
316          * Invalidate the block(s).
317          */
318         for (i = 0; i < nmaps; i++) {
319                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
320                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
321                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
322                 if (!bp) {
323                         error = ENOMEM;
324                         goto error1;
325                 }
326                 xfs_trans_binval(tp, bp);
327         }
328         /*
329          * Unmap the dead block(s) to the free_list.
330          */
331         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
332                         &first_block, &free_list, &done)))
333                 goto error1;
334         ASSERT(done);
335         /*
336          * Commit the first transaction.  This logs the EFI and the inode.
337          */
338         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
339                 goto error1;
340         /*
341          * The transaction must have been committed, since there were
342          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
343          * The new tp has the extent freeing and EFDs.
344          */
345         ASSERT(committed);
346         /*
347          * The first xact was committed, so add the inode to the new one.
348          * Mark it dirty so it will be logged and moved forward in the log as
349          * part of every commit.
350          */
351         xfs_trans_ijoin(tp, ip, 0);
352         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
353         /*
354          * Get a new, empty transaction to return to our caller.
355          */
356         ntp = xfs_trans_dup(tp);
357         /*
358          * Commit the transaction containing extent freeing and EFDs.
359          * If we get an error on the commit here or on the reserve below,
360          * we need to unlock the inode since the new transaction doesn't
361          * have the inode attached.
362          */
363         error = xfs_trans_commit(tp, 0);
364         tp = ntp;
365         if (error) {
366                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
367                 goto error0;
368         }
369         /*
370          * transaction commit worked ok so we can drop the extra ticket
371          * reference that we gained in xfs_trans_dup()
372          */
373         xfs_log_ticket_put(tp->t_ticket);
374
375         /*
376          * Remove the memory for extent descriptions (just bookkeeping).
377          */
378         if (ip->i_df.if_bytes)
379                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
380         ASSERT(ip->i_df.if_bytes == 0);
381         /*
382          * Put an itruncate log reservation in the new transaction
383          * for our caller.
384          */
385         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
386                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
387                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
388                 goto error0;
389         }
390         /*
391          * Return with the inode locked but not joined to the transaction.
392          */
393         *tpp = tp;
394         return 0;
395
396  error1:
397         xfs_bmap_cancel(&free_list);
398  error0:
399         /*
400          * Have to come here with the inode locked and either
401          * (held and in the transaction) or (not in the transaction).
402          * If the inode isn't held then cancel would iput it, but
403          * that's wrong since this is inactive and the vnode ref
404          * count is 0 already.
405          * Cancel won't do anything to the inode if held, but it still
406          * needs to be locked until the cancel is done, if it was
407          * joined to the transaction.
408          */
409         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
410         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
411         *tpp = NULL;
412         return error;
413
414 }
415
416 STATIC int
417 xfs_inactive_symlink_local(
418         xfs_inode_t     *ip,
419         xfs_trans_t     **tpp)
420 {
421         int             error;
422
423         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
424         /*
425          * We're freeing a symlink which fit into
426          * the inode.  Just free the memory used
427          * to hold the old symlink.
428          */
429         error = xfs_trans_reserve(*tpp, 0,
430                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
431                                   0, XFS_TRANS_PERM_LOG_RES,
432                                   XFS_ITRUNCATE_LOG_COUNT);
433
434         if (error) {
435                 xfs_trans_cancel(*tpp, 0);
436                 *tpp = NULL;
437                 return error;
438         }
439         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
440
441         /*
442          * Zero length symlinks _can_ exist.
443          */
444         if (ip->i_df.if_bytes > 0) {
445                 xfs_idata_realloc(ip,
446                                   -(ip->i_df.if_bytes),
447                                   XFS_DATA_FORK);
448                 ASSERT(ip->i_df.if_bytes == 0);
449         }
450         return 0;
451 }
452
453 STATIC int
454 xfs_inactive_attrs(
455         xfs_inode_t     *ip,
456         xfs_trans_t     **tpp)
457 {
458         xfs_trans_t     *tp;
459         int             error;
460         xfs_mount_t     *mp;
461
462         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
463         tp = *tpp;
464         mp = ip->i_mount;
465         ASSERT(ip->i_d.di_forkoff != 0);
466         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
467         xfs_iunlock(ip, XFS_ILOCK_EXCL);
468         if (error)
469                 goto error_unlock;
470
471         error = xfs_attr_inactive(ip);
472         if (error)
473                 goto error_unlock;
474
475         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
476         error = xfs_trans_reserve(tp, 0,
477                                   XFS_IFREE_LOG_RES(mp),
478                                   0, XFS_TRANS_PERM_LOG_RES,
479                                   XFS_INACTIVE_LOG_COUNT);
480         if (error)
481                 goto error_cancel;
482
483         xfs_ilock(ip, XFS_ILOCK_EXCL);
484         xfs_trans_ijoin(tp, ip, 0);
485         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
486
487         ASSERT(ip->i_d.di_anextents == 0);
488
489         *tpp = tp;
490         return 0;
491
492 error_cancel:
493         ASSERT(XFS_FORCED_SHUTDOWN(mp));
494         xfs_trans_cancel(tp, 0);
495 error_unlock:
496         *tpp = NULL;
497         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
498         return error;
499 }
500
501 int
502 xfs_release(
503         xfs_inode_t     *ip)
504 {
505         xfs_mount_t     *mp = ip->i_mount;
506         int             error;
507
508         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
509                 return 0;
510
511         /* If this is a read-only mount, don't do this (would generate I/O) */
512         if (mp->m_flags & XFS_MOUNT_RDONLY)
513                 return 0;
514
515         if (!XFS_FORCED_SHUTDOWN(mp)) {
516                 int truncated;
517
518                 /*
519                  * If we are using filestreams, and we have an unlinked
520                  * file that we are processing the last close on, then nothing
521                  * will be able to reopen and write to this file. Purge this
522                  * inode from the filestreams cache so that it doesn't delay
523                  * teardown of the inode.
524                  */
525                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
526                         xfs_filestream_deassociate(ip);
527
528                 /*
529                  * If we previously truncated this file and removed old data
530                  * in the process, we want to initiate "early" writeout on
531                  * the last close.  This is an attempt to combat the notorious
532                  * NULL files problem which is particularly noticeable from a
533                  * truncate down, buffered (re-)write (delalloc), followed by
534                  * a crash.  What we are effectively doing here is
535                  * significantly reducing the time window where we'd otherwise
536                  * be exposed to that problem.
537                  */
538                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
539                 if (truncated) {
540                         xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
541                         if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
542                                 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
543                 }
544         }
545
546         if (ip->i_d.di_nlink == 0)
547                 return 0;
548
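            /*
             * Speculative preallocation beyond EOF is only trimmed here for
             * plain regular files that have (or may have) blocks; inodes
             * flagged PREALLOC or APPEND keep their space.
             */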
549         if ((S_ISREG(ip->i_d.di_mode) &&
550              (VFS_I(ip)->i_size > 0 ||
551               (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
552              (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
553             (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
554
555                 /*
556                  * If we can't get the iolock just skip truncating the blocks
557                  * past EOF because we could deadlock with the mmap_sem
558                  * otherwise.  We'll get another chance to drop them once the
559                  * last reference to the inode is dropped, so we'll never leak
560                  * blocks permanently.
561                  *
562  * Further, if the inode is being opened, written and closed
563  * frequently while it has delayed allocation blocks
564  * outstanding (e.g. streaming writes from the NFS server),
565  * truncating the blocks past EOF will cause fragmentation
566  * to occur.
567                  *
568                  * In this case don't do the truncation, either, but we have to
569                  * be careful how we detect this case. Blocks beyond EOF show
570                  * up as i_delayed_blks even when the inode is clean, so we
571                  * need to truncate them away first before checking for a dirty
572                  * release. Hence on the first dirty close we will still remove
573                  * the speculative allocation, but after that we will leave it
574                  * in place.
575                  */
576                 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
577                         return 0;
578
579                 error = xfs_free_eofblocks(mp, ip,
580                                            XFS_FREE_EOF_TRYLOCK);
581                 if (error)
582                         return error;
583
584                 /* delalloc blocks after truncation means it really is dirty */
585                 if (ip->i_delayed_blks)
586                         xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
587         }
588         return 0;
589 }
590
591 /*
592  * xfs_inactive
593  *
594  * This is called when the vnode reference count for the vnode
595  * goes to zero.  If the file has been unlinked, then it must
596  * now be truncated.  Also, we clear all of the read-ahead state
597  * kept for the inode here since the file is now closed.
598  */
599 int
600 xfs_inactive(
601         xfs_inode_t     *ip)
602 {
603         xfs_bmap_free_t free_list;
604         xfs_fsblock_t   first_block;
605         int             committed;
606         xfs_trans_t     *tp;
607         xfs_mount_t     *mp;
608         int             error;
609         int             truncate;
610
611         /*
612          * If the inode is already free, then there can be nothing
613          * to clean up here.
614          */
615         if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
616                 ASSERT(ip->i_df.if_real_bytes == 0);
617                 ASSERT(ip->i_df.if_broot_bytes == 0);
618                 return VN_INACTIVE_CACHE;
619         }
620
621         /*
622          * Only do a truncate if it's a regular file with
623          * some actual space in it.  It's OK to look at the
624          * inode's fields without the lock because we're the
625          * only one with a reference to the inode.
626          */
627         truncate = ((ip->i_d.di_nlink == 0) &&
628             ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
629              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
630             S_ISREG(ip->i_d.di_mode));
631
632         mp = ip->i_mount;
633
634         error = 0;
635
636         /* If this is a read-only mount, don't do this (would generate I/O) */
637         if (mp->m_flags & XFS_MOUNT_RDONLY)
638                 goto out;
639
640         if (ip->i_d.di_nlink != 0) {
641                 if ((S_ISREG(ip->i_d.di_mode) &&
642                     (VFS_I(ip)->i_size > 0 ||
643                      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
644                     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
645                     (!(ip->i_d.di_flags &
646                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
647                      ip->i_delayed_blks != 0))) {
648                         error = xfs_free_eofblocks(mp, ip, 0);
649                         if (error)
650                                 return VN_INACTIVE_CACHE;
651                 }
652                 goto out;
653         }
654
655         ASSERT(ip->i_d.di_nlink == 0);
656
657         error = xfs_qm_dqattach(ip, 0);
658         if (error)
659                 return VN_INACTIVE_CACHE;
660
661         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
662         if (truncate) {
663                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
664
665                 error = xfs_trans_reserve(tp, 0,
666                                           XFS_ITRUNCATE_LOG_RES(mp),
667                                           0, XFS_TRANS_PERM_LOG_RES,
668                                           XFS_ITRUNCATE_LOG_COUNT);
669                 if (error) {
670                         /* Don't call itruncate_cleanup */
671                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
672                         xfs_trans_cancel(tp, 0);
673                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
674                         return VN_INACTIVE_CACHE;
675                 }
676
677                 xfs_ilock(ip, XFS_ILOCK_EXCL);
678                 xfs_trans_ijoin(tp, ip, 0);
679
680                 ip->i_d.di_size = 0;
681                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
682
683                 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
684                 if (error) {
685                         xfs_trans_cancel(tp,
686                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
687                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
688                         return VN_INACTIVE_CACHE;
689                 }
690
691                 ASSERT(ip->i_d.di_nextents == 0);
692         } else if (S_ISLNK(ip->i_d.di_mode)) {
693
694                 /*
695                  * If we get an error while cleaning up a
696                  * symlink we bail out.
697                  */
698                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
699                         xfs_inactive_symlink_rmt(ip, &tp) :
700                         xfs_inactive_symlink_local(ip, &tp);
701
702                 if (error) {
703                         ASSERT(tp == NULL);
704                         return VN_INACTIVE_CACHE;
705                 }
706
707                 xfs_trans_ijoin(tp, ip, 0);
708         } else {
709                 error = xfs_trans_reserve(tp, 0,
710                                           XFS_IFREE_LOG_RES(mp),
711                                           0, XFS_TRANS_PERM_LOG_RES,
712                                           XFS_INACTIVE_LOG_COUNT);
713                 if (error) {
714                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
715                         xfs_trans_cancel(tp, 0);
716                         return VN_INACTIVE_CACHE;
717                 }
718
719                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
720                 xfs_trans_ijoin(tp, ip, 0);
721         }
722
723         /*
724          * If there are attributes associated with the file
725          * then blow them away now.  The code calls a routine
726          * that recursively deconstructs the attribute fork.
727          * We need to just commit the current transaction
728          * because we can't use it for xfs_attr_inactive().
729          */
730         if (ip->i_d.di_anextents > 0) {
731                 error = xfs_inactive_attrs(ip, &tp);
732                 /*
733                  * If we got an error, the transaction is already
734                  * cancelled, and the inode is unlocked. Just get out.
735                  */
736                  if (error)
737                          return VN_INACTIVE_CACHE;
738         } else if (ip->i_afp) {
739                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
740         }
741
742         /*
743          * Free the inode.
744          */
745         xfs_bmap_init(&free_list, &first_block);
746         error = xfs_ifree(tp, ip, &free_list);
747         if (error) {
748                 /*
749                  * If we fail to free the inode, shut down.  The cancel
750                  * might do that, we need to make sure.  Otherwise the
751                  * inode might be lost for a long time or forever.
752                  */
753                 if (!XFS_FORCED_SHUTDOWN(mp)) {
754                         xfs_notice(mp, "%s: xfs_ifree returned error %d",
755                                 __func__, error);
756                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
757                 }
758                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
759         } else {
760                 /*
761                  * Credit the quota account(s). The inode is gone.
762                  */
763                 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
764
765                 /*
766                  * Just ignore errors at this point.  There is nothing we can
767                  * do except to try to keep going. Make sure it's not a silent
768                  * error.
769                  */
770                 error = xfs_bmap_finish(&tp,  &free_list, &committed);
771                 if (error)
772                         xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
773                                 __func__, error);
774                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
775                 if (error)
776                         xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
777                                 __func__, error);
778         }
779
780         /*
781          * Release the dquots held by inode, if any.
782          */
783         xfs_qm_dqdetach(ip);
784         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
785
786  out:
787         return VN_INACTIVE_CACHE;
788 }
789
790 /*
791  * Looks up an inode from "name". If ci_name is not NULL, then a CI match
792  * is allowed, otherwise it has to be an exact match. If a CI match is found,
793  * ci_name->name will point to the actual name (caller must free) or
794  * will be set to NULL if an exact match is found.
795  */
796 int
797 xfs_lookup(
798         xfs_inode_t             *dp,
799         struct xfs_name         *name,
800         xfs_inode_t             **ipp,
801         struct xfs_name         *ci_name)
802 {
803         xfs_ino_t               inum;
804         int                     error;
805         uint                    lock_mode;
806
807         trace_xfs_lookup(dp, name);
808
809         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
810                 return XFS_ERROR(EIO);
811
812         lock_mode = xfs_ilock_map_shared(dp);
813         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
814         xfs_iunlock_map_shared(dp, lock_mode);
815
816         if (error)
817                 goto out;
818
819         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
820         if (error)
821                 goto out_free_name;
822
823         return 0;
824
825 out_free_name:
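            /*
             * A case-insensitive match may have allocated ci_name->name for
             * the caller; it must be freed if we cannot return the inode.
             */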
826         if (ci_name)
827                 kmem_free(ci_name->name);
828 out:
829         *ipp = NULL;
830         return error;
831 }
832
833 int
834 xfs_create(
835         xfs_inode_t             *dp,
836         struct xfs_name         *name,
837         umode_t                 mode,
838         xfs_dev_t               rdev,
839         xfs_inode_t             **ipp)
840 {
841         int                     is_dir = S_ISDIR(mode);
842         struct xfs_mount        *mp = dp->i_mount;
843         struct xfs_inode        *ip = NULL;
844         struct xfs_trans        *tp = NULL;
845         int                     error;
846         xfs_bmap_free_t         free_list;
847         xfs_fsblock_t           first_block;
848         boolean_t               unlock_dp_on_error = B_FALSE;
849         uint                    cancel_flags;
850         int                     committed;
851         prid_t                  prid;
852         struct xfs_dquot        *udqp = NULL;
853         struct xfs_dquot        *gdqp = NULL;
854         uint                    resblks;
855         uint                    log_res;
856         uint                    log_count;
857
858         trace_xfs_create(dp, name);
859
860         if (XFS_FORCED_SHUTDOWN(mp))
861                 return XFS_ERROR(EIO);
862
863         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
864                 prid = xfs_get_projid(dp);
865         else
866                 prid = XFS_PROJID_DEFAULT;
867
868         /*
869          * Make sure that we have allocated dquot(s) on disk.
870          */
871         error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
872                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
873         if (error)
874                 return error;
875
876         if (is_dir) {
877                 rdev = 0;
878                 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
879                 log_res = XFS_MKDIR_LOG_RES(mp);
880                 log_count = XFS_MKDIR_LOG_COUNT;
881                 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
882         } else {
883                 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
884                 log_res = XFS_CREATE_LOG_RES(mp);
885                 log_count = XFS_CREATE_LOG_COUNT;
886                 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
887         }
888
889         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
890
891         /*
892          * Initially assume that the file does not exist and
893          * reserve the resources for that case.  If that is not
894          * the case we'll drop the one we have and get a more
895          * appropriate transaction later.
896          */
897         error = xfs_trans_reserve(tp, resblks, log_res, 0,
898                         XFS_TRANS_PERM_LOG_RES, log_count);
899         if (error == ENOSPC) {
900                 /* flush outstanding delalloc blocks and retry */
901                 xfs_flush_inodes(dp);
902                 error = xfs_trans_reserve(tp, resblks, log_res, 0,
903                                 XFS_TRANS_PERM_LOG_RES, log_count);
904         }
905         if (error == ENOSPC) {
906                 /* No space at all so try a "no-allocation" reservation */
907                 resblks = 0;
908                 error = xfs_trans_reserve(tp, 0, log_res, 0,
909                                 XFS_TRANS_PERM_LOG_RES, log_count);
910         }
911         if (error) {
912                 cancel_flags = 0;
913                 goto out_trans_cancel;
914         }
915
916         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
917         unlock_dp_on_error = B_TRUE;
918
919         /*
920          * Check for directory link count overflow.
921          */
922         if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
923                 error = XFS_ERROR(EMLINK);
924                 goto out_trans_cancel;
925         }
926
927         xfs_bmap_init(&free_list, &first_block);
928
929         /*
930          * Reserve disk quota and the inode.
931          */
932         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
933         if (error)
934                 goto out_trans_cancel;
935
936         error = xfs_dir_canenter(tp, dp, name, resblks);
937         if (error)
938                 goto out_trans_cancel;
939
940         /*
941          * A newly created regular or special file just has one directory
942  * entry pointing to it, but a directory also has the "." entry
943          * pointing to itself.
944          */
945         error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
946                                prid, resblks > 0, &ip, &committed);
947         if (error) {
948                 if (error == ENOSPC)
949                         goto out_trans_cancel;
950                 goto out_trans_abort;
951         }
952
953         /*
954          * Now we join the directory inode to the transaction.  We do not do it
955          * earlier because xfs_dir_ialloc might commit the previous transaction
956          * (and release all the locks).  An error from here on will result in
957          * the transaction cancel unlocking dp so don't do it explicitly in the
958          * error path.
959          */
960         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
961         unlock_dp_on_error = B_FALSE;
962
963         error = xfs_dir_createname(tp, dp, name, ip->i_ino,
964                                         &first_block, &free_list, resblks ?
965                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
966         if (error) {
967                 ASSERT(error != ENOSPC);
968                 goto out_trans_abort;
969         }
970         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
971         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
972
973         if (is_dir) {
974                 error = xfs_dir_init(tp, ip, dp);
975                 if (error)
976                         goto out_bmap_cancel;
977
978                 error = xfs_bumplink(tp, dp);
979                 if (error)
980                         goto out_bmap_cancel;
981         }
982
983         /*
984          * If this is a synchronous mount, make sure that the
985          * create transaction goes to disk before returning to
986          * the user.
987          */
988         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
989                 xfs_trans_set_sync(tp);
990
991         /*
992          * Attach the dquot(s) to the inodes and modify them incore.
993          * These ids of the inode couldn't have changed since the new
994          * inode has been locked ever since it was created.
995          */
996         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
997
998         error = xfs_bmap_finish(&tp, &free_list, &committed);
999         if (error)
1000                 goto out_bmap_cancel;
1001
1002         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1003         if (error)
1004                 goto out_release_inode;
1005
1006         xfs_qm_dqrele(udqp);
1007         xfs_qm_dqrele(gdqp);
1008
1009         *ipp = ip;
1010         return 0;
1011
1012  out_bmap_cancel:
1013         xfs_bmap_cancel(&free_list);
1014  out_trans_abort:
1015         cancel_flags |= XFS_TRANS_ABORT;
1016  out_trans_cancel:
1017         xfs_trans_cancel(tp, cancel_flags);
1018  out_release_inode:
1019         /*
1020          * Wait until after the current transaction is aborted to
1021          * release the inode.  This prevents recursive transactions
1022          * and deadlocks from xfs_inactive.
1023          */
1024         if (ip)
1025                 IRELE(ip);
1026
1027         xfs_qm_dqrele(udqp);
1028         xfs_qm_dqrele(gdqp);
1029
1030         if (unlock_dp_on_error)
1031                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1032         return error;
1033 }
1034
1035 #ifdef DEBUG
1036 int xfs_locked_n;
1037 int xfs_small_retries;
1038 int xfs_middle_retries;
1039 int xfs_lots_retries;
1040 int xfs_lock_delays;
1041 #endif
1042
1043 /*
1044  * Bump the subclass so xfs_lock_inodes() acquires each lock with
1045  * a different value
1046  */
1047 static inline int
1048 xfs_lock_inumorder(int lock_mode, int subclass)
1049 {
1050         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1051                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
1052         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
1053                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
1054
1055         return lock_mode;
1056 }
1057
1058 /*
1059  * The following routine will lock n inodes in exclusive mode.
1060  * We assume the caller calls us with the inodes in i_ino order.
1061  *
1062  * We need to detect deadlock where an inode that we lock
1063  * is in the AIL and we start waiting for another inode that is locked
1064  * by a thread in a long running transaction (such as truncate). This can
1065  * result in deadlock since the long running trans might need to wait
1066  * for the inode we just locked in order to push the tail and free space
1067  * in the log.
1068  */
1069 void
1070 xfs_lock_inodes(
1071         xfs_inode_t     **ips,
1072         int             inodes,
1073         uint            lock_mode)
1074 {
1075         int             attempts = 0, i, j, try_lock;
1076         xfs_log_item_t  *lp;
1077
1078         ASSERT(ips && (inodes >= 2)); /* we need at least two */
1079
1080         try_lock = 0;
1081         i = 0;
1082
1083 again:
1084         for (; i < inodes; i++) {
1085                 ASSERT(ips[i]);
1086
1087                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
1088                         continue;
1089
1090                 /*
1091  * If try_lock is not set yet, check whether any of the
1092  * inodes we have already locked is in the AIL.
1093  * If any are, set try_lock to be used later.
1094                  */
1095
1096                 if (!try_lock) {
1097                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
1098                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
1099                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1100                                         try_lock++;
1101                                 }
1102                         }
1103                 }
1104
1105                 /*
1106                  * If any of the previous locks we have locked is in the AIL,
1107                  * we must TRY to get the second and subsequent locks. If
1108                  * we can't get any, we must release all we have
1109                  * and try again.
1110                  */
1111
1112                 if (try_lock) {
1113                         /* try_lock must be 0 if i is 0. */
1114                         /*
1115                          * try_lock means we have an inode locked
1116                          * that is in the AIL.
1117                          */
1118                         ASSERT(i != 0);
1119                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
1120                                 attempts++;
1121
1122                                 /*
1123                                  * Unlock all previous guys and try again.
1124                                  * xfs_iunlock will try to push the tail
1125                                  * if the inode is in the AIL.
1126                                  */
1127
1128                                 for(j = i - 1; j >= 0; j--) {
1129
1130                                         /*
1131                                          * Check to see if we've already
1132                                          * unlocked this one.
1133                                          * Not the first one going back,
1134                                          * and the inode ptr is the same.
1135                                          */
1136                                         if ((j != (i - 1)) && ips[j] ==
1137                                                                 ips[j+1])
1138                                                 continue;
1139
1140                                         xfs_iunlock(ips[j], lock_mode);
1141                                 }
1142
1143                                 if ((attempts % 5) == 0) {
1144                                         delay(1); /* Don't just spin the CPU */
1145 #ifdef DEBUG
1146                                         xfs_lock_delays++;
1147 #endif
1148                                 }
1149                                 i = 0;
1150                                 try_lock = 0;
1151                                 goto again;
1152                         }
1153                 } else {
1154                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
1155                 }
1156         }
1157
1158 #ifdef DEBUG
1159         if (attempts) {
1160                 if (attempts < 5) xfs_small_retries++;
1161                 else if (attempts < 100) xfs_middle_retries++;
1162                 else xfs_lots_retries++;
1163         } else {
1164                 xfs_locked_n++;
1165         }
1166 #endif
1167 }
1168
1169 /*
1170  * xfs_lock_two_inodes() can only be used to lock one type of lock
1171  * at a time - the iolock or the ilock, but not both at once. If
1172  * we lock both at once, lockdep will report false positives saying
1173  * we have violated locking orders.
1174  */
1175 void
1176 xfs_lock_two_inodes(
1177         xfs_inode_t             *ip0,
1178         xfs_inode_t             *ip1,
1179         uint                    lock_mode)
1180 {
1181         xfs_inode_t             *temp;
1182         int                     attempts = 0;
1183         xfs_log_item_t          *lp;
1184
1185         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1186                 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
1187         ASSERT(ip0->i_ino != ip1->i_ino);
1188
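            /*
             * Sort the inodes so that the lower-numbered one is locked
             * first; locks are always taken in ascending inode number order.
             */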
1189         if (ip0->i_ino > ip1->i_ino) {
1190                 temp = ip0;
1191                 ip0 = ip1;
1192                 ip1 = temp;
1193         }
1194
1195  again:
1196         xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
1197
1198         /*
1199          * If the first lock we have locked is in the AIL, we must TRY to get
1200          * the second lock. If we can't get it, we must release the first one
1201          * and try again.
1202          */
1203         lp = (xfs_log_item_t *)ip0->i_itemp;
1204         if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1205                 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
1206                         xfs_iunlock(ip0, lock_mode);
1207                         if ((++attempts % 5) == 0)
1208                                 delay(1); /* Don't just spin the CPU */
1209                         goto again;
1210                 }
1211         } else {
1212                 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
1213         }
1214 }
1215
1216 int
1217 xfs_remove(
1218         xfs_inode_t             *dp,
1219         struct xfs_name         *name,
1220         xfs_inode_t             *ip)
1221 {
1222         xfs_mount_t             *mp = dp->i_mount;
1223         xfs_trans_t             *tp = NULL;
1224         int                     is_dir = S_ISDIR(ip->i_d.di_mode);
1225         int                     error = 0;
1226         xfs_bmap_free_t         free_list;
1227         xfs_fsblock_t           first_block;
1228         int                     cancel_flags;
1229         int                     committed;
1230         int                     link_zero;
1231         uint                    resblks;
1232         uint                    log_count;
1233
1234         trace_xfs_remove(dp, name);
1235
1236         if (XFS_FORCED_SHUTDOWN(mp))
1237                 return XFS_ERROR(EIO);
1238
1239         error = xfs_qm_dqattach(dp, 0);
1240         if (error)
1241                 goto std_return;
1242
1243         error = xfs_qm_dqattach(ip, 0);
1244         if (error)
1245                 goto std_return;
1246
1247         if (is_dir) {
1248                 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
1249                 log_count = XFS_DEFAULT_LOG_COUNT;
1250         } else {
1251                 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
1252                 log_count = XFS_REMOVE_LOG_COUNT;
1253         }
1254         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1255
1256         /*
1257          * We try to get the real space reservation first,
1258          * allowing for directory btree deletion(s) implying
1259          * possible bmap insert(s).  If we can't get the space
1260          * reservation then we use 0 instead, and avoid the bmap
1261  * btree insert(s) in the directory code: if a bmap
1262  * insert would otherwise be needed, the LAST block of
1263  * the directory is trimmed instead.
1264          */
1265         resblks = XFS_REMOVE_SPACE_RES(mp);
1266         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
1267                                   XFS_TRANS_PERM_LOG_RES, log_count);
1268         if (error == ENOSPC) {
1269                 resblks = 0;
1270                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
1271                                           XFS_TRANS_PERM_LOG_RES, log_count);
1272         }
1273         if (error) {
1274                 ASSERT(error != ENOSPC);
1275                 cancel_flags = 0;
1276                 goto out_trans_cancel;
1277         }
1278
1279         xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
1280
1281         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1282         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1283
1284         /*
1285          * If we're removing a directory perform some additional validation.
1286          */
1287         if (is_dir) {
1288                 ASSERT(ip->i_d.di_nlink >= 2);
1289                 if (ip->i_d.di_nlink != 2) {
1290                         error = XFS_ERROR(ENOTEMPTY);
1291                         goto out_trans_cancel;
1292                 }
1293                 if (!xfs_dir_isempty(ip)) {
1294                         error = XFS_ERROR(ENOTEMPTY);
1295                         goto out_trans_cancel;
1296                 }
1297         }
1298
1299         xfs_bmap_init(&free_list, &first_block);
1300         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
1301                                         &first_block, &free_list, resblks);
1302         if (error) {
1303                 ASSERT(error != ENOENT);
1304                 goto out_bmap_cancel;
1305         }
1306         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1307
1308         if (is_dir) {
1309                 /*
1310                  * Drop the link from ip's "..".
1311                  */
1312                 error = xfs_droplink(tp, dp);
1313                 if (error)
1314                         goto out_bmap_cancel;
1315
1316                 /*
1317                  * Drop the "." link from ip to self.
1318                  */
1319                 error = xfs_droplink(tp, ip);
1320                 if (error)
1321                         goto out_bmap_cancel;
1322         } else {
1323                 /*
1324                  * When removing a non-directory we need to log the parent
1325                  * inode here.  For a directory this is done implicitly
1326                  * by the xfs_droplink call for the ".." entry.
1327                  */
1328                 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1329         }
1330
1331         /*
1332          * Drop the link from dp to ip.
1333          */
1334         error = xfs_droplink(tp, ip);
1335         if (error)
1336                 goto out_bmap_cancel;
1337
1338         /*
1339          * Determine if this is the last link while
1340          * we are in the transaction.
1341          */
1342         link_zero = (ip->i_d.di_nlink == 0);
1343
1344         /*
1345          * If this is a synchronous mount, make sure that the
1346          * remove transaction goes to disk before returning to
1347          * the user.
1348          */
1349         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1350                 xfs_trans_set_sync(tp);
1351
1352         error = xfs_bmap_finish(&tp, &free_list, &committed);
1353         if (error)
1354                 goto out_bmap_cancel;
1355
1356         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1357         if (error)
1358                 goto std_return;
1359
1360         /*
1361          * If we are using filestreams, kill the stream association.
1362          * If the file is still open it may get a new one but that
1363  * will get killed on last close in xfs_release() so we don't
1364          * have to worry about that.
1365          */
1366         if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
1367                 xfs_filestream_deassociate(ip);
1368
1369         return 0;
1370
1371  out_bmap_cancel:
1372         xfs_bmap_cancel(&free_list);
1373         cancel_flags |= XFS_TRANS_ABORT;
1374  out_trans_cancel:
1375         xfs_trans_cancel(tp, cancel_flags);
1376  std_return:
1377         return error;
1378 }
1379
1380 int
1381 xfs_link(
1382         xfs_inode_t             *tdp,
1383         xfs_inode_t             *sip,
1384         struct xfs_name         *target_name)
1385 {
1386         xfs_mount_t             *mp = tdp->i_mount;
1387         xfs_trans_t             *tp;
1388         int                     error;
1389         xfs_bmap_free_t         free_list;
1390         xfs_fsblock_t           first_block;
1391         int                     cancel_flags;
1392         int                     committed;
1393         int                     resblks;
1394
1395         trace_xfs_link(tdp, target_name);
1396
1397         ASSERT(!S_ISDIR(sip->i_d.di_mode));
1398
1399         if (XFS_FORCED_SHUTDOWN(mp))
1400                 return XFS_ERROR(EIO);
1401
1402         error = xfs_qm_dqattach(sip, 0);
1403         if (error)
1404                 goto std_return;
1405
1406         error = xfs_qm_dqattach(tdp, 0);
1407         if (error)
1408                 goto std_return;
1409
1410         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1411         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1412         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1413         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
1414                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1415         if (error == ENOSPC) {
1416                 resblks = 0;
1417                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
1418                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
1419         }
1420         if (error) {
1421                 cancel_flags = 0;
1422                 goto error_return;
1423         }
1424
1425         xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1426
1427         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1428         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1429
1430         /*
1431          * If the source has too many links, we can't make any more to it.
1432          */
1433         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
1434                 error = XFS_ERROR(EMLINK);
1435                 goto error_return;
1436         }
1437
1438         /*
1439          * If we are using project inheritance, we only allow hard link
1440          * creation in our tree when the project IDs are the same; else
1441          * the tree quota mechanism could be circumvented.
1442          */
1443         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1444                      (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1445                 error = XFS_ERROR(EXDEV);
1446                 goto error_return;
1447         }
1448
1449         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1450         if (error)
1451                 goto error_return;
1452
1453         xfs_bmap_init(&free_list, &first_block);
1454
1455         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1456                                         &first_block, &free_list, resblks);
1457         if (error)
1458                 goto abort_return;
1459         xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1460         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1461
1462         error = xfs_bumplink(tp, sip);
1463         if (error)
1464                 goto abort_return;
1465
1466         /*
1467          * If this is a synchronous mount, make sure that the
1468          * link transaction goes to disk before returning to
1469          * the user.
1470          */
1471         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1472                 xfs_trans_set_sync(tp);
1473         }
1474
1475         error = xfs_bmap_finish(&tp, &free_list, &committed);
1476         if (error) {
1477                 xfs_bmap_cancel(&free_list);
1478                 goto abort_return;
1479         }
1480
1481         return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1482
1483  abort_return:
1484         cancel_flags |= XFS_TRANS_ABORT;
1485  error_return:
1486         xfs_trans_cancel(tp, cancel_flags);
1487  std_return:
1488         return error;
1489 }
1490
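     /*
      * Editor's note (not part of the original source): xfs_link() is the
      * backend for the VFS ->link operation.  In this tree, xfs_vn_link() in
      * fs/xfs/xfs_iops.c builds the struct xfs_name from the dentry and calls
      * xfs_link(XFS_I(dir), XFS_I(inode), &name), then publishes the new link
      * with ihold()/d_instantiate() on success.
      */
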
1491 int
1492 xfs_symlink(
1493         xfs_inode_t             *dp,
1494         struct xfs_name         *link_name,
1495         const char              *target_path,
1496         umode_t                 mode,
1497         xfs_inode_t             **ipp)
1498 {
1499         xfs_mount_t             *mp = dp->i_mount;
1500         xfs_trans_t             *tp;
1501         xfs_inode_t             *ip;
1502         int                     error;
1503         int                     pathlen;
1504         xfs_bmap_free_t         free_list;
1505         xfs_fsblock_t           first_block;
1506         boolean_t               unlock_dp_on_error = B_FALSE;
1507         uint                    cancel_flags;
1508         int                     committed;
1509         xfs_fileoff_t           first_fsb;
1510         xfs_filblks_t           fs_blocks;
1511         int                     nmaps;
1512         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
1513         xfs_daddr_t             d;
1514         const char              *cur_chunk;
1515         int                     byte_cnt;
1516         int                     n;
1517         xfs_buf_t               *bp;
1518         prid_t                  prid;
1519         struct xfs_dquot        *udqp, *gdqp;
1520         uint                    resblks;
1521
1522         *ipp = NULL;
1523         error = 0;
1524         ip = NULL;
1525         tp = NULL;
1526
1527         trace_xfs_symlink(dp, link_name);
1528
1529         if (XFS_FORCED_SHUTDOWN(mp))
1530                 return XFS_ERROR(EIO);
1531
1532         /*
1533          * Check the total length of the target path name.
1534          */
1535         pathlen = strlen(target_path);
1536         if (pathlen >= MAXPATHLEN)      /* total string too long */
1537                 return XFS_ERROR(ENAMETOOLONG);
1538
1539         udqp = gdqp = NULL;
1540         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1541                 prid = xfs_get_projid(dp);
1542         else
1543                 prid = XFS_PROJID_DEFAULT;
1544
1545         /*
1546          * Make sure that we have allocated dquot(s) on disk.
1547          */
1548         error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1549                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1550         if (error)
1551                 goto std_return;
1552
1553         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
1554         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1555         /*
1556          * Check whether the symlink will fit into the inode data fork.
1557          * There can't be any attributes yet, so we get the whole variable part.
1558          */
1559         if (pathlen <= XFS_LITINO(mp))
1560                 fs_blocks = 0;
1561         else
1562                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
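             /*
              * Editor's note: with the default 256-byte V4 inodes the literal
              * area XFS_LITINO() works out to 156 bytes (256 minus the
              * 100-byte on-disk inode header), so target paths up to that
              * length need no data blocks at all and are written inline
              * further below.
              */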
1563         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
1564         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
1565                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
1566         if (error == ENOSPC && fs_blocks == 0) {
1567                 resblks = 0;
1568                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
1569                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
1570         }
1571         if (error) {
1572                 cancel_flags = 0;
1573                 goto error_return;
1574         }
1575
1576         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1577         unlock_dp_on_error = B_TRUE;
1578
1579         /*
1580          * Check whether the directory allows new symlinks or not.
1581          */
1582         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
1583                 error = XFS_ERROR(EPERM);
1584                 goto error_return;
1585         }
1586
1587         /*
1588          * Reserve disk quota: blocks and inode.
1589          */
1590         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
1591         if (error)
1592                 goto error_return;
1593
1594         /*
1595          * Check for ability to enter directory entry, if no space reserved.
1596          */
1597         error = xfs_dir_canenter(tp, dp, link_name, resblks);
1598         if (error)
1599                 goto error_return;
1600         /*
1601          * Initialize the bmap freelist prior to calling either
1602          * bmapi or the directory create code.
1603          */
1604         xfs_bmap_init(&free_list, &first_block);
1605
1606         /*
1607          * Allocate an inode for the symlink.
1608          */
1609         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
1610                                prid, resblks > 0, &ip, NULL);
1611         if (error) {
1612                 if (error == ENOSPC)
1613                         goto error_return;
1614                 goto error1;
1615         }
1616
1617         /*
1618          * An error after we've joined dp to the transaction will result in the
1619          * transaction cancel unlocking dp, so don't unlock it explicitly in
1620          * the error path.
1621          */
1622         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1623         unlock_dp_on_error = B_FALSE;
1624
1625         /*
1626          * Also attach the dquot(s) to it, if applicable.
1627          */
1628         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1629
1630         if (resblks)
1631                 resblks -= XFS_IALLOC_SPACE_RES(mp);
1632         /*
1633          * If the symlink will fit into the inode, write it inline.
1634          */
1635         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
1636                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
1637                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
1638                 ip->i_d.di_size = pathlen;
1639
1640                 /*
1641                  * The inode was initially created in extent format.
1642                  */
1643                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
1644                 ip->i_df.if_flags |= XFS_IFINLINE;
1645
1646                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
1647                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
1648
1649         } else {
1650                 first_fsb = 0;
1651                 nmaps = SYMLINK_MAPS;
1652
1653                 error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
1654                                   XFS_BMAPI_METADATA, &first_block, resblks,
1655                                   mval, &nmaps, &free_list);
1656                 if (error)
1657                         goto error2;
1658
1659                 if (resblks)
1660                         resblks -= fs_blocks;
1661                 ip->i_d.di_size = pathlen;
1662                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1663
1664                 cur_chunk = target_path;
1665                 for (n = 0; n < nmaps; n++) {
1666                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1667                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1668                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
1669                                                BTOBB(byte_cnt), 0);
1670                         if (!bp) {
1671                                 error = ENOMEM;
1672                                 goto error2;
1673                         }
1674                         if (pathlen < byte_cnt) {
1675                                 byte_cnt = pathlen;
1676                         }
1677                         pathlen -= byte_cnt;
1678
1679                         memcpy(bp->b_addr, cur_chunk, byte_cnt);
1680                         cur_chunk += byte_cnt;
1681
1682                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
1683                 }
1684         }
1685
1686         /*
1687          * Create the directory entry for the symlink.
1688          */
1689         error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
1690                                         &first_block, &free_list, resblks);
1691         if (error)
1692                 goto error2;
1693         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1694         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1695
1696         /*
1697          * If this is a synchronous mount, make sure that the
1698          * symlink transaction goes to disk before returning to
1699          * the user.
1700          */
1701         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1702                 xfs_trans_set_sync(tp);
1703         }
1704
1705         error = xfs_bmap_finish(&tp, &free_list, &committed);
1706         if (error) {
1707                 goto error2;
1708         }
1709         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1710         xfs_qm_dqrele(udqp);
1711         xfs_qm_dqrele(gdqp);
1712
1713         *ipp = ip;
1714         return 0;
1715
1716  error2:
1717         IRELE(ip);
1718  error1:
1719         xfs_bmap_cancel(&free_list);
1720         cancel_flags |= XFS_TRANS_ABORT;
1721  error_return:
1722         xfs_trans_cancel(tp, cancel_flags);
1723         xfs_qm_dqrele(udqp);
1724         xfs_qm_dqrele(gdqp);
1725
1726         if (unlock_dp_on_error)
1727                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1728  std_return:
1729         return error;
1730 }
1731
1732 int
1733 xfs_set_dmattrs(
1734         xfs_inode_t     *ip,
1735         u_int           evmask,
1736         u_int16_t       state)
1737 {
1738         xfs_mount_t     *mp = ip->i_mount;
1739         xfs_trans_t     *tp;
1740         int             error;
1741
1742         if (!capable(CAP_SYS_ADMIN))
1743                 return XFS_ERROR(EPERM);
1744
1745         if (XFS_FORCED_SHUTDOWN(mp))
1746                 return XFS_ERROR(EIO);
1747
1748         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
1749         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
1750         if (error) {
1751                 xfs_trans_cancel(tp, 0);
1752                 return error;
1753         }
1754         xfs_ilock(ip, XFS_ILOCK_EXCL);
1755         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1756
1757         ip->i_d.di_dmevmask = evmask;
1758         ip->i_d.di_dmstate  = state;
1759
1760         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1761         error = xfs_trans_commit(tp, 0);
1762
1763         return error;
1764 }
1765
1766 /*
1767  * xfs_alloc_file_space()
1768  *      This routine allocates disk space for the given file.
1769  *
1770  *      If alloc_type == 0, this request is for an ALLOCSP type
1771  *      request which will change the file size.  In this case, no
1772  *      DMAPI event will be generated by the call.  A TRUNCATE event
1773  *      will be generated later by xfs_setattr.
1774  *
1775  *      If alloc_type != 0, this request is for a RESVSP type
1776  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
1777  *      lower block boundary byte address is less than the file's
1778  *      length.
1779  *
1780  * RETURNS:
1781  *       0 on success
1782  *      errno on error
1783  *
1784  */
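     /*
      * Editor's illustration (not in the original source): xfs_change_file_space()
      * below calls this as xfs_alloc_file_space(ip, startoffset, bf->l_len,
      * XFS_BMAPI_PREALLOC, ...) for the XFS_IOC_RESVSP/RESVSP64 case, i.e. with
      * alloc_type != 0, while the size-changing XFS_IOC_ALLOCSP path calls it
      * with alloc_type == 0 to fill the gap between the old EOF and the new
      * size before xfs_setattr_size() extends the file.
      */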
1785 STATIC int
1786 xfs_alloc_file_space(
1787         xfs_inode_t             *ip,
1788         xfs_off_t               offset,
1789         xfs_off_t               len,
1790         int                     alloc_type,
1791         int                     attr_flags)
1792 {
1793         xfs_mount_t             *mp = ip->i_mount;
1794         xfs_off_t               count;
1795         xfs_filblks_t           allocated_fsb;
1796         xfs_filblks_t           allocatesize_fsb;
1797         xfs_extlen_t            extsz, temp;
1798         xfs_fileoff_t           startoffset_fsb;
1799         xfs_fsblock_t           firstfsb;
1800         int                     nimaps;
1801         int                     quota_flag;
1802         int                     rt;
1803         xfs_trans_t             *tp;
1804         xfs_bmbt_irec_t         imaps[1], *imapp;
1805         xfs_bmap_free_t         free_list;
1806         uint                    qblocks, resblks, resrtextents;
1807         int                     committed;
1808         int                     error;
1809
1810         trace_xfs_alloc_file_space(ip);
1811
1812         if (XFS_FORCED_SHUTDOWN(mp))
1813                 return XFS_ERROR(EIO);
1814
1815         error = xfs_qm_dqattach(ip, 0);
1816         if (error)
1817                 return error;
1818
1819         if (len <= 0)
1820                 return XFS_ERROR(EINVAL);
1821
1822         rt = XFS_IS_REALTIME_INODE(ip);
1823         extsz = xfs_get_extsz_hint(ip);
1824
1825         count = len;
1826         imapp = &imaps[0];
1827         nimaps = 1;
1828         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1829         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1830
1831         /*
1832          * Allocate file space until done or until there is an error
1833          */
1834         while (allocatesize_fsb && !error) {
1835                 xfs_fileoff_t   s, e;
1836
1837                 /*
1838                  * Determine space reservations for data/realtime.
1839                  */
1840                 if (unlikely(extsz)) {
1841                         s = startoffset_fsb;
1842                         do_div(s, extsz);
1843                         s *= extsz;
1844                         e = startoffset_fsb + allocatesize_fsb;
1845                         if ((temp = do_mod(startoffset_fsb, extsz)))
1846                                 e += temp;
1847                         if ((temp = do_mod(e, extsz)))
1848                                 e += extsz - temp;
1849                 } else {
1850                         s = 0;
1851                         e = allocatesize_fsb;
1852                 }
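                     /*
                      * Editor's worked example: with extsz = 4,
                      * startoffset_fsb = 10 and allocatesize_fsb = 7, s rounds
                      * down to 8 and e rounds up to 20, so the reservation
                      * below is sized for whole extent-size-hint aligned
                      * chunks even though the request itself is unaligned.
                      */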
1853
1854                 /*
1855                  * The transaction reservation is limited to a 32-bit block
1856                  * count, hence we need to limit the number of blocks we are
1857                  * trying to reserve to avoid an overflow. We can't allocate
1858                  * more than @nimaps extents, and an extent is limited on disk
1859                  * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1860                  */
1861                 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
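                     /*
                      * Editor's note: MAXEXTLEN is 2^21 - 1 blocks, so with
                      * nimaps == 1 and 4096-byte blocks a single pass through
                      * this loop reserves and maps at most about 8GB; larger
                      * requests simply take additional iterations of the
                      * surrounding while loop.
                      */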
1862                 if (unlikely(rt)) {
1863                         resrtextents = qblocks = resblks;
1864                         resrtextents /= mp->m_sb.sb_rextsize;
1865                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1866                         quota_flag = XFS_QMOPT_RES_RTBLKS;
1867                 } else {
1868                         resrtextents = 0;
1869                         resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1870                         quota_flag = XFS_QMOPT_RES_REGBLKS;
1871                 }
1872
1873                 /*
1874                  * Allocate and setup the transaction.
1875                  */
1876                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1877                 error = xfs_trans_reserve(tp, resblks,
1878                                           XFS_WRITE_LOG_RES(mp), resrtextents,
1879                                           XFS_TRANS_PERM_LOG_RES,
1880                                           XFS_WRITE_LOG_COUNT);
1881                 /*
1882                  * Check for running out of space
1883                  */
1884                 if (error) {
1885                         /*
1886                          * Free the transaction structure.
1887                          */
1888                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1889                         xfs_trans_cancel(tp, 0);
1890                         break;
1891                 }
1892                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1893                 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1894                                                       0, quota_flag);
1895                 if (error)
1896                         goto error1;
1897
1898                 xfs_trans_ijoin(tp, ip, 0);
1899
1900                 xfs_bmap_init(&free_list, &firstfsb);
1901                 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1902                                         allocatesize_fsb, alloc_type, &firstfsb,
1903                                         0, imapp, &nimaps, &free_list);
1904                 if (error) {
1905                         goto error0;
1906                 }
1907
1908                 /*
1909                  * Complete the transaction
1910                  */
1911                 error = xfs_bmap_finish(&tp, &free_list, &committed);
1912                 if (error) {
1913                         goto error0;
1914                 }
1915
1916                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1917                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1918                 if (error) {
1919                         break;
1920                 }
1921
1922                 allocated_fsb = imapp->br_blockcount;
1923
1924                 if (nimaps == 0) {
1925                         error = XFS_ERROR(ENOSPC);
1926                         break;
1927                 }
1928
1929                 startoffset_fsb += allocated_fsb;
1930                 allocatesize_fsb -= allocated_fsb;
1931         }
1932
1933         return error;
1934
1935 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1936         xfs_bmap_cancel(&free_list);
1937         xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
1938
1939 error1: /* Just cancel transaction */
1940         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1941         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1942         return error;
1943 }
1944
1945 /*
1946  * Zero file bytes between startoff and endoff inclusive.
1947  * The iolock is held exclusive and no blocks are buffered.
1948  *
1949  * This function is used by xfs_free_file_space() to zero
1950  * partial blocks when the range to free is not block aligned.
1951  * When unreserving space with boundaries that are not block
1952  * aligned we round up the start and round down the end
1953  * boundaries and then use this function to zero the parts of
1954  * the blocks that got dropped during the rounding.
1955  */
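     /*
      * Editor's note: the loop below does this with a plain read-modify-write
      * cycle: each affected block is read synchronously into an uncached
      * buffer, the byte range inside it is memset() to zero, and the buffer
      * is written straight back out, one block per iteration.
      */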
1956 STATIC int
1957 xfs_zero_remaining_bytes(
1958         xfs_inode_t             *ip,
1959         xfs_off_t               startoff,
1960         xfs_off_t               endoff)
1961 {
1962         xfs_bmbt_irec_t         imap;
1963         xfs_fileoff_t           offset_fsb;
1964         xfs_off_t               lastoffset;
1965         xfs_off_t               offset;
1966         xfs_buf_t               *bp;
1967         xfs_mount_t             *mp = ip->i_mount;
1968         int                     nimap;
1969         int                     error = 0;
1970
1971         /*
1972          * Avoid doing I/O beyond eof - it's not necessary
1973          * since nothing can read beyond eof.  The space will
1974          * be zeroed when the file is extended anyway.
1975          */
1976         if (startoff >= XFS_ISIZE(ip))
1977                 return 0;
1978
1979         if (endoff > XFS_ISIZE(ip))
1980                 endoff = XFS_ISIZE(ip);
1981
1982         bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1983                                         mp->m_rtdev_targp : mp->m_ddev_targp,
1984                                 mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
1985         if (!bp)
1986                 return XFS_ERROR(ENOMEM);
1987
1988         xfs_buf_unlock(bp);
1989
1990         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1991                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1992                 nimap = 1;
1993                 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1994                 if (error || nimap < 1)
1995                         break;
1996                 ASSERT(imap.br_blockcount >= 1);
1997                 ASSERT(imap.br_startoff == offset_fsb);
1998                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1999                 if (lastoffset > endoff)
2000                         lastoffset = endoff;
2001                 if (imap.br_startblock == HOLESTARTBLOCK)
2002                         continue;
2003                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2004                 if (imap.br_state == XFS_EXT_UNWRITTEN)
2005                         continue;
2006                 XFS_BUF_UNDONE(bp);
2007                 XFS_BUF_UNWRITE(bp);
2008                 XFS_BUF_READ(bp);
2009                 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
2010                 xfsbdstrat(mp, bp);
2011                 error = xfs_buf_iowait(bp);
2012                 if (error) {
2013                         xfs_buf_ioerror_alert(bp,
2014                                         "xfs_zero_remaining_bytes(read)");
2015                         break;
2016                 }
2017                 memset(bp->b_addr +
2018                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
2019                       0, lastoffset - offset + 1);
2020                 XFS_BUF_UNDONE(bp);
2021                 XFS_BUF_UNREAD(bp);
2022                 XFS_BUF_WRITE(bp);
2023                 xfsbdstrat(mp, bp);
2024                 error = xfs_buf_iowait(bp);
2025                 if (error) {
2026                         xfs_buf_ioerror_alert(bp,
2027                                         "xfs_zero_remaining_bytes(write)");
2028                         break;
2029                 }
2030         }
2031         xfs_buf_free(bp);
2032         return error;
2033 }
2034
2035 /*
2036  * xfs_free_file_space()
2037  *      This routine frees disk space for the given file.
2038  *
2039  *      This routine is only called by xfs_change_file_space
2040  *      for an UNRESVSP type call.
2041  *
2042  * RETURNS:
2043  *       0 on success
2044  *      errno on error
2045  *
2046  */
2047 STATIC int
2048 xfs_free_file_space(
2049         xfs_inode_t             *ip,
2050         xfs_off_t               offset,
2051         xfs_off_t               len,
2052         int                     attr_flags)
2053 {
2054         int                     committed;
2055         int                     done;
2056         xfs_fileoff_t           endoffset_fsb;
2057         int                     error;
2058         xfs_fsblock_t           firstfsb;
2059         xfs_bmap_free_t         free_list;
2060         xfs_bmbt_irec_t         imap;
2061         xfs_off_t               ioffset;
2062         xfs_extlen_t            mod = 0;
2063         xfs_mount_t             *mp;
2064         int                     nimap;
2065         uint                    resblks;
2066         uint                    rounding;
2067         int                     rt;
2068         xfs_fileoff_t           startoffset_fsb;
2069         xfs_trans_t             *tp;
2070         int                     need_iolock = 1;
2071
2072         mp = ip->i_mount;
2073
2074         trace_xfs_free_file_space(ip);
2075
2076         error = xfs_qm_dqattach(ip, 0);
2077         if (error)
2078                 return error;
2079
2080         error = 0;
2081         if (len <= 0)   /* if nothing being freed */
2082                 return error;
2083         rt = XFS_IS_REALTIME_INODE(ip);
2084         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
2085         endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
2086
2087         if (attr_flags & XFS_ATTR_NOLOCK)
2088                 need_iolock = 0;
2089         if (need_iolock) {
2090                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2091                 /* wait for the completion of any pending DIOs */
2092                 inode_dio_wait(VFS_I(ip));
2093         }
2094
2095         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
2096         ioffset = offset & ~(rounding - 1);
2097
2098         if (VN_CACHED(VFS_I(ip)) != 0) {
2099                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
2100                 if (error)
2101                         goto out_unlock_iolock;
2102         }
2103
2104         /*
2105          * Need to zero the parts we're not freeing, on disk.
2106          * If it's a realtime file and we can't use unwritten extents then we
2107          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
2108          * will take care of it for us.
2109          */
2110         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
2111                 nimap = 1;
2112                 error = xfs_bmapi_read(ip, startoffset_fsb, 1,
2113                                         &imap, &nimap, 0);
2114                 if (error)
2115                         goto out_unlock_iolock;
2116                 ASSERT(nimap == 0 || nimap == 1);
2117                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
2118                         xfs_daddr_t     block;
2119
2120                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2121                         block = imap.br_startblock;
2122                         mod = do_div(block, mp->m_sb.sb_rextsize);
2123                         if (mod)
2124                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
2125                 }
2126                 nimap = 1;
2127                 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
2128                                         &imap, &nimap, 0);
2129                 if (error)
2130                         goto out_unlock_iolock;
2131                 ASSERT(nimap == 0 || nimap == 1);
2132                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
2133                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
2134                         mod++;
2135                         if (mod && (mod != mp->m_sb.sb_rextsize))
2136                                 endoffset_fsb -= mod;
2137                 }
2138         }
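             /*
              * Editor's worked example: with sb_rextsize = 4 and the first
              * mapped block at 10, mod = 2 and startoffset_fsb is bumped by 2
              * so that the bunmapi range below starts on a realtime extent
              * boundary; the partial extents trimmed off here are zeroed by
              * the xfs_zero_remaining_bytes() calls below instead of freed.
              */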
2139         if ((done = (endoffset_fsb <= startoffset_fsb)))
2140                 /*
2141                  * One contiguous piece to clear
2142                  */
2143                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
2144         else {
2145                 /*
2146                  * Some full blocks, possibly two pieces to clear
2147                  */
2148                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
2149                         error = xfs_zero_remaining_bytes(ip, offset,
2150                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
2151                 if (!error &&
2152                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
2153                         error = xfs_zero_remaining_bytes(ip,
2154                                 XFS_FSB_TO_B(mp, endoffset_fsb),
2155                                 offset + len - 1);
2156         }
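             /*
              * Editor's worked example: on a 4096-byte-block filesystem,
              * freeing offset = 1000, len = 10000 gives startoffset_fsb = 1
              * and endoffset_fsb = 2, so bytes 1000-4095 and 8192-10999 are
              * zeroed above while only the fully covered block 1 (bytes
              * 4096-8191) is handed to xfs_bunmapi() below.
              */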
2157
2158         /*
2159          * free file space until done or until there is an error
2160          */
2161         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
2162         while (!error && !done) {
2163
2164                 /*
2165                  * allocate and setup the transaction. Allow this
2166                  * transaction to dip into the reserve blocks to ensure
2167                  * the freeing of the space succeeds at ENOSPC.
2168                  */
2169                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
2170                 tp->t_flags |= XFS_TRANS_RESERVE;
2171                 error = xfs_trans_reserve(tp,
2172                                           resblks,
2173                                           XFS_WRITE_LOG_RES(mp),
2174                                           0,
2175                                           XFS_TRANS_PERM_LOG_RES,
2176                                           XFS_WRITE_LOG_COUNT);
2177
2178                 /*
2179                  * check for running out of space
2180                  */
2181                 if (error) {
2182                         /*
2183                          * Free the transaction structure.
2184                          */
2185                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
2186                         xfs_trans_cancel(tp, 0);
2187                         break;
2188                 }
2189                 xfs_ilock(ip, XFS_ILOCK_EXCL);
2190                 error = xfs_trans_reserve_quota(tp, mp,
2191                                 ip->i_udquot, ip->i_gdquot,
2192                                 resblks, 0, XFS_QMOPT_RES_REGBLKS);
2193                 if (error)
2194                         goto error1;
2195
2196                 xfs_trans_ijoin(tp, ip, 0);
2197
2198                 /*
2199                  * issue the bunmapi() call to free the blocks
2200                  */
2201                 xfs_bmap_init(&free_list, &firstfsb);
2202                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
2203                                   endoffset_fsb - startoffset_fsb,
2204                                   0, 2, &firstfsb, &free_list, &done);
2205                 if (error) {
2206                         goto error0;
2207                 }
2208
2209                 /*
2210                  * complete the transaction
2211                  */
2212                 error = xfs_bmap_finish(&tp, &free_list, &committed);
2213                 if (error) {
2214                         goto error0;
2215                 }
2216
2217                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2218                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2219         }
2220
2221  out_unlock_iolock:
2222         if (need_iolock)
2223                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2224         return error;
2225
2226  error0:
2227         xfs_bmap_cancel(&free_list);
2228  error1:
2229         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
2230         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
2231                     XFS_ILOCK_EXCL);
2232         return error;
2233 }
2234
2235 /*
2236  * xfs_change_file_space()
2237  *      This routine allocates or frees disk space for the given file.
2238  *      The user specified parameters are checked for alignment and size
2239  *      limitations.
2240  *
2241  * RETURNS:
2242  *       0 on success
2243  *      errno on error
2244  *
2245  */
2246 int
2247 xfs_change_file_space(
2248         xfs_inode_t     *ip,
2249         int             cmd,
2250         xfs_flock64_t   *bf,
2251         xfs_off_t       offset,
2252         int             attr_flags)
2253 {
2254         xfs_mount_t     *mp = ip->i_mount;
2255         int             clrprealloc;
2256         int             error;
2257         xfs_fsize_t     fsize;
2258         int             setprealloc;
2259         xfs_off_t       startoffset;
2260         xfs_off_t       llen;
2261         xfs_trans_t     *tp;
2262         struct iattr    iattr;
2263         int             prealloc_type;
2264
2265         if (!S_ISREG(ip->i_d.di_mode))
2266                 return XFS_ERROR(EINVAL);
2267
2268         switch (bf->l_whence) {
2269         case 0: /*SEEK_SET*/
2270                 break;
2271         case 1: /*SEEK_CUR*/
2272                 bf->l_start += offset;
2273                 break;
2274         case 2: /*SEEK_END*/
2275                 bf->l_start += XFS_ISIZE(ip);
2276                 break;
2277         default:
2278                 return XFS_ERROR(EINVAL);
2279         }
2280
2281         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
2282
2283         if (bf->l_start < 0 ||
2284             bf->l_start > XFS_MAXIOFFSET(mp) ||
2285             bf->l_start + llen < 0 ||
2286             bf->l_start + llen > XFS_MAXIOFFSET(mp))
2287                 return XFS_ERROR(EINVAL);
2288
2289         bf->l_whence = 0;
2290
2291         startoffset = bf->l_start;
2292         fsize = XFS_ISIZE(ip);
2293
2294         /*
2295          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
2296          * file space.
2297          * These calls do NOT zero the data space allocated to the file,
2298          * nor do they change the file size.
2299          *
2300          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
2301          * space.
2302          * These calls cause the new file data to be zeroed and the file
2303          * size to be changed.
2304          */
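             /*
              * Editor's illustration (hypothetical userspace snippet, not part
              * of this file): these commands normally arrive via
              * ioctl()/xfsctl(), e.g.
              *
              *      struct xfs_flock64 fl = {
              *              .l_whence = SEEK_SET,
              *              .l_start  = 0,
              *              .l_len    = 16 * 1024 * 1024,
              *      };
              *      ioctl(fd, XFS_IOC_RESVSP64, &fl);
              *
              * preallocates 16MB at the start of the file without changing its
              * size, whereas XFS_IOC_FREESP64 with l_start = N changes the
              * file size to N bytes via the xfs_setattr_size() call below.
              */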
2305         setprealloc = clrprealloc = 0;
2306         prealloc_type = XFS_BMAPI_PREALLOC;
2307
2308         switch (cmd) {
2309         case XFS_IOC_ZERO_RANGE:
2310                 prealloc_type |= XFS_BMAPI_CONVERT;
2311                 xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
2312                 /* FALLTHRU */
2313         case XFS_IOC_RESVSP:
2314         case XFS_IOC_RESVSP64:
2315                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2316                                                 prealloc_type, attr_flags);
2317                 if (error)
2318                         return error;
2319                 setprealloc = 1;
2320                 break;
2321
2322         case XFS_IOC_UNRESVSP:
2323         case XFS_IOC_UNRESVSP64:
2324                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
2325                                                                 attr_flags)))
2326                         return error;
2327                 break;
2328
2329         case XFS_IOC_ALLOCSP:
2330         case XFS_IOC_ALLOCSP64:
2331         case XFS_IOC_FREESP:
2332         case XFS_IOC_FREESP64:
2333                 if (startoffset > fsize) {
2334                         error = xfs_alloc_file_space(ip, fsize,
2335                                         startoffset - fsize, 0, attr_flags);
2336                         if (error)
2337                                 break;
2338                 }
2339
2340                 iattr.ia_valid = ATTR_SIZE;
2341                 iattr.ia_size = startoffset;
2342
2343                 error = xfs_setattr_size(ip, &iattr, attr_flags);
2344
2345                 if (error)
2346                         return error;
2347
2348                 clrprealloc = 1;
2349                 break;
2350
2351         default:
2352                 ASSERT(0);
2353                 return XFS_ERROR(EINVAL);
2354         }
2355
2356         /*
2357          * update the inode timestamp, mode, and prealloc flag bits
2358          */
2359         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
2360
2361         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
2362                                       0, 0, 0))) {
2363                 /* ASSERT(0); */
2364                 xfs_trans_cancel(tp, 0);
2365                 return error;
2366         }
2367
2368         xfs_ilock(ip, XFS_ILOCK_EXCL);
2369         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2370
2371         if ((attr_flags & XFS_ATTR_DMI) == 0) {
2372                 ip->i_d.di_mode &= ~S_ISUID;
2373
2374                 /*
2375                  * Note that we don't have to worry about mandatory
2376                  * file locking being disabled here because we only
2377                  * clear the S_ISGID bit if the Group execute bit is
2378                  * on, but if it was on then mandatory locking wouldn't
2379                  * have been enabled.
2380                  */
2381                 if (ip->i_d.di_mode & S_IXGRP)
2382                         ip->i_d.di_mode &= ~S_ISGID;
2383
2384                 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2385         }
2386         if (setprealloc)
2387                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
2388         else if (clrprealloc)
2389                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
2390
2391         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2392         if (attr_flags & XFS_ATTR_SYNC)
2393                 xfs_trans_set_sync(tp);
2394         return xfs_trans_commit(tp, 0);
2395 }