/*
 * Source: karo-tx-linux.git (git.kernelconcepts.de) -- fs/xfs/xfs_vnodeops.c
 * Snapshot taken at commit: "[XFS] Cleanup inode extent size hint extraction"
 */
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_types.h"
22 #include "xfs_bit.h"
23 #include "xfs_log.h"
24 #include "xfs_inum.h"
25 #include "xfs_trans.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_dmapi.h"
30 #include "xfs_mount.h"
31 #include "xfs_da_btree.h"
32 #include "xfs_bmap_btree.h"
33 #include "xfs_alloc_btree.h"
34 #include "xfs_ialloc_btree.h"
35 #include "xfs_dir2_sf.h"
36 #include "xfs_attr_sf.h"
37 #include "xfs_dinode.h"
38 #include "xfs_inode.h"
39 #include "xfs_inode_item.h"
40 #include "xfs_itable.h"
41 #include "xfs_btree.h"
42 #include "xfs_ialloc.h"
43 #include "xfs_alloc.h"
44 #include "xfs_bmap.h"
45 #include "xfs_attr.h"
46 #include "xfs_rw.h"
47 #include "xfs_error.h"
48 #include "xfs_quota.h"
49 #include "xfs_utils.h"
50 #include "xfs_rtalloc.h"
51 #include "xfs_refcache.h"
52 #include "xfs_trans_space.h"
53 #include "xfs_log_priv.h"
54
55 STATIC int
56 xfs_open(
57         bhv_desc_t      *bdp,
58         cred_t          *credp)
59 {
60         int             mode;
61         bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
62         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
63
64         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
65                 return XFS_ERROR(EIO);
66
67         /*
68          * If it's a directory with any blocks, read-ahead block 0
69          * as we're almost certain to have the next operation be a read there.
70          */
71         if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
72                 mode = xfs_ilock_map_shared(ip);
73                 if (ip->i_d.di_nextents > 0)
74                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
75                 xfs_iunlock(ip, mode);
76         }
77         return 0;
78 }
79
80 STATIC int
81 xfs_close(
82         bhv_desc_t      *bdp,
83         int             flags,
84         lastclose_t     lastclose,
85         cred_t          *credp)
86 {
87         bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
88         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
89
90         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
91                 return XFS_ERROR(EIO);
92
93         if (lastclose != L_TRUE || !VN_ISREG(vp))
94                 return 0;
95
96         /*
97          * If we previously truncated this file and removed old data in
98          * the process, we want to initiate "early" writeout on the last
99          * close.  This is an attempt to combat the notorious NULL files
100          * problem which is particularly noticable from a truncate down,
101          * buffered (re-)write (delalloc), followed by a crash.  What we
102          * are effectively doing here is significantly reducing the time
103          * window where we'd otherwise be exposed to that problem.
104          */
105         if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
106                 return bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
107         return 0;
108 }
109
110 /*
111  * xfs_getattr
112  */
113 STATIC int
114 xfs_getattr(
115         bhv_desc_t      *bdp,
116         bhv_vattr_t     *vap,
117         int             flags,
118         cred_t          *credp)
119 {
120         xfs_inode_t     *ip;
121         xfs_mount_t     *mp;
122         bhv_vnode_t     *vp;
123
124         vp  = BHV_TO_VNODE(bdp);
125         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
126
127         ip = XFS_BHVTOI(bdp);
128         mp = ip->i_mount;
129
130         if (XFS_FORCED_SHUTDOWN(mp))
131                 return XFS_ERROR(EIO);
132
133         if (!(flags & ATTR_LAZY))
134                 xfs_ilock(ip, XFS_ILOCK_SHARED);
135
136         vap->va_size = XFS_ISIZE(ip);
137         if (vap->va_mask == XFS_AT_SIZE)
138                 goto all_done;
139
140         vap->va_nblocks =
141                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
142         vap->va_nodeid = ip->i_ino;
143 #if XFS_BIG_INUMS
144         vap->va_nodeid += mp->m_inoadd;
145 #endif
146         vap->va_nlink = ip->i_d.di_nlink;
147
148         /*
149          * Quick exit for non-stat callers
150          */
151         if ((vap->va_mask &
152             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
153               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
154                 goto all_done;
155
156         /*
157          * Copy from in-core inode.
158          */
159         vap->va_mode = ip->i_d.di_mode;
160         vap->va_uid = ip->i_d.di_uid;
161         vap->va_gid = ip->i_d.di_gid;
162         vap->va_projid = ip->i_d.di_projid;
163
164         /*
165          * Check vnode type block/char vs. everything else.
166          */
167         switch (ip->i_d.di_mode & S_IFMT) {
168         case S_IFBLK:
169         case S_IFCHR:
170                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
171                 vap->va_blocksize = BLKDEV_IOSIZE;
172                 break;
173         default:
174                 vap->va_rdev = 0;
175
176                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
177                         vap->va_blocksize = xfs_preferred_iosize(mp);
178                 } else {
179
180                         /*
181                          * If the file blocks are being allocated from a
182                          * realtime partition, then return the inode's
183                          * realtime extent size or the realtime volume's
184                          * extent size.
185                          */
186                         vap->va_blocksize =
187                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
188                 }
189                 break;
190         }
191
192         vn_atime_to_timespec(vp, &vap->va_atime);
193         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
194         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
195         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
196         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
197
198         /*
199          * Exit for stat callers.  See if any of the rest of the fields
200          * to be filled in are needed.
201          */
202         if ((vap->va_mask &
203              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
204               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
205                 goto all_done;
206
207         /*
208          * Convert di_flags to xflags.
209          */
210         vap->va_xflags = xfs_ip2xflags(ip);
211
212         /*
213          * Exit for inode revalidate.  See if any of the rest of
214          * the fields to be filled in are needed.
215          */
216         if ((vap->va_mask &
217              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
218               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
219                 goto all_done;
220
221         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
222         vap->va_nextents =
223                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
224                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
225                         ip->i_d.di_nextents;
226         if (ip->i_afp)
227                 vap->va_anextents =
228                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
229                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
230                                  ip->i_d.di_anextents;
231         else
232                 vap->va_anextents = 0;
233         vap->va_gen = ip->i_d.di_gen;
234
235  all_done:
236         if (!(flags & ATTR_LAZY))
237                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
238         return 0;
239 }
240
241
242 /*
243  * xfs_setattr
244  */
245 int
246 xfs_setattr(
247         bhv_desc_t              *bdp,
248         bhv_vattr_t             *vap,
249         int                     flags,
250         cred_t                  *credp)
251 {
252         xfs_inode_t             *ip;
253         xfs_trans_t             *tp;
254         xfs_mount_t             *mp;
255         int                     mask;
256         int                     code;
257         uint                    lock_flags;
258         uint                    commit_flags=0;
259         uid_t                   uid=0, iuid=0;
260         gid_t                   gid=0, igid=0;
261         int                     timeflags = 0;
262         bhv_vnode_t             *vp;
263         xfs_prid_t              projid=0, iprojid=0;
264         int                     mandlock_before, mandlock_after;
265         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
266         int                     file_owner;
267         int                     need_iolock = 1;
268
269         vp = BHV_TO_VNODE(bdp);
270         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
271
272         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
273                 return XFS_ERROR(EROFS);
274
275         /*
276          * Cannot set certain attributes.
277          */
278         mask = vap->va_mask;
279         if (mask & XFS_AT_NOSET) {
280                 return XFS_ERROR(EINVAL);
281         }
282
283         ip = XFS_BHVTOI(bdp);
284         mp = ip->i_mount;
285
286         if (XFS_FORCED_SHUTDOWN(mp))
287                 return XFS_ERROR(EIO);
288
289         /*
290          * Timestamps do not need to be logged and hence do not
291          * need to be done within a transaction.
292          */
293         if (mask & XFS_AT_UPDTIMES) {
294                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
295                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
296                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
297                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
298                 xfs_ichgtime(ip, timeflags);
299                 return 0;
300         }
301
302         olddquot1 = olddquot2 = NULL;
303         udqp = gdqp = NULL;
304
305         /*
306          * If disk quotas is on, we make sure that the dquots do exist on disk,
307          * before we start any other transactions. Trying to do this later
308          * is messy. We don't care to take a readlock to look at the ids
309          * in inode here, because we can't hold it across the trans_reserve.
310          * If the IDs do change before we take the ilock, we're covered
311          * because the i_*dquot fields will get updated anyway.
312          */
313         if (XFS_IS_QUOTA_ON(mp) &&
314             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
315                 uint    qflags = 0;
316
317                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
318                         uid = vap->va_uid;
319                         qflags |= XFS_QMOPT_UQUOTA;
320                 } else {
321                         uid = ip->i_d.di_uid;
322                 }
323                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
324                         gid = vap->va_gid;
325                         qflags |= XFS_QMOPT_GQUOTA;
326                 }  else {
327                         gid = ip->i_d.di_gid;
328                 }
329                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
330                         projid = vap->va_projid;
331                         qflags |= XFS_QMOPT_PQUOTA;
332                 }  else {
333                         projid = ip->i_d.di_projid;
334                 }
335                 /*
336                  * We take a reference when we initialize udqp and gdqp,
337                  * so it is important that we never blindly double trip on
338                  * the same variable. See xfs_create() for an example.
339                  */
340                 ASSERT(udqp == NULL);
341                 ASSERT(gdqp == NULL);
342                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
343                                          &udqp, &gdqp);
344                 if (code)
345                         return code;
346         }
347
348         /*
349          * For the other attributes, we acquire the inode lock and
350          * first do an error checking pass.
351          */
352         tp = NULL;
353         lock_flags = XFS_ILOCK_EXCL;
354         if (flags & ATTR_NOLOCK)
355                 need_iolock = 0;
356         if (!(mask & XFS_AT_SIZE)) {
357                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
358                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
359                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
360                         commit_flags = 0;
361                         if ((code = xfs_trans_reserve(tp, 0,
362                                                      XFS_ICHANGE_LOG_RES(mp), 0,
363                                                      0, 0))) {
364                                 lock_flags = 0;
365                                 goto error_return;
366                         }
367                 }
368         } else {
369                 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
370                     !(flags & ATTR_DMI)) {
371                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
372                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
373                                 vap->va_size, 0, dmflags, NULL);
374                         if (code) {
375                                 lock_flags = 0;
376                                 goto error_return;
377                         }
378                 }
379                 if (need_iolock)
380                         lock_flags |= XFS_IOLOCK_EXCL;
381         }
382
383         xfs_ilock(ip, lock_flags);
384
385         /* boolean: are we the file owner? */
386         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
387
388         /*
389          * Change various properties of a file.
390          * Only the owner or users with CAP_FOWNER
391          * capability may do these things.
392          */
393         if (mask &
394             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
395              XFS_AT_GID|XFS_AT_PROJID)) {
396                 /*
397                  * CAP_FOWNER overrides the following restrictions:
398                  *
399                  * The user ID of the calling process must be equal
400                  * to the file owner ID, except in cases where the
401                  * CAP_FSETID capability is applicable.
402                  */
403                 if (!file_owner && !capable(CAP_FOWNER)) {
404                         code = XFS_ERROR(EPERM);
405                         goto error_return;
406                 }
407
408                 /*
409                  * CAP_FSETID overrides the following restrictions:
410                  *
411                  * The effective user ID of the calling process shall match
412                  * the file owner when setting the set-user-ID and
413                  * set-group-ID bits on that file.
414                  *
415                  * The effective group ID or one of the supplementary group
416                  * IDs of the calling process shall match the group owner of
417                  * the file when setting the set-group-ID bit on that file
418                  */
419                 if (mask & XFS_AT_MODE) {
420                         mode_t m = 0;
421
422                         if ((vap->va_mode & S_ISUID) && !file_owner)
423                                 m |= S_ISUID;
424                         if ((vap->va_mode & S_ISGID) &&
425                             !in_group_p((gid_t)ip->i_d.di_gid))
426                                 m |= S_ISGID;
427 #if 0
428                         /* Linux allows this, Irix doesn't. */
429                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
430                                 m |= S_ISVTX;
431 #endif
432                         if (m && !capable(CAP_FSETID))
433                                 vap->va_mode &= ~m;
434                 }
435         }
436
437         /*
438          * Change file ownership.  Must be the owner or privileged.
439          * If the system was configured with the "restricted_chown"
440          * option, the owner is not permitted to give away the file,
441          * and can change the group id only to a group of which he
442          * or she is a member.
443          */
444         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
445                 /*
446                  * These IDs could have changed since we last looked at them.
447                  * But, we're assured that if the ownership did change
448                  * while we didn't have the inode locked, inode's dquot(s)
449                  * would have changed also.
450                  */
451                 iuid = ip->i_d.di_uid;
452                 iprojid = ip->i_d.di_projid;
453                 igid = ip->i_d.di_gid;
454                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
455                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
456                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
457                          iprojid;
458
459                 /*
460                  * CAP_CHOWN overrides the following restrictions:
461                  *
462                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
463                  * shall override the restriction that a process cannot
464                  * change the user ID of a file it owns and the restriction
465                  * that the group ID supplied to the chown() function
466                  * shall be equal to either the group ID or one of the
467                  * supplementary group IDs of the calling process.
468                  */
469                 if (restricted_chown &&
470                     (iuid != uid || (igid != gid &&
471                                      !in_group_p((gid_t)gid))) &&
472                     !capable(CAP_CHOWN)) {
473                         code = XFS_ERROR(EPERM);
474                         goto error_return;
475                 }
476                 /*
477                  * Do a quota reservation only if uid/projid/gid is actually
478                  * going to change.
479                  */
480                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
481                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
482                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
483                         ASSERT(tp);
484                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
485                                                 capable(CAP_FOWNER) ?
486                                                 XFS_QMOPT_FORCE_RES : 0);
487                         if (code)       /* out of quota */
488                                 goto error_return;
489                 }
490         }
491
492         /*
493          * Truncate file.  Must have write permission and not be a directory.
494          */
495         if (mask & XFS_AT_SIZE) {
496                 /* Short circuit the truncate case for zero length files */
497                 if ((vap->va_size == 0) &&
498                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
499                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
500                         lock_flags &= ~XFS_ILOCK_EXCL;
501                         if (mask & XFS_AT_CTIME)
502                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
503                         code = 0;
504                         goto error_return;
505                 }
506
507                 if (VN_ISDIR(vp)) {
508                         code = XFS_ERROR(EISDIR);
509                         goto error_return;
510                 } else if (!VN_ISREG(vp)) {
511                         code = XFS_ERROR(EINVAL);
512                         goto error_return;
513                 }
514                 /*
515                  * Make sure that the dquots are attached to the inode.
516                  */
517                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
518                         goto error_return;
519         }
520
521         /*
522          * Change file access or modified times.
523          */
524         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
525                 if (!file_owner) {
526                         if ((flags & ATTR_UTIME) &&
527                             !capable(CAP_FOWNER)) {
528                                 code = XFS_ERROR(EPERM);
529                                 goto error_return;
530                         }
531                 }
532         }
533
534         /*
535          * Change extent size or realtime flag.
536          */
537         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
538                 /*
539                  * Can't change extent size if any extents are allocated.
540                  */
541                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
542                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
543                      vap->va_extsize) ) {
544                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
545                         goto error_return;
546                 }
547
548                 /*
549                  * Can't change realtime flag if any extents are allocated.
550                  */
551                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
552                     (mask & XFS_AT_XFLAGS) &&
553                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
554                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
555                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
556                         goto error_return;
557                 }
558                 /*
559                  * Extent size must be a multiple of the appropriate block
560                  * size, if set at all.
561                  */
562                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
563                         xfs_extlen_t    size;
564
565                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
566                             ((mask & XFS_AT_XFLAGS) &&
567                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
568                                 size = mp->m_sb.sb_rextsize <<
569                                        mp->m_sb.sb_blocklog;
570                         } else {
571                                 size = mp->m_sb.sb_blocksize;
572                         }
573                         if (vap->va_extsize % size) {
574                                 code = XFS_ERROR(EINVAL);
575                                 goto error_return;
576                         }
577                 }
578                 /*
579                  * If realtime flag is set then must have realtime data.
580                  */
581                 if ((mask & XFS_AT_XFLAGS) &&
582                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
583                         if ((mp->m_sb.sb_rblocks == 0) ||
584                             (mp->m_sb.sb_rextsize == 0) ||
585                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
586                                 code = XFS_ERROR(EINVAL);
587                                 goto error_return;
588                         }
589                 }
590
591                 /*
592                  * Can't modify an immutable/append-only file unless
593                  * we have appropriate permission.
594                  */
595                 if ((mask & XFS_AT_XFLAGS) &&
596                     (ip->i_d.di_flags &
597                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
598                      (vap->va_xflags &
599                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
600                     !capable(CAP_LINUX_IMMUTABLE)) {
601                         code = XFS_ERROR(EPERM);
602                         goto error_return;
603                 }
604         }
605
606         /*
607          * Now we can make the changes.  Before we join the inode
608          * to the transaction, if XFS_AT_SIZE is set then take care of
609          * the part of the truncation that must be done without the
610          * inode lock.  This needs to be done before joining the inode
611          * to the transaction, because the inode cannot be unlocked
612          * once it is a part of the transaction.
613          */
614         if (mask & XFS_AT_SIZE) {
615                 code = 0;
616                 if ((vap->va_size > ip->i_size) &&
617                     (flags & ATTR_NOSIZETOK) == 0) {
618                         code = xfs_igrow_start(ip, vap->va_size, credp);
619                 }
620                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
621                 vn_iowait(vp); /* wait for the completion of any pending DIOs */
622                 if (!code)
623                         code = xfs_itruncate_data(ip, vap->va_size);
624                 if (code) {
625                         ASSERT(tp == NULL);
626                         lock_flags &= ~XFS_ILOCK_EXCL;
627                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
628                         goto error_return;
629                 }
630                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
631                 if ((code = xfs_trans_reserve(tp, 0,
632                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
633                                              XFS_TRANS_PERM_LOG_RES,
634                                              XFS_ITRUNCATE_LOG_COUNT))) {
635                         xfs_trans_cancel(tp, 0);
636                         if (need_iolock)
637                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
638                         return code;
639                 }
640                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
641                 xfs_ilock(ip, XFS_ILOCK_EXCL);
642         }
643
644         if (tp) {
645                 xfs_trans_ijoin(tp, ip, lock_flags);
646                 xfs_trans_ihold(tp, ip);
647         }
648
649         /* determine whether mandatory locking mode changes */
650         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
651
652         /*
653          * Truncate file.  Must have write permission and not be a directory.
654          */
655         if (mask & XFS_AT_SIZE) {
656                 if (vap->va_size > ip->i_size) {
657                         xfs_igrow_finish(tp, ip, vap->va_size,
658                             !(flags & ATTR_DMI));
659                 } else if ((vap->va_size <= ip->i_size) ||
660                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
661                         /*
662                          * signal a sync transaction unless
663                          * we're truncating an already unlinked
664                          * file on a wsync filesystem
665                          */
666                         code = xfs_itruncate_finish(&tp, ip,
667                                             (xfs_fsize_t)vap->va_size,
668                                             XFS_DATA_FORK,
669                                             ((ip->i_d.di_nlink != 0 ||
670                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
671                                              ? 1 : 0));
672                         if (code)
673                                 goto abort_return;
674                         /*
675                          * Truncated "down", so we're removing references
676                          * to old data here - if we now delay flushing for
677                          * a long time, we expose ourselves unduly to the
678                          * notorious NULL files problem.  So, we mark this
679                          * vnode and flush it when the file is closed, and
680                          * do not wait the usual (long) time for writeout.
681                          */
682                         VTRUNCATE(vp);
683                 }
684                 /*
685                  * Have to do this even if the file's size doesn't change.
686                  */
687                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
688         }
689
690         /*
691          * Change file access modes.
692          */
693         if (mask & XFS_AT_MODE) {
694                 ip->i_d.di_mode &= S_IFMT;
695                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
696
697                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
698                 timeflags |= XFS_ICHGTIME_CHG;
699         }
700
701         /*
702          * Change file ownership.  Must be the owner or privileged.
703          * If the system was configured with the "restricted_chown"
704          * option, the owner is not permitted to give away the file,
705          * and can change the group id only to a group of which he
706          * or she is a member.
707          */
708         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
709                 /*
710                  * CAP_FSETID overrides the following restrictions:
711                  *
712                  * The set-user-ID and set-group-ID bits of a file will be
713                  * cleared upon successful return from chown()
714                  */
715                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
716                     !capable(CAP_FSETID)) {
717                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
718                 }
719
720                 /*
721                  * Change the ownerships and register quota modifications
722                  * in the transaction.
723                  */
724                 if (iuid != uid) {
725                         if (XFS_IS_UQUOTA_ON(mp)) {
726                                 ASSERT(mask & XFS_AT_UID);
727                                 ASSERT(udqp);
728                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
729                                                         &ip->i_udquot, udqp);
730                         }
731                         ip->i_d.di_uid = uid;
732                 }
733                 if (igid != gid) {
734                         if (XFS_IS_GQUOTA_ON(mp)) {
735                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
736                                 ASSERT(mask & XFS_AT_GID);
737                                 ASSERT(gdqp);
738                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
739                                                         &ip->i_gdquot, gdqp);
740                         }
741                         ip->i_d.di_gid = gid;
742                 }
743                 if (iprojid != projid) {
744                         if (XFS_IS_PQUOTA_ON(mp)) {
745                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
746                                 ASSERT(mask & XFS_AT_PROJID);
747                                 ASSERT(gdqp);
748                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
749                                                         &ip->i_gdquot, gdqp);
750                         }
751                         ip->i_d.di_projid = projid;
752                         /*
753                          * We may have to rev the inode as well as
754                          * the superblock version number since projids didn't
755                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
756                          */
757                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
758                                 xfs_bump_ino_vers2(tp, ip);
759                 }
760
761                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
762                 timeflags |= XFS_ICHGTIME_CHG;
763         }
764
765
766         /*
767          * Change file access or modified times.
768          */
769         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
770                 if (mask & XFS_AT_ATIME) {
771                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
772                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
773                         ip->i_update_core = 1;
774                         timeflags &= ~XFS_ICHGTIME_ACC;
775                 }
776                 if (mask & XFS_AT_MTIME) {
777                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
778                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
779                         timeflags &= ~XFS_ICHGTIME_MOD;
780                         timeflags |= XFS_ICHGTIME_CHG;
781                 }
782                 if (tp && (flags & ATTR_UTIME))
783                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
784         }
785
786         /*
787          * Change XFS-added attributes.
788          */
789         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
790                 if (mask & XFS_AT_EXTSIZE) {
791                         /*
792                          * Converting bytes to fs blocks.
793                          */
794                         ip->i_d.di_extsize = vap->va_extsize >>
795                                 mp->m_sb.sb_blocklog;
796                 }
797                 if (mask & XFS_AT_XFLAGS) {
798                         uint    di_flags;
799
800                         /* can't set PREALLOC this way, just preserve it */
801                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
802                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
803                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
804                         if (vap->va_xflags & XFS_XFLAG_APPEND)
805                                 di_flags |= XFS_DIFLAG_APPEND;
806                         if (vap->va_xflags & XFS_XFLAG_SYNC)
807                                 di_flags |= XFS_DIFLAG_SYNC;
808                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
809                                 di_flags |= XFS_DIFLAG_NOATIME;
810                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
811                                 di_flags |= XFS_DIFLAG_NODUMP;
812                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
813                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
814                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
815                                 di_flags |= XFS_DIFLAG_NODEFRAG;
816                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
817                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
818                                         di_flags |= XFS_DIFLAG_RTINHERIT;
819                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
820                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
821                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
822                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
823                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
824                                 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
825                                         di_flags |= XFS_DIFLAG_REALTIME;
826                                         ip->i_iocore.io_flags |= XFS_IOCORE_RT;
827                                 } else {
828                                         ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
829                                 }
830                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
831                                         di_flags |= XFS_DIFLAG_EXTSIZE;
832                         }
833                         ip->i_d.di_flags = di_flags;
834                 }
835                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
836                 timeflags |= XFS_ICHGTIME_CHG;
837         }
838
839         /*
840          * Change file inode change time only if XFS_AT_CTIME set
841          * AND we have been called by a DMI function.
842          */
843
844         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
845                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
846                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
847                 ip->i_update_core = 1;
848                 timeflags &= ~XFS_ICHGTIME_CHG;
849         }
850
851         /*
852          * Send out timestamp changes that need to be set to the
853          * current time.  Not done when called by a DMI function.
854          */
855         if (timeflags && !(flags & ATTR_DMI))
856                 xfs_ichgtime(ip, timeflags);
857
858         XFS_STATS_INC(xs_ig_attrchg);
859
860         /*
861          * If this is a synchronous mount, make sure that the
862          * transaction goes to disk before returning to the user.
863          * This is slightly sub-optimal in that truncates require
864          * two sync transactions instead of one for wsync filesystems.
865          * One for the truncate and one for the timestamps since we
866          * don't want to change the timestamps unless we're sure the
867          * truncate worked.  Truncates are less than 1% of the laddis
868          * mix so this probably isn't worth the trouble to optimize.
869          */
870         code = 0;
871         if (tp) {
872                 if (mp->m_flags & XFS_MOUNT_WSYNC)
873                         xfs_trans_set_sync(tp);
874
875                 code = xfs_trans_commit(tp, commit_flags);
876         }
877
878         /*
879          * If the (regular) file's mandatory locking mode changed, then
880          * notify the vnode.  We do this under the inode lock to prevent
881          * racing calls to vop_vnode_change.
882          */
883         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
884         if (mandlock_before != mandlock_after) {
885                 bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
886                                  mandlock_after);
887         }
888
889         xfs_iunlock(ip, lock_flags);
890
891         /*
892          * Release any dquot(s) the inode had kept before chown.
893          */
894         XFS_QM_DQRELE(mp, olddquot1);
895         XFS_QM_DQRELE(mp, olddquot2);
896         XFS_QM_DQRELE(mp, udqp);
897         XFS_QM_DQRELE(mp, gdqp);
898
899         if (code) {
900                 return code;
901         }
902
903         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
904             !(flags & ATTR_DMI)) {
905                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
906                                         NULL, DM_RIGHT_NULL, NULL, NULL,
907                                         0, 0, AT_DELAY_FLAG(flags));
908         }
909         return 0;
910
911  abort_return:
912         commit_flags |= XFS_TRANS_ABORT;
913         /* FALLTHROUGH */
914  error_return:
915         XFS_QM_DQRELE(mp, udqp);
916         XFS_QM_DQRELE(mp, gdqp);
917         if (tp) {
918                 xfs_trans_cancel(tp, commit_flags);
919         }
920         if (lock_flags != 0) {
921                 xfs_iunlock(ip, lock_flags);
922         }
923         return code;
924 }
925
926
927 /*
928  * xfs_access
929  * Null conversion from vnode mode bits to inode mode bits, as in efs.
930  */
931 STATIC int
932 xfs_access(
933         bhv_desc_t      *bdp,
934         int             mode,
935         cred_t          *credp)
936 {
937         xfs_inode_t     *ip;
938         int             error;
939
940         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
941                                                (inst_t *)__return_address);
942
943         ip = XFS_BHVTOI(bdp);
944         xfs_ilock(ip, XFS_ILOCK_SHARED);
945         error = xfs_iaccess(ip, mode, credp);
946         xfs_iunlock(ip, XFS_ILOCK_SHARED);
947         return error;
948 }
949
950
951 /*
952  * The maximum pathlen is 1024 bytes. Since the minimum file system
953  * blocksize is 512 bytes, we can get a max of 2 extents back from
954  * bmapi.
955  */
956 #define SYMLINK_MAPS 2
957
/*
 * xfs_readlink
 *
 * Copy the symlink target into the caller's uio.  The link is either
 * stored inline in the inode's data fork, or in at most SYMLINK_MAPS
 * extents of data blocks which we read through the buffer cache.
 * Returns 0 or an XFS error code.
 */
STATIC int
xfs_readlink(
	bhv_desc_t	*bdp,
	uio_t		*uiop,
	int		ioflags,
	cred_t		*credp)
{
	xfs_inode_t	*ip;
	int		count;
	xfs_off_t	offset;
	int		pathlen;
	bhv_vnode_t	*vp;
	int		error = 0;
	xfs_mount_t	*mp;
	int		nmaps;
	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
	xfs_daddr_t	d;
	int		byte_cnt;
	int		n;
	xfs_buf_t	*bp;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	xfs_ilock(ip, XFS_ILOCK_SHARED);

	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);

	offset = uiop->uio_offset;
	count = uiop->uio_resid;

	if (offset < 0) {
		error = XFS_ERROR(EINVAL);
		goto error_return;
	}
	/* nothing requested: succeed without reading anything */
	if (count <= 0) {
		error = 0;
		goto error_return;
	}

	/*
	 * See if the symlink is stored inline.
	 */
	pathlen = (int)ip->i_d.di_size;

	if (ip->i_df.if_flags & XFS_IFINLINE) {
		error = xfs_uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
	}
	else {
		/*
		 * Symlink not inline.  Call bmap to get it in.
		 */
		nmaps = SYMLINK_MAPS;

		error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
				  0, NULL, 0, mval, &nmaps, NULL, NULL);

		if (error) {
			goto error_return;
		}

		/* read each extent and feed the link text into the uio */
		for (n = 0; n < nmaps; n++) {
			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
			bp = xfs_buf_read(mp->m_ddev_targp, d,
				      BTOBB(byte_cnt), 0);
			error = XFS_BUF_GETERROR(bp);
			if (error) {
				xfs_ioerror_alert("xfs_readlink",
					  ip->i_mount, bp, XFS_BUF_ADDR(bp));
				xfs_buf_relse(bp);
				goto error_return;
			}
			/* last block may hold fewer than byte_cnt link bytes */
			if (pathlen < byte_cnt)
				byte_cnt = pathlen;
			pathlen -= byte_cnt;

			/*
			 * NOTE(review): an error from xfs_uio_read() here is
			 * not checked before the next iteration, so a later
			 * successful read can overwrite it — confirm whether
			 * a mid-loop copy failure can be lost.
			 */
			error = xfs_uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
			xfs_buf_relse (bp);
		}

	}

error_return:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}
1055
1056
/*
 * xfs_fsync
 *
 * This is called to sync the inode and its data out to disk.
 * We need to hold the I/O lock while flushing the data, and
 * the inode lock while flushing the inode.  The inode lock CANNOT
 * be held while flushing the data, so acquire after we're done
 * with that.
 *
 * flag may include FSYNC_WAIT (make the commit/log force synchronous)
 * and/or FSYNC_DATA (only care whether the size needs flushing, not
 * the whole inode core).  start/stop describe a byte range but are
 * only sanity-checked here.  Returns 0 or an XFS error code.
 */
STATIC int
xfs_fsync(
	bhv_desc_t	*bdp,
	int		flag,
	cred_t		*credp,
	xfs_off_t	start,
	xfs_off_t	stop)
{
	xfs_inode_t	*ip;
	xfs_trans_t	*tp;
	int		error;
	/* changed == 0: inode clean and unpinned, skip the cache flush below */
	int		log_flushed = 0, changed = 1;

	vn_trace_entry(BHV_TO_VNODE(bdp),
			__FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);

	ASSERT(start >= 0 && stop >= -1);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return XFS_ERROR(EIO);

	/*
	 * We always need to make sure that the required inode state
	 * is safe on disk.  The vnode might be clean but because
	 * of committed transactions that haven't hit the disk yet.
	 * Likewise, there could be unflushed non-transactional
	 * changes to the inode core that have to go to disk.
	 *
	 * The following code depends on one assumption:  that
	 * any transaction that changes an inode logs the core
	 * because it has to change some field in the inode core
	 * (typically nextents or nblocks).  That assumption
	 * implies that any transactions against an inode will
	 * catch any non-transactional updates.  If inode-altering
	 * transactions exist that violate this assumption, the
	 * code breaks.  Right now, it figures that if the involved
	 * update_* field is clear and the inode is unpinned, the
	 * inode is clean.  Either it's been flushed or it's been
	 * committed and the commit has hit the disk unpinning the inode.
	 * (Note that xfs_inode_item_format() called at commit clears
	 * the update_* fields.)
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);

	/* If we are flushing data then we care about update_size
	 * being set, otherwise we care about update_core
	 */
	if ((flag & FSYNC_DATA) ?
			(ip->i_update_size == 0) :
			(ip->i_update_core == 0)) {
		/*
		 * Timestamps/size haven't changed since last inode
		 * flush or inode transaction commit.  That means
		 * either nothing got written or a transaction
		 * committed which caught the updates.  If the
		 * latter happened and the transaction hasn't
		 * hit the disk yet, the inode will be still
		 * be pinned.  If it is, force the log.
		 */

		xfs_iunlock(ip, XFS_ILOCK_SHARED);

		if (xfs_ipincount(ip)) {
			_xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
				      XFS_LOG_FORCE |
				      ((flag & FSYNC_WAIT)
				       ? XFS_LOG_SYNC : 0),
				      &log_flushed);
		} else {
			/*
			 * If the inode is not pinned and nothing
			 * has changed we don't need to flush the
			 * cache.
			 */
			changed = 0;
		}
		error = 0;
	} else  {
		/*
		 * Kick off a transaction to log the inode
		 * core to get the updates.  Make it
		 * sync if FSYNC_WAIT is passed in (which
		 * is done by everybody but specfs).  The
		 * sync transaction will also force the log.
		 */
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
		if ((error = xfs_trans_reserve(tp, 0,
				XFS_FSYNC_TS_LOG_RES(ip->i_mount),
				0, 0, 0)))  {
			xfs_trans_cancel(tp, 0);
			return error;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);

		/*
		 * Note - it's possible that we might have pushed
		 * ourselves out of the way during trans_reserve
		 * which would flush the inode.  But there's no
		 * guarantee that the inode buffer has actually
		 * gone out yet (it's delwri).  Plus the buffer
		 * could be pinned anyway if it's part of an
		 * inode in another recent transaction.  So we
		 * play it safe and fire off the transaction anyway.
		 */
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		if (flag & FSYNC_WAIT)
			xfs_trans_set_sync(tp);
		error = _xfs_trans_commit(tp, 0, &log_flushed);

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
		/*
		 * If the log write didn't issue an ordered tag we need
		 * to flush the disk cache for the data device now.
		 */
		if (!log_flushed)
			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);

		/*
		 * If this inode is on the RT dev we need to flush that
		 * cache as well.
		 */
		if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
	}

	return error;
}
1201
1202 /*
1203  * This is called by xfs_inactive to free any blocks beyond eof
1204  * when the link count isn't zero and by xfs_dm_punch_hole() when
1205  * punching a hole to EOF.
1206  */
1207 int
1208 xfs_free_eofblocks(
1209         xfs_mount_t     *mp,
1210         xfs_inode_t     *ip,
1211         int             flags)
1212 {
1213         xfs_trans_t     *tp;
1214         int             error;
1215         xfs_fileoff_t   end_fsb;
1216         xfs_fileoff_t   last_fsb;
1217         xfs_filblks_t   map_len;
1218         int             nimaps;
1219         xfs_bmbt_irec_t imap;
1220         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1221
1222         /*
1223          * Figure out if there are any blocks beyond the end
1224          * of the file.  If not, then there is nothing to do.
1225          */
1226         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1227         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1228         map_len = last_fsb - end_fsb;
1229         if (map_len <= 0)
1230                 return 0;
1231
1232         nimaps = 1;
1233         xfs_ilock(ip, XFS_ILOCK_SHARED);
1234         error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
1235                           NULL, 0, &imap, &nimaps, NULL, NULL);
1236         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1237
1238         if (!error && (nimaps != 0) &&
1239             (imap.br_startblock != HOLESTARTBLOCK ||
1240              ip->i_delayed_blks)) {
1241                 /*
1242                  * Attach the dquots to the inode up front.
1243                  */
1244                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1245                         return error;
1246
1247                 /*
1248                  * There are blocks after the end of file.
1249                  * Free them up now by truncating the file to
1250                  * its current size.
1251                  */
1252                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1253
1254                 /*
1255                  * Do the xfs_itruncate_start() call before
1256                  * reserving any log space because
1257                  * itruncate_start will call into the buffer
1258                  * cache and we can't
1259                  * do that within a transaction.
1260                  */
1261                 if (use_iolock)
1262                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1263                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1264                                     ip->i_size);
1265                 if (error) {
1266                         if (use_iolock)
1267                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1268                         return error;
1269                 }
1270
1271                 error = xfs_trans_reserve(tp, 0,
1272                                           XFS_ITRUNCATE_LOG_RES(mp),
1273                                           0, XFS_TRANS_PERM_LOG_RES,
1274                                           XFS_ITRUNCATE_LOG_COUNT);
1275                 if (error) {
1276                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1277                         xfs_trans_cancel(tp, 0);
1278                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1279                         return error;
1280                 }
1281
1282                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1283                 xfs_trans_ijoin(tp, ip,
1284                                 XFS_IOLOCK_EXCL |
1285                                 XFS_ILOCK_EXCL);
1286                 xfs_trans_ihold(tp, ip);
1287
1288                 error = xfs_itruncate_finish(&tp, ip,
1289                                              ip->i_size,
1290                                              XFS_DATA_FORK,
1291                                              0);
1292                 /*
1293                  * If we get an error at this point we
1294                  * simply don't bother truncating the file.
1295                  */
1296                 if (error) {
1297                         xfs_trans_cancel(tp,
1298                                          (XFS_TRANS_RELEASE_LOG_RES |
1299                                           XFS_TRANS_ABORT));
1300                 } else {
1301                         error = xfs_trans_commit(tp,
1302                                                 XFS_TRANS_RELEASE_LOG_RES);
1303                 }
1304                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1305                                             : XFS_ILOCK_EXCL));
1306         }
1307         return error;
1308 }
1309
/*
 * Free a symlink that has blocks associated with it.
 *
 * Called with *tpp holding an allocated, not-yet-reserved transaction
 * and the inode unlocked.  On success, returns 0 with the inode locked
 * (IOLOCK_EXCL | ILOCK_EXCL) and *tpp pointing at a fresh transaction
 * carrying an itruncate log reservation, with the inode NOT joined to
 * it.  On failure the transaction is cancelled, the inode is unlocked,
 * *tpp is set to NULL, and an error code is returned.
 */
STATIC int
xfs_inactive_symlink_rmt(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	xfs_buf_t	*bp;
	int		committed;
	int		done;
	int		error;
	xfs_fsblock_t	first_block;
	xfs_bmap_free_t	free_list;
	int		i;
	xfs_mount_t	*mp;
	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
	int		nmaps;
	xfs_trans_t	*ntp;
	int		size;
	xfs_trans_t	*tp;

	tp = *tpp;
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
	/*
	 * We're freeing a symlink that has some
	 * blocks allocated to it.  Free the
	 * blocks here.  We know that we've got
	 * either 1 or 2 extents and that we can
	 * free them all in one bunmapi call.
	 */
	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		*tpp = NULL;
		return error;
	}
	/*
	 * Lock the inode, fix the size, and join it to the transaction.
	 * Hold it so in the normal path, we still have it locked for
	 * the second transaction.  In the error paths we need it
	 * held so the cancel won't rele it, see below.
	 */
	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	size = (int)ip->i_d.di_size;
	ip->i_d.di_size = 0;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	/*
	 * Find the block(s) so we can inval and unmap them.
	 */
	done = 0;
	XFS_BMAP_INIT(&free_list, &first_block);
	nmaps = ARRAY_SIZE(mval);
	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
			&free_list, NULL)))
		goto error0;
	/*
	 * Invalidate the block(s).
	 */
	for (i = 0; i < nmaps; i++) {
		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
		xfs_trans_binval(tp, bp);
	}
	/*
	 * Unmap the dead block(s) to the free_list.
	 */
	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
			&first_block, &free_list, NULL, &done)))
		goto error1;
	ASSERT(done);
	/*
	 * Commit the first transaction.  This logs the EFI and the inode.
	 */
	if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
		goto error1;
	/*
	 * The transaction must have been committed, since there were
	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
	 * The new tp has the extent freeing and EFDs.
	 */
	ASSERT(committed);
	/*
	 * The first xact was committed, so add the inode to the new one.
	 * Mark it dirty so it will be logged and moved forward in the log as
	 * part of every commit.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	/*
	 * Get a new, empty transaction to return to our caller.
	 */
	ntp = xfs_trans_dup(tp);
	/*
	 * Commit the transaction containing extent freeing and EFDs.
	 * If we get an error on the commit here or on the reserve below,
	 * we need to unlock the inode since the new transaction doesn't
	 * have the inode attached.
	 */
	error = xfs_trans_commit(tp, 0);
	tp = ntp;
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		goto error0;
	}
	/*
	 * Remove the memory for extent descriptions (just bookkeeping).
	 */
	if (ip->i_df.if_bytes)
		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
	ASSERT(ip->i_df.if_bytes == 0);
	/*
	 * Put an itruncate log reservation in the new transaction
	 * for our caller.
	 */
	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		goto error0;
	}
	/*
	 * Return with the inode locked but not joined to the transaction.
	 */
	*tpp = tp;
	return 0;

 error1:
	xfs_bmap_cancel(&free_list);
 error0:
	/*
	 * Have to come here with the inode locked and either
	 * (held and in the transaction) or (not in the transaction).
	 * If the inode isn't held then cancel would iput it, but
	 * that's wrong since this is inactive and the vnode ref
	 * count is 0 already.
	 * Cancel won't do anything to the inode if held, but it still
	 * needs to be locked until the cancel is done, if it was
	 * joined to the transaction.
	 */
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	*tpp = NULL;
	return error;

}
1463
1464 STATIC int
1465 xfs_inactive_symlink_local(
1466         xfs_inode_t     *ip,
1467         xfs_trans_t     **tpp)
1468 {
1469         int             error;
1470
1471         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1472         /*
1473          * We're freeing a symlink which fit into
1474          * the inode.  Just free the memory used
1475          * to hold the old symlink.
1476          */
1477         error = xfs_trans_reserve(*tpp, 0,
1478                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1479                                   0, XFS_TRANS_PERM_LOG_RES,
1480                                   XFS_ITRUNCATE_LOG_COUNT);
1481
1482         if (error) {
1483                 xfs_trans_cancel(*tpp, 0);
1484                 *tpp = NULL;
1485                 return error;
1486         }
1487         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1488
1489         /*
1490          * Zero length symlinks _can_ exist.
1491          */
1492         if (ip->i_df.if_bytes > 0) {
1493                 xfs_idata_realloc(ip,
1494                                   -(ip->i_df.if_bytes),
1495                                   XFS_DATA_FORK);
1496                 ASSERT(ip->i_df.if_bytes == 0);
1497         }
1498         return 0;
1499 }
1500
/*
 * Tear down the attribute fork of an inode that is being inactivated.
 *
 * Called with the inode locked (iolock held exclusive, per the ASSERT,
 * plus the ilock) and *tpp holding a permanent-reservation transaction.
 * Commits the caller's transaction and drops the ilock so that
 * xfs_attr_inactive() can run, then starts a fresh transaction with an
 * ifree reservation, re-takes the ilock and rejoins the inode.  On
 * failure *tpp is NULL and the iolock has been dropped.
 */
STATIC int
xfs_inactive_attrs(
	xfs_inode_t	*ip,
	xfs_trans_t	**tpp)
{
	xfs_trans_t	*tp;
	int		error;
	xfs_mount_t	*mp;

	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
	tp = *tpp;
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_forkoff != 0);
	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	error = xfs_attr_inactive(ip);
	if (error) {
		*tpp = NULL;
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		return error; /* goto out */
	}

	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
	error = xfs_trans_reserve(tp, 0,
				  XFS_IFREE_LOG_RES(mp),
				  0, XFS_TRANS_PERM_LOG_RES,
				  XFS_INACTIVE_LOG_COUNT);
	if (error) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		xfs_trans_cancel(tp, 0);
		*tpp = NULL;
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	/* free the in-core attribute fork; on-disk work was done above */
	xfs_idestroy_fork(ip, XFS_ATTR_FORK);

	ASSERT(ip->i_d.di_anextents == 0);

	*tpp = tp;
	return 0;
}
1547
1548 STATIC int
1549 xfs_release(
1550         bhv_desc_t      *bdp)
1551 {
1552         xfs_inode_t     *ip;
1553         bhv_vnode_t     *vp;
1554         xfs_mount_t     *mp;
1555         int             error;
1556
1557         vp = BHV_TO_VNODE(bdp);
1558         ip = XFS_BHVTOI(bdp);
1559         mp = ip->i_mount;
1560
1561         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1562                 return 0;
1563
1564         /* If this is a read-only mount, don't do this (would generate I/O) */
1565         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1566                 return 0;
1567
1568 #ifdef HAVE_REFCACHE
1569         /* If we are in the NFS reference cache then don't do this now */
1570         if (ip->i_refcache)
1571                 return 0;
1572 #endif
1573
1574         if (ip->i_d.di_nlink != 0) {
1575                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1576                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1577                        ip->i_delayed_blks > 0)) &&
1578                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1579                     (!(ip->i_d.di_flags &
1580                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1581                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1582                         if (error)
1583                                 return error;
1584                         /* Update linux inode block count after free above */
1585                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1586                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1587                 }
1588         }
1589
1590         return 0;
1591 }
1592
/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 *
 * Returns VN_INACTIVE_CACHE on every path.  For linked inodes this
 * at most trims blocks beyond EOF; for unlinked inodes it truncates
 * the data fork (regular files), tears down the symlink target, or
 * just reserves for the free, then destroys the attribute fork and
 * frees the inode itself.
 */
STATIC int
xfs_inactive(
        bhv_desc_t      *bdp,
        cred_t          *credp)
{
        xfs_inode_t     *ip;
        bhv_vnode_t     *vp;
        xfs_bmap_free_t free_list;
        xfs_fsblock_t   first_block;
        int             committed;
        xfs_trans_t     *tp;
        xfs_mount_t     *mp;
        int             error;
        int             truncate;       /* nonzero: data fork needs truncating */

        vp = BHV_TO_VNODE(bdp);
        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

        ip = XFS_BHVTOI(bdp);

        /*
         * If the inode is already free, then there can be nothing
         * to clean up here.
         */
        if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
                ASSERT(ip->i_df.if_real_bytes == 0);
                ASSERT(ip->i_df.if_broot_bytes == 0);
                return VN_INACTIVE_CACHE;
        }

        /*
         * Only do a truncate if it's a regular file with
         * some actual space in it.  It's OK to look at the
         * inode's fields without the lock because we're the
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
            ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            ((ip->i_d.di_mode & S_IFMT) == S_IFREG));

        mp = ip->i_mount;

        /* Fire the DMAPI destroy event for unlinked inodes, best-effort. */
        if (ip->i_d.di_nlink == 0 &&
            DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
                (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
        }

        error = 0;

        /* If this is a read-only mount, don't do this (would generate I/O) */
        if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
                goto out;

        /*
         * Still-linked inodes: at most release speculative blocks past
         * EOF (same idea as xfs_release(), but this variant also fires
         * when delayed-allocation blocks exist on PREALLOC/APPEND files).
         */
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
                       ip->i_delayed_blks > 0)) &&
                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
                     (!(ip->i_d.di_flags &
                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
                      (ip->i_delayed_blks != 0)))) {
                        error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
                        if (error)
                                return VN_INACTIVE_CACHE;
                        /* Update linux inode block count after free above */
                        vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
                                ip->i_d.di_nblocks + ip->i_delayed_blks);
                }
                goto out;
        }

        ASSERT(ip->i_d.di_nlink == 0);

        /* Quota structures must be attached before freeing anything. */
        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
                return VN_INACTIVE_CACHE;

        /* Allocated here, reserved in one of the three branches below. */
        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        if (truncate) {
                /*
                 * Do the xfs_itruncate_start() call before
                 * reserving any log space because itruncate_start
                 * will call into the buffer cache and we can't
                 * do that within a transaction.
                 */
                xfs_ilock(ip, XFS_IOLOCK_EXCL);

                error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
                if (error) {
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                error = xfs_trans_reserve(tp, 0,
                                          XFS_ITRUNCATE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_ITRUNCATE_LOG_COUNT);
                if (error) {
                        /* Don't call itruncate_cleanup */
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * normally, we have to run xfs_itruncate_finish sync.
                 * But if filesystem is wsync and we're in the inactive
                 * path, then we know that nlink == 0, and that the
                 * xaction that made nlink == 0 is permanently committed
                 * since xfs_remove runs as a synchronous transaction.
                 */
                error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
                                (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));

                if (error) {
                        xfs_trans_cancel(tp,
                                XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }
        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {

                /*
                 * If we get an error while cleaning up a
                 * symlink we bail out.  The helpers reserve log space
                 * and lock the inode; on success we join it below.
                 */
                error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
                        xfs_inactive_symlink_rmt(ip, &tp) :
                        xfs_inactive_symlink_local(ip, &tp);

                if (error) {
                        ASSERT(tp == NULL);
                        return VN_INACTIVE_CACHE;
                }

                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        } else {
                /* Nothing to truncate: just reserve enough for the free. */
                error = xfs_trans_reserve(tp, 0,
                                          XFS_IFREE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_INACTIVE_LOG_COUNT);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        }

        /*
         * If there are attributes associated with the file
         * then blow them away now.  The code calls a routine
         * that recursively deconstructs the attribute fork.
         * We need to just commit the current transaction
         * because we can't use it for xfs_attr_inactive().
         */
        if (ip->i_d.di_anextents > 0) {
                error = xfs_inactive_attrs(ip, &tp);
                /*
                 * If we got an error, the transaction is already
                 * cancelled, and the inode is unlocked. Just get out.
                 */
                 if (error)
                         return VN_INACTIVE_CACHE;
        } else if (ip->i_afp) {
                /* In-core-only attribute fork: no transaction needed. */
                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
        }

        /*
         * Free the inode.
         */
        XFS_BMAP_INIT(&free_list, &first_block);
        error = xfs_ifree(tp, ip, &free_list);
        if (error) {
                /*
                 * If we fail to free the inode, shut down.  The cancel
                 * might do that, we need to make sure.  Otherwise the
                 * inode might be lost for a long time or forever.
                 */
                if (!XFS_FORCED_SHUTDOWN(mp)) {
                        cmn_err(CE_NOTE,
                "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
                                error, mp->m_fsname);
                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                }
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        } else {
                /*
                 * Credit the quota account(s). The inode is gone.
                 */
                XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

                /*
                 * Just ignore errors at this point.  There is
                 * nothing we can do except to try to keep going.
                 */
                (void) xfs_bmap_finish(&tp,  &free_list, &committed);
                (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        }
        /*
         * Release the dquots held by inode, if any.
         */
        XFS_QM_DQDETACH(mp, ip);

        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

 out:
        return VN_INACTIVE_CACHE;
}
1819
1820
1821 /*
1822  * xfs_lookup
1823  */
1824 STATIC int
1825 xfs_lookup(
1826         bhv_desc_t              *dir_bdp,
1827         bhv_vname_t             *dentry,
1828         bhv_vnode_t             **vpp,
1829         int                     flags,
1830         bhv_vnode_t             *rdir,
1831         cred_t                  *credp)
1832 {
1833         xfs_inode_t             *dp, *ip;
1834         xfs_ino_t               e_inum;
1835         int                     error;
1836         uint                    lock_mode;
1837         bhv_vnode_t             *dir_vp;
1838
1839         dir_vp = BHV_TO_VNODE(dir_bdp);
1840         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1841
1842         dp = XFS_BHVTOI(dir_bdp);
1843
1844         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1845                 return XFS_ERROR(EIO);
1846
1847         lock_mode = xfs_ilock_map_shared(dp);
1848         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1849         if (!error) {
1850                 *vpp = XFS_ITOV(ip);
1851                 ITRACE(ip);
1852         }
1853         xfs_iunlock_map_shared(dp, lock_mode);
1854         return error;
1855 }
1856
1857
/*
 * xfs_create (create a new file).
 *
 * Allocates a new inode, enters 'name' for it in directory dp, and
 * returns the new vnode through *vpp.  Quota is reserved up front,
 * and DMAPI CREATE/POSTCREATE events are sent around the operation
 * when enabled.  On any failure after the transaction is allocated,
 * control flows through the abort_return/error_return/abort_rele
 * labels, which cancel the transaction and release quota references
 * before falling into std_return for the POSTCREATE event.
 */
STATIC int
xfs_create(
        bhv_desc_t              *dir_bdp,
        bhv_vname_t             *dentry,
        bhv_vattr_t             *vap,
        bhv_vnode_t             **vpp,
        cred_t                  *credp)
{
        char                    *name = VNAME(dentry);
        bhv_vnode_t             *dir_vp;
        xfs_inode_t             *dp, *ip;
        bhv_vnode_t             *vp = NULL;
        xfs_trans_t             *tp;
        xfs_mount_t             *mp;
        xfs_dev_t               rdev;
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               dp_joined_to_trans;     /* dp lock owned by tp? */
        int                     dm_event_sent = 0;
        uint                    cancel_flags;
        int                     committed;
        xfs_prid_t              prid;
        struct xfs_dquot        *udqp, *gdqp;
        uint                    resblks;
        int                     dm_di_mode;
        int                     namelen;

        ASSERT(!*vpp);
        dir_vp = BHV_TO_VNODE(dir_bdp);
        vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);

        dp = XFS_BHVTOI(dir_bdp);
        mp = dp->i_mount;

        dm_di_mode = vap->va_mode;
        namelen = VNAMELEN(dentry);

        /* Pre-operation DMAPI event; an error here vetoes the create. */
        if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
                                dir_vp, DM_RIGHT_NULL, NULL,
                                DM_RIGHT_NULL, name, NULL,
                                dm_di_mode, 0, 0);

                if (error)
                        return error;
                dm_event_sent = 1;
        }

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        /* Return through std_return after this point. */

        udqp = gdqp = NULL;
        /* Project id: inherited from dir, caller-supplied, or default. */
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
        else if (vap->va_mask & XFS_AT_PROJID)
                prid = (xfs_prid_t)vap->va_projid;
        else
                prid = (xfs_prid_t)dfltprid;

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
                        current_fsuid(credp), current_fsgid(credp), prid,
                        XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;

        ip = NULL;
        dp_joined_to_trans = B_FALSE;

        tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        resblks = XFS_CREATE_SPACE_RES(mp, namelen);
        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
        error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
        if (error == ENOSPC) {
                /* Retry with no block reservation (resblks == 0 below). */
                resblks = 0;
                error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
                                XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
        }
        if (error) {
                cancel_flags = 0;
                dp = NULL;      /* dp not locked: skip the unlock in error_return */
                goto error_return;
        }

        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);

        XFS_BMAP_INIT(&free_list, &first_block);

        ASSERT(ip == NULL);

        /*
         * Reserve disk quota and the inode.
         */
        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
        if (error)
                goto error_return;

        /* With no block reservation, verify the entry will fit first. */
        if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
                goto error_return;
        rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
        error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
                        rdev, credp, prid, resblks > 0,
                        &ip, &committed);
        if (error) {
                if (error == ENOSPC)
                        goto error_return;
                goto abort_return;
        }
        ITRACE(ip);

        /*
         * At this point, we've gotten a newly allocated inode.
         * It is locked (and joined to the transaction).
         */

        ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));

        /*
         * Now we join the directory inode to the transaction.
         * We do not do it earlier because xfs_dir_ialloc
         * might commit the previous transaction (and release
         * all the locks).
         */

        VN_HOLD(dir_vp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        dp_joined_to_trans = B_TRUE;

        error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
                                        &first_block, &free_list, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
                goto abort_return;
        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

        /*
         * If this is a synchronous mount, make sure that the
         * create transaction goes to disk before returning to
         * the user.
         */
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
                xfs_trans_set_sync(tp);
        }

        dp->i_gen++;

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * These ids of the inode couldn't have changed since the new
         * inode has been locked ever since it was created.
         */
        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);

        /*
         * xfs_trans_commit normally decrements the vnode ref count
         * when it unlocks the inode. Since we want to return the
         * vnode to the caller, we bump the vnode ref count now.
         */
        IHOLD(ip);
        vp = XFS_ITOV(ip);

        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error) {
                xfs_bmap_cancel(&free_list);
                goto abort_rele;
        }

        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        if (error) {
                IRELE(ip);
                tp = NULL;      /* commit consumed the transaction */
                goto error_return;
        }

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        /*
         * Propagate the fact that the vnode changed after the
         * xfs_inode locks have been released.
         */
        bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);

        *vpp = vp;

        /* Fallthrough to std_return with error = 0  */

std_return:
        /* Post-operation DMAPI event, sent on success or vetoed failure. */
        if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
                        DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
                                                        DM_EVENT_POSTCREATE)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
                        dir_vp, DM_RIGHT_NULL,
                        *vpp ? vp:NULL,
                        DM_RIGHT_NULL, name, NULL,
                        dm_di_mode, error, 0);
        }
        return error;

 abort_return:
        cancel_flags |= XFS_TRANS_ABORT;
        /* FALLTHROUGH */

 error_return:
        if (tp != NULL)
                xfs_trans_cancel(tp, cancel_flags);

        if (!dp_joined_to_trans && (dp != NULL))
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        goto std_return;

 abort_rele:
        /*
         * Wait until after the current transaction is aborted to
         * release the inode.  This prevents recursive transactions
         * and deadlocks from xfs_inactive.
         */
        cancel_flags |= XFS_TRANS_ABORT;
        xfs_trans_cancel(tp, cancel_flags);
        IRELE(ip);

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        goto std_return;
}
2105
#ifdef DEBUG
/*
 * Some counters to see if (and how often) we are hitting some deadlock
 * prevention code paths.  All are bumped from xfs_lock_dir_and_entry().
 */

int xfs_rm_locks;               /* total calls into xfs_lock_dir_and_entry() */
int xfs_rm_lock_delays;         /* delay()s taken while backing off */
int xfs_rm_attempts;            /* failed xfs_ilock_nowait() attempts */
#endif
2116
/*
 * The following routine will lock the inodes associated with the
 * directory and the named entry in the directory. The locks are
 * acquired in increasing inode number.
 *
 * If the entry is "..", then only the directory is locked. The
 * vnode ref count will still include that from the .. entry in
 * this case.
 *
 * There is a deadlock we need to worry about. If the locked directory is
 * in the AIL, it might be blocking up the log. The next inode we lock
 * could be already locked by another thread waiting for log space (e.g
 * a permanent log reservation with a long running transaction (see
 * xfs_itruncate_finish)). To solve this, we must check if the directory
 * is in the ail and use lock_nowait. If we can't lock, we need to
 * drop the inode lock on the directory and try again. xfs_iunlock will
 * potentially push the tail if we were holding up the log.
 *
 * Always returns 0.  On return dp holds ILOCK_EXCL, and ip holds
 * ILOCK_EXCL too unless ip == dp (the ".." case described above).
 */
STATIC int
xfs_lock_dir_and_entry(
        xfs_inode_t     *dp,
        xfs_inode_t     *ip)    /* inode of entry 'name' */
{
        int             attempts;
        xfs_ino_t       e_inum;
        xfs_inode_t     *ips[2];
        xfs_log_item_t  *lp;

#ifdef DEBUG
        xfs_rm_locks++;
#endif
        attempts = 0;

again:
        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);

        e_inum = ip->i_ino;

        ITRACE(ip);

        /*
         * We want to lock in increasing inum. Since we've already
         * acquired the lock on the directory, we may need to release
         * it if the inum of the entry turns out to be less.
         */
        if (e_inum > dp->i_ino) {
                /*
                 * We are already in the right order, so just
                 * lock on the inode of the entry.
                 * We need to use nowait if dp is in the AIL.
                 */

                lp = (xfs_log_item_t *)dp->i_itemp;
                if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
                        if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
                                attempts++;
#ifdef DEBUG
                                xfs_rm_attempts++;
#endif

                                /*
                                 * Unlock dp and try again.
                                 * xfs_iunlock will try to push the tail
                                 * if the inode is in the AIL.
                                 */

                                xfs_iunlock(dp, XFS_ILOCK_EXCL);

                                if ((attempts % 5) == 0) {
                                        delay(1); /* Don't just spin the CPU */
#ifdef DEBUG
                                        xfs_rm_lock_delays++;
#endif
                                }
                                goto again;
                        }
                } else {
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                }
        } else if (e_inum < dp->i_ino) {
                /* Wrong order: drop dp and take both via xfs_lock_inodes(). */
                xfs_iunlock(dp, XFS_ILOCK_EXCL);

                ips[0] = ip;
                ips[1] = dp;
                xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
        }
        /* else  e_inum == dp->i_ino */
        /*     This can happen if we're asked to lock /x/..
         *     the entry is "..", which is also the parent directory.
         */

        return 0;
}
2210
#ifdef DEBUG
/* Retry statistics for xfs_lock_inodes(), bucketed by attempt count. */
int xfs_locked_n;               /* lock runs that needed no retries */
int xfs_small_retries;          /* runs retried fewer than 5 times */
int xfs_middle_retries;         /* runs retried fewer than 100 times */
int xfs_lots_retries;           /* runs retried 100 or more times */
int xfs_lock_delays;            /* delay()s taken while backing off */
#endif
2218
2219 /*
2220  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2221  * a different value
2222  */
2223 static inline int
2224 xfs_lock_inumorder(int lock_mode, int subclass)
2225 {
2226         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2227                 lock_mode |= (subclass + XFS_IOLOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2228         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2229                 lock_mode |= (subclass + XFS_ILOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2230
2231         return lock_mode;
2232 }
2233
/*
 * The following routine will lock n inodes in exclusive mode.
 * We assume the caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock
 * is in the AIL and we start waiting for another inode that is locked
 * by a thread in a long running transaction (such as truncate). This can
 * result in deadlock since the long running trans might need to wait
 * for the inode we just locked in order to push the tail and free space
 * in the log.
 *
 * @ips:          array of inodes, sorted by i_ino (duplicates allowed
 *                and skipped)
 * @inodes:       number of entries in ips (must be >= 2)
 * @first_locked: nonzero if the caller already holds ips[0]'s lock
 * @lock_mode:    base lock mode; per-inode subclass added via
 *                xfs_lock_inumorder()
 */
void
xfs_lock_inodes(
        xfs_inode_t     **ips,
        int             inodes,
        int             first_locked,
        uint            lock_mode)
{
        int             attempts = 0, i, j, try_lock;
        xfs_log_item_t  *lp;

        ASSERT(ips && (inodes >= 2)); /* we need at least two */

        if (first_locked) {
                try_lock = 1;
                i = 1;
        } else {
                try_lock = 0;
                i = 0;
        }

again:
        for (; i < inodes; i++) {
                ASSERT(ips[i]);

                if (i && (ips[i] == ips[i-1]))  /* Already locked */
                        continue;

                /*
                 * If try_lock is not set yet, make sure all locked inodes
                 * are not in the AIL.
                 * If any are, set try_lock to be used later.
                 */

                if (!try_lock) {
                        for (j = (i - 1); j >= 0 && !try_lock; j--) {
                                lp = (xfs_log_item_t *)ips[j]->i_itemp;
                                if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
                                        try_lock++;
                                }
                        }
                }

                /*
                 * If any of the previous locks we have locked is in the AIL,
                 * we must TRY to get the second and subsequent locks. If
                 * we can't get any, we must release all we have
                 * and try again.
                 */

                if (try_lock) {
                        /* try_lock must be 0 if i is 0. */
                        /*
                         * try_lock means we have an inode locked
                         * that is in the AIL.
                         */
                        ASSERT(i != 0);
                        if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
                                attempts++;

                                /*
                                 * Unlock all previous guys and try again.
                                 * xfs_iunlock will try to push the tail
                                 * if the inode is in the AIL.
                                 */

                                for(j = i - 1; j >= 0; j--) {

                                        /*
                                         * Check to see if we've already
                                         * unlocked this one.
                                         * Not the first one going back,
                                         * and the inode ptr is the same.
                                         */
                                        if ((j != (i - 1)) && ips[j] ==
                                                                ips[j+1])
                                                continue;

                                        xfs_iunlock(ips[j], lock_mode);
                                }

                                if ((attempts % 5) == 0) {
                                        delay(1); /* Don't just spin the CPU */
#ifdef DEBUG
                                        xfs_lock_delays++;
#endif
                                }
                                /*
                                 * NOTE(review): the restart relocks from
                                 * index 0 even when first_locked was set —
                                 * confirm callers never pass first_locked
                                 * with an inode they cannot have relocked.
                                 */
                                i = 0;
                                try_lock = 0;
                                goto again;
                        }
                } else {
                        xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
                }
        }

#ifdef DEBUG
        if (attempts) {
                if (attempts < 5) xfs_small_retries++;
                else if (attempts < 100) xfs_middle_retries++;
                else xfs_lots_retries++;
        } else {
                xfs_locked_n++;
        }
#endif
}
2350
/*
 * Debug aid for the remove/rmdir paths below: each failure path calls
 * REMOVE_DEBUG_TRACE(__LINE__) so that, on DEBUG builds, the global
 * remove_which_error_return records which error return fired last.
 * Compiles away entirely on non-DEBUG builds.
 */
2351 #ifdef  DEBUG
2352 #define REMOVE_DEBUG_TRACE(x)   {remove_which_error_return = (x);}
2353 int remove_which_error_return = 0;
2354 #else /* ! DEBUG */
2355 #define REMOVE_DEBUG_TRACE(x)
2356 #endif  /* ! DEBUG */
2357
2358
2359 /*
2360  * xfs_remove
2361  *
 * Remove (unlink) the directory entry named by @dentry from the
 * directory behind @dir_bdp and drop one link from the entry's inode.
 * Sends DMAPI REMOVE/POSTREMOVE events when enabled, and performs the
 * directory-entry removal plus link drop in one permanent transaction.
 * @credp is unused here.  Returns 0 or a positive XFS error code.
2362  */
2363 STATIC int
2364 xfs_remove(
2365         bhv_desc_t              *dir_bdp,
2366         bhv_vname_t             *dentry,
2367         cred_t                  *credp)
2368 {
2369         bhv_vnode_t             *dir_vp;
2370         char                    *name = VNAME(dentry);
2371         xfs_inode_t             *dp, *ip;
2372         xfs_trans_t             *tp = NULL;
2373         xfs_mount_t             *mp;
2374         int                     error = 0;
2375         xfs_bmap_free_t         free_list;
2376         xfs_fsblock_t           first_block;
2377         int                     cancel_flags;
2378         int                     committed;
2379         int                     dm_di_mode = 0;
2380         int                     link_zero;
2381         uint                    resblks;
2382         int                     namelen;
2383
2384         dir_vp = BHV_TO_VNODE(dir_bdp);
2385         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2386
2387         dp = XFS_BHVTOI(dir_bdp);
2388         mp = dp->i_mount;
2389
2390         if (XFS_FORCED_SHUTDOWN(mp))
2391                 return XFS_ERROR(EIO);
2392
2393         namelen = VNAMELEN(dentry);
2394
        /*
         * Best-effort probe for the victim's mode so the DMAPI event can
         * report it.  A lookup failure is deliberately ignored here and
         * dm_di_mode simply stays 0; the reference is dropped right away.
         */
2395         if (!xfs_get_dir_entry(dentry, &ip)) {
2396                 dm_di_mode = ip->i_d.di_mode;
2397                 IRELE(ip);
2398         }
2399
2400         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2401                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2402                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2403                                         name, NULL, dm_di_mode, 0, 0);
2404                 if (error)
2405                         return error;
2406         }
2407
2408         /* From this point on, return through std_return */
2409         ip = NULL;
2410
2411         /*
2412          * We need to get a reference to ip before we get our log
2413          * reservation. The reason for this is that we cannot call
2414          * xfs_iget for an inode for which we do not have a reference
2415          * once we've acquired a log reservation. This is because the
2416          * inode we are trying to get might be in xfs_inactive going
2417          * for a log reservation. Since we'll have to wait for the
2418          * inactive code to complete before returning from xfs_iget,
2419          * we need to make sure that we don't have log space reserved
2420          * when we call xfs_iget.  Instead we get an unlocked reference
2421          * to the inode before getting our log reservation.
2422          */
2423         error = xfs_get_dir_entry(dentry, &ip);
2424         if (error) {
2425                 REMOVE_DEBUG_TRACE(__LINE__);
2426                 goto std_return;
2427         }
2428
2429         dm_di_mode = ip->i_d.di_mode;
2430
2431         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2432
2433         ITRACE(ip);
2434
        /* Attach dquots to both inodes (skip the second attach if dp == ip,
         * i.e. removing "." from a directory). */
2435         error = XFS_QM_DQATTACH(mp, dp, 0);
2436         if (!error && dp != ip)
2437                 error = XFS_QM_DQATTACH(mp, ip, 0);
2438         if (error) {
2439                 REMOVE_DEBUG_TRACE(__LINE__);
2440                 IRELE(ip);
2441                 goto std_return;
2442         }
2443
2444         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2445         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2446         /*
2447          * We try to get the real space reservation first,
2448          * allowing for directory btree deletion(s) implying
2449          * possible bmap insert(s).  If we can't get the space
2450          * reservation then we use 0 instead, and avoid the bmap
2451          * btree insert(s) in the directory code by, if the bmap
2452          * insert tries to happen, instead trimming the LAST
2453          * block from the directory.
2454          */
2455         resblks = XFS_REMOVE_SPACE_RES(mp);
2456         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2457                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2458         if (error == ENOSPC) {
2459                 resblks = 0;
2460                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2461                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2462         }
2463         if (error) {
2464                 ASSERT(error != ENOSPC);
2465                 REMOVE_DEBUG_TRACE(__LINE__);
                /* No log reservation was taken, so cancel with flags 0
                 * (not XFS_TRANS_RELEASE_LOG_RES). */
2466                 xfs_trans_cancel(tp, 0);
2467                 IRELE(ip);
2468                 return error;
2469         }
2470
2471         error = xfs_lock_dir_and_entry(dp, ip);
2472         if (error) {
2473                 REMOVE_DEBUG_TRACE(__LINE__);
2474                 xfs_trans_cancel(tp, cancel_flags);
2475                 IRELE(ip);
2476                 goto std_return;
2477         }
2478
2479         /*
2480          * At this point, we've gotten both the directory and the entry
2481          * inodes locked.
2482          */
2483         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2484         if (dp != ip) {
2485                 /*
2486                  * Increment vnode ref count only in this case since
2487                  * there's an extra vnode reference in the case where
2488                  * dp == ip.
2489                  */
2490                 IHOLD(dp);
2491                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2492         }
2493
2494         /*
2495          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2496          */
2497         XFS_BMAP_INIT(&free_list, &first_block);
2498         error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2499                                         &first_block, &free_list, 0);
2500         if (error) {
2501                 ASSERT(error != ENOENT);
2502                 REMOVE_DEBUG_TRACE(__LINE__);
2503                 goto error1;
2504         }
2505         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2506
        /* Bump the in-core generation so other users notice the change. */
2507         dp->i_gen++;
2508         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2509
2510         error = xfs_droplink(tp, ip);
2511         if (error) {
2512                 REMOVE_DEBUG_TRACE(__LINE__);
2513                 goto error1;
2514         }
2515
2516         /* Determine if this is the last link while
2517          * we are in the transaction.
2518          */
2519         link_zero = (ip)->i_d.di_nlink==0;
2520
2521         /*
2522          * Take an extra ref on the inode so that it doesn't
2523          * go to xfs_inactive() from within the commit.
2524          */
2525         IHOLD(ip);
2526
2527         /*
2528          * If this is a synchronous mount, make sure that the
2529          * remove transaction goes to disk before returning to
2530          * the user.
2531          */
2532         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2533                 xfs_trans_set_sync(tp);
2534         }
2535
2536         error = xfs_bmap_finish(&tp, &free_list, &committed);
2537         if (error) {
2538                 REMOVE_DEBUG_TRACE(__LINE__);
2539                 goto error_rele;
2540         }
2541
2542         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2543         if (error) {
2544                 IRELE(ip);
2545                 goto std_return;
2546         }
2547
2548         /*
2549          * Before we drop our extra reference to the inode, purge it
2550          * from the refcache if it is there.  By waiting until afterwards
2551          * to do the IRELE, we ensure that we won't go inactive in the
2552          * xfs_refcache_purge_ip routine (although that would be OK).
2553          */
2554         xfs_refcache_purge_ip(ip);
2555
2556         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2557
2558         /*
2559          * Let interposed file systems know about removed links.
2560          */
2561         bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
2562
2563         IRELE(ip);
2564
2565 /*      Fall through to std_return with error = 0 */
2566  std_return:
        /* POSTREMOVE is sent regardless of success/failure; error is
         * passed through to the event and then returned to the caller. */
2567         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2568                                                 DM_EVENT_POSTREMOVE)) {
2569                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2570                                 dir_vp, DM_RIGHT_NULL,
2571                                 NULL, DM_RIGHT_NULL,
2572                                 name, NULL, dm_di_mode, error, 0);
2573         }
2574         return error;
2575
2576  error1:
2577         xfs_bmap_cancel(&free_list);
2578         cancel_flags |= XFS_TRANS_ABORT;
2579         xfs_trans_cancel(tp, cancel_flags);
2580         goto std_return;
2581
2582  error_rele:
2583         /*
2584          * In this case make sure to not release the inode until after
2585          * the current transaction is aborted.  Releasing it beforehand
2586          * can cause us to go to xfs_inactive and start a recursive
2587          * transaction which can easily deadlock with the current one.
2588          */
2589         xfs_bmap_cancel(&free_list);
2590         cancel_flags |= XFS_TRANS_ABORT;
2591         xfs_trans_cancel(tp, cancel_flags);
2592
2593         /*
2594          * Before we drop our extra reference to the inode, purge it
2595          * from the refcache if it is there.  By waiting until afterwards
2596          * to do the IRELE, we ensure that we won't go inactive in the
2597          * xfs_refcache_purge_ip routine (although that would be OK).
2598          */
2599         xfs_refcache_purge_ip(ip);
2600
2601         IRELE(ip);
2602
2603         goto std_return;
2604 }
2605
2606
2607 /*
2608  * xfs_link
2609  *
 * Create a hard link named by @dentry in the directory behind
 * @target_dir_bdp, pointing at the inode of @src_vp (must not be a
 * directory — see the ASSERT below).  Sends DMAPI LINK/POSTLINK events
 * when enabled.  @credp is unused here.  Returns 0 or a positive XFS
 * error code.
2610  */
2611 STATIC int
2612 xfs_link(
2613         bhv_desc_t              *target_dir_bdp,
2614         bhv_vnode_t             *src_vp,
2615         bhv_vname_t             *dentry,
2616         cred_t                  *credp)
2617 {
2618         xfs_inode_t             *tdp, *sip;
2619         xfs_trans_t             *tp;
2620         xfs_mount_t             *mp;
2621         xfs_inode_t             *ips[2];
2622         int                     error;
2623         xfs_bmap_free_t         free_list;
2624         xfs_fsblock_t           first_block;
2625         int                     cancel_flags;
2626         int                     committed;
2627         bhv_vnode_t             *target_dir_vp;
2628         int                     resblks;
2629         char                    *target_name = VNAME(dentry);
2630         int                     target_namelen;
2631
2632         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2633         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2634         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2635
2636         target_namelen = VNAMELEN(dentry);
2637         ASSERT(!VN_ISDIR(src_vp));
2638
2639         sip = xfs_vtoi(src_vp);
2640         tdp = XFS_BHVTOI(target_dir_bdp);
2641         mp = tdp->i_mount;
2642         if (XFS_FORCED_SHUTDOWN(mp))
2643                 return XFS_ERROR(EIO);
2644
2645         if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2646                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2647                                         target_dir_vp, DM_RIGHT_NULL,
2648                                         src_vp, DM_RIGHT_NULL,
2649                                         target_name, NULL, 0, 0, 0);
2650                 if (error)
2651                         return error;
2652         }
2653
2654         /* Return through std_return after this point. */
2655
2656         error = XFS_QM_DQATTACH(mp, sip, 0);
2657         if (!error && sip != tdp)
2658                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2659         if (error)
2660                 goto std_return;
2661
        /* Reserve log space; on ENOSPC retry with a zero block reservation
         * and let the directory code cope (see xfs_dir_canenter below). */
2662         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2663         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2664         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2665         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2666                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2667         if (error == ENOSPC) {
2668                 resblks = 0;
2669                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2670                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2671         }
2672         if (error) {
2673                 cancel_flags = 0;
2674                 goto error_return;
2675         }
2676
        /*
         * Order the two inodes by inode number before handing them to
         * xfs_lock_inodes, so all callers acquire multi-inode locks in a
         * consistent order.
         */
2677         if (sip->i_ino < tdp->i_ino) {
2678                 ips[0] = sip;
2679                 ips[1] = tdp;
2680         } else {
2681                 ips[0] = tdp;
2682                 ips[1] = sip;
2683         }
2684
2685         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2686
2687         /*
2688          * Increment vnode ref counts since xfs_trans_commit &
2689          * xfs_trans_cancel will both unlock the inodes and
2690          * decrement the associated ref counts.
2691          */
2692         VN_HOLD(src_vp);
2693         VN_HOLD(target_dir_vp);
2694         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2695         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2696
2697         /*
2698          * If the source has too many links, we can't make any more to it.
2699          */
2700         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2701                 error = XFS_ERROR(EMLINK);
2702                 goto error_return;
2703         }
2704
2705         /*
2706          * If we are using project inheritance, we only allow hard link
2707          * creation in our tree when the project IDs are the same; else
2708          * the tree quota mechanism could be circumvented.
2709          */
2710         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2711                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2712                 error = XFS_ERROR(EXDEV);
2713                 goto error_return;
2714         }
2715
        /* With no block reservation, verify the entry will fit without
         * allocating before we try to create it. */
2716         if (resblks == 0 &&
2717             (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2718                 goto error_return;
2719
2720         XFS_BMAP_INIT(&free_list, &first_block);
2721
2722         error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2723                                    sip->i_ino, &first_block, &free_list,
2724                                    resblks);
2725         if (error)
2726                 goto abort_return;
2727         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2728         tdp->i_gen++;
2729         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2730
2731         error = xfs_bumplink(tp, sip);
2732         if (error)
2733                 goto abort_return;
2734
2735         /*
2736          * If this is a synchronous mount, make sure that the
2737          * link transaction goes to disk before returning to
2738          * the user.
2739          */
2740         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2741                 xfs_trans_set_sync(tp);
2742         }
2743
2744         error = xfs_bmap_finish (&tp, &free_list, &committed);
2745         if (error) {
2746                 xfs_bmap_cancel(&free_list);
2747                 goto abort_return;
2748         }
2749
2750         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2751         if (error)
2752                 goto std_return;
2753
2754         /* Fall through to std_return with error = 0. */
2755 std_return:
        /* POSTLINK is sent on both success and failure paths; error is
         * passed through to the event and returned to the caller. */
2756         if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2757                                                 DM_EVENT_POSTLINK)) {
2758                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2759                                 target_dir_vp, DM_RIGHT_NULL,
2760                                 src_vp, DM_RIGHT_NULL,
2761                                 target_name, NULL, 0, error, 0);
2762         }
2763         return error;
2764
2765  abort_return:
2766         cancel_flags |= XFS_TRANS_ABORT;
2767         /* FALLTHROUGH */
2768
2769  error_return:
2770         xfs_trans_cancel(tp, cancel_flags);
2771         goto std_return;
2772 }
2773
2774
2775 /*
2776  * xfs_mkdir
2777  *
 * Create a new directory named by @dentry under the directory behind
 * @dir_bdp, using mode/project attributes from @vap.  On success the
 * new directory's vnode is returned through @vpp with a held
 * reference.  Sends DMAPI CREATE/POSTCREATE events when enabled.
 * Returns 0 or a positive XFS error code.
2778  */
2779 STATIC int
2780 xfs_mkdir(
2781         bhv_desc_t              *dir_bdp,
2782         bhv_vname_t             *dentry,
2783         bhv_vattr_t             *vap,
2784         bhv_vnode_t             **vpp,
2785         cred_t                  *credp)
2786 {
2787         char                    *dir_name = VNAME(dentry);
2788         xfs_inode_t             *dp;
2789         xfs_inode_t             *cdp;   /* inode of created dir */
2790         bhv_vnode_t             *cvp;   /* vnode of created dir */
2791         xfs_trans_t             *tp;
2792         xfs_mount_t             *mp;
2793         int                     cancel_flags;
2794         int                     error;
2795         int                     committed;
2796         xfs_bmap_free_t         free_list;
2797         xfs_fsblock_t           first_block;
2798         bhv_vnode_t             *dir_vp;
2799         boolean_t               dp_joined_to_trans;
2800         boolean_t               created = B_FALSE;
2801         int                     dm_event_sent = 0;
2802         xfs_prid_t              prid;
2803         struct xfs_dquot        *udqp, *gdqp;
2804         uint                    resblks;
2805         int                     dm_di_mode;
2806         int                     dir_namelen;
2807
2808         dir_vp = BHV_TO_VNODE(dir_bdp);
2809         dp = XFS_BHVTOI(dir_bdp);
2810         mp = dp->i_mount;
2811
2812         if (XFS_FORCED_SHUTDOWN(mp))
2813                 return XFS_ERROR(EIO);
2814
2815         dir_namelen = VNAMELEN(dentry);
2816
2817         tp = NULL;
2818         dp_joined_to_trans = B_FALSE;
2819         dm_di_mode = vap->va_mode;
2820
2821         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2822                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2823                                         dir_vp, DM_RIGHT_NULL, NULL,
2824                                         DM_RIGHT_NULL, dir_name, NULL,
2825                                         dm_di_mode, 0, 0);
2826                 if (error)
2827                         return error;
2828                 dm_event_sent = 1;
2829         }
2830
2831         /* Return through std_return after this point. */
2832
2833         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2834
        /*
         * Pick the project ID for the new directory: inherited from the
         * parent when PROJINHERIT is set, else the caller-supplied value,
         * else the default.
         */
2835         mp = dp->i_mount;
2836         udqp = gdqp = NULL;
2837         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2838                 prid = dp->i_d.di_projid;
2839         else if (vap->va_mask & XFS_AT_PROJID)
2840                 prid = (xfs_prid_t)vap->va_projid;
2841         else
2842                 prid = (xfs_prid_t)dfltprid;
2843
2844         /*
2845          * Make sure that we have allocated dquot(s) on disk.
2846          */
2847         error = XFS_QM_DQVOPALLOC(mp, dp,
2848                         current_fsuid(credp), current_fsgid(credp), prid,
2849                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2850         if (error)
2851                 goto std_return;
2852
2853         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2854         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2855         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2856         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2857                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2858         if (error == ENOSPC) {
2859                 resblks = 0;
2860                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2861                                           XFS_TRANS_PERM_LOG_RES,
2862                                           XFS_MKDIR_LOG_COUNT);
2863         }
2864         if (error) {
2865                 cancel_flags = 0;
                /* dp has not been locked yet; clearing it keeps error_return
                 * from unlocking a lock we never took. */
2866                 dp = NULL;
2867                 goto error_return;
2868         }
2869
2870         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2871
2872         /*
2873          * Check for directory link count overflow.
2874          */
2875         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2876                 error = XFS_ERROR(EMLINK);
2877                 goto error_return;
2878         }
2879
2880         /*
2881          * Reserve disk quota and the inode.
2882          */
2883         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2884         if (error)
2885                 goto error_return;
2886
2887         if (resblks == 0 &&
2888             (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2889                 goto error_return;
2890         /*
2891          * create the directory inode.
2892          */
2893         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2894                         0, credp, prid, resblks > 0,
2895                 &cdp, NULL);
2896         if (error) {
2897                 if (error == ENOSPC)
2898                         goto error_return;
2899                 goto abort_return;
2900         }
2901         ITRACE(cdp);
2902
2903         /*
2904          * Now we add the directory inode to the transaction.
2905          * We waited until now since xfs_dir_ialloc might start
2906          * a new transaction.  Had we joined the transaction
2907          * earlier, the locks might have gotten released.
2908          */
2909         VN_HOLD(dir_vp);
2910         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2911         dp_joined_to_trans = B_TRUE;
2912
2913         XFS_BMAP_INIT(&free_list, &first_block);
2914
        /* The inode allocation already consumed part of the reservation,
         * so pass only what remains for the directory entry. */
2915         error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2916                                    &first_block, &free_list, resblks ?
2917                                    resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2918         if (error) {
2919                 ASSERT(error != ENOSPC);
2920                 goto error1;
2921         }
2922         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2923
2924         /*
2925          * Bump the in memory version number of the parent directory
2926          * so that other processes accessing it will recognize that
2927          * the directory has changed.
2928          */
2929         dp->i_gen++;
2930
        /* Initialize "." and ".." in the new directory. */
2931         error = xfs_dir_init(tp, cdp, dp);
2932         if (error)
2933                 goto error2;
2934
2935         cdp->i_gen = 1;
2936         error = xfs_bumplink(tp, dp);
2937         if (error)
2938                 goto error2;
2939
2940         cvp = XFS_ITOV(cdp);
2941
2942         created = B_TRUE;
2943
        /* Hand the new vnode to the caller with an extra held reference. */
2944         *vpp = cvp;
2945         IHOLD(cdp);
2946
2947         /*
2948          * Attach the dquots to the new inode and modify the icount incore.
2949          */
2950         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2951
2952         /*
2953          * If this is a synchronous mount, make sure that the
2954          * mkdir transaction goes to disk before returning to
2955          * the user.
2956          */
2957         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2958                 xfs_trans_set_sync(tp);
2959         }
2960
2961         error = xfs_bmap_finish(&tp, &free_list, &committed);
2962         if (error) {
2963                 IRELE(cdp);
2964                 goto error2;
2965         }
2966
2967         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2968         XFS_QM_DQRELE(mp, udqp);
2969         XFS_QM_DQRELE(mp, gdqp);
2970         if (error) {
2971                 IRELE(cdp);
2972         }
2973
2974         /* Fall through to std_return with error = 0 or errno from
2975          * xfs_trans_commit. */
2976
2977 std_return:
        /* POSTCREATE is sent if the directory was created, or if a CREATE
         * event was sent and the operation subsequently failed. */
2978         if ( (created || (error != 0 && dm_event_sent != 0)) &&
2979                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2980                                                 DM_EVENT_POSTCREATE)) {
2981                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2982                                         dir_vp, DM_RIGHT_NULL,
2983                                         created ? XFS_ITOV(cdp):NULL,
2984                                         DM_RIGHT_NULL,
2985                                         dir_name, NULL,
2986                                         dm_di_mode, error, 0);
2987         }
2988         return error;
2989
2990  error2:
2991  error1:
2992         xfs_bmap_cancel(&free_list);
2993  abort_return:
2994         cancel_flags |= XFS_TRANS_ABORT;
2995  error_return:
2996         xfs_trans_cancel(tp, cancel_flags);
2997         XFS_QM_DQRELE(mp, udqp);
2998         XFS_QM_DQRELE(mp, gdqp);
2999
        /* Once dp is joined to the transaction, xfs_trans_cancel drops its
         * lock for us; otherwise (and if dp was locked at all) drop it here. */
3000         if (!dp_joined_to_trans && (dp != NULL)) {
3001                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3002         }
3003
3004         goto std_return;
3005 }
3006
3007
3008 /*
3009  * xfs_rmdir
3010  *
3011  */
3012 STATIC int
3013 xfs_rmdir(
3014         bhv_desc_t              *dir_bdp,
3015         bhv_vname_t             *dentry,
3016         cred_t                  *credp)
3017 {
3018         char                    *name = VNAME(dentry);
3019         xfs_inode_t             *dp;
3020         xfs_inode_t             *cdp;   /* child directory */
3021         xfs_trans_t             *tp;
3022         xfs_mount_t             *mp;
3023         int                     error;
3024         xfs_bmap_free_t         free_list;
3025         xfs_fsblock_t           first_block;
3026         int                     cancel_flags;
3027         int                     committed;
3028         bhv_vnode_t             *dir_vp;
3029         int                     dm_di_mode = S_IFDIR;
3030         int                     last_cdp_link;
3031         int                     namelen;
3032         uint                    resblks;
3033
3034         dir_vp = BHV_TO_VNODE(dir_bdp);
3035         dp = XFS_BHVTOI(dir_bdp);
3036         mp = dp->i_mount;
3037
3038         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3039
3040         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3041                 return XFS_ERROR(EIO);
3042         namelen = VNAMELEN(dentry);
3043
3044         if (!xfs_get_dir_entry(dentry, &cdp)) {
3045                 dm_di_mode = cdp->i_d.di_mode;
3046                 IRELE(cdp);
3047         }
3048
3049         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3050                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3051                                         dir_vp, DM_RIGHT_NULL,
3052                                         NULL, DM_RIGHT_NULL,
3053                                         name, NULL, dm_di_mode, 0, 0);
3054                 if (error)
3055                         return XFS_ERROR(error);
3056         }
3057
3058         /* Return through std_return after this point. */
3059
3060         cdp = NULL;
3061
3062         /*
3063          * We need to get a reference to cdp before we get our log
3064          * reservation.  The reason for this is that we cannot call
3065          * xfs_iget for an inode for which we do not have a reference
3066          * once we've acquired a log reservation.  This is because the
3067          * inode we are trying to get might be in xfs_inactive going
3068          * for a log reservation.  Since we'll have to wait for the
3069          * inactive code to complete before returning from xfs_iget,
3070          * we need to make sure that we don't have log space reserved
3071          * when we call xfs_iget.  Instead we get an unlocked reference
3072          * to the inode before getting our log reservation.
3073          */
3074         error = xfs_get_dir_entry(dentry, &cdp);
3075         if (error) {
3076                 REMOVE_DEBUG_TRACE(__LINE__);
3077                 goto std_return;
3078         }
3079         mp = dp->i_mount;
3080         dm_di_mode = cdp->i_d.di_mode;
3081
3082         /*
3083          * Get the dquots for the inodes.
3084          */
3085         error = XFS_QM_DQATTACH(mp, dp, 0);
3086         if (!error && dp != cdp)
3087                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3088         if (error) {
3089                 IRELE(cdp);
3090                 REMOVE_DEBUG_TRACE(__LINE__);
3091                 goto std_return;
3092         }
3093
3094         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3095         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3096         /*
3097          * We try to get the real space reservation first,
3098          * allowing for directory btree deletion(s) implying
3099          * possible bmap insert(s).  If we can't get the space
3100          * reservation then we use 0 instead, and avoid the bmap
3101          * btree insert(s) in the directory code by, if the bmap
3102          * insert tries to happen, instead trimming the LAST
3103          * block from the directory.
3104          */
3105         resblks = XFS_REMOVE_SPACE_RES(mp);
3106         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3107                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3108         if (error == ENOSPC) {
3109                 resblks = 0;
3110                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3111                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3112         }
3113         if (error) {
3114                 ASSERT(error != ENOSPC);
3115                 cancel_flags = 0;
3116                 IRELE(cdp);
3117                 goto error_return;
3118         }
3119         XFS_BMAP_INIT(&free_list, &first_block);
3120
3121         /*
3122          * Now lock the child directory inode and the parent directory
3123          * inode in the proper order.  This will take care of validating
3124          * that the directory entry for the child directory inode has
3125          * not changed while we were obtaining a log reservation.
3126          */
3127         error = xfs_lock_dir_and_entry(dp, cdp);
3128         if (error) {
3129                 xfs_trans_cancel(tp, cancel_flags);
3130                 IRELE(cdp);
3131                 goto std_return;
3132         }
3133
3134         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3135         if (dp != cdp) {
3136                 /*
3137                  * Only increment the parent directory vnode count if
3138                  * we didn't bump it in looking up cdp.  The only time
3139                  * we don't bump it is when we're looking up ".".
3140                  */
3141                 VN_HOLD(dir_vp);
3142         }
3143
3144         ITRACE(cdp);
3145         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3146
3147         ASSERT(cdp->i_d.di_nlink >= 2);
3148         if (cdp->i_d.di_nlink != 2) {
3149                 error = XFS_ERROR(ENOTEMPTY);
3150                 goto error_return;
3151         }
3152         if (!xfs_dir_isempty(cdp)) {
3153                 error = XFS_ERROR(ENOTEMPTY);
3154                 goto error_return;
3155         }
3156
3157         error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3158                                         &first_block, &free_list, resblks);
3159         if (error)
3160                 goto error1;
3161
3162         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3163
3164         /*
3165          * Bump the in memory generation count on the parent
3166          * directory so that other can know that it has changed.
3167          */
3168         dp->i_gen++;
3169
3170         /*
3171          * Drop the link from cdp's "..".
3172          */
3173         error = xfs_droplink(tp, dp);
3174         if (error) {
3175                 goto error1;
3176         }
3177
3178         /*
3179          * Drop the link from dp to cdp.
3180          */
3181         error = xfs_droplink(tp, cdp);
3182         if (error) {
3183                 goto error1;
3184         }
3185
3186         /*
3187          * Drop the "." link from cdp to self.
3188          */
3189         error = xfs_droplink(tp, cdp);
3190         if (error) {
3191                 goto error1;
3192         }
3193
3194         /* Determine these before committing transaction */
3195         last_cdp_link = (cdp)->i_d.di_nlink==0;
3196
3197         /*
3198          * Take an extra ref on the child vnode so that it
3199          * does not go to xfs_inactive() from within the commit.
3200          */
3201         IHOLD(cdp);
3202
3203         /*
3204          * If this is a synchronous mount, make sure that the
3205          * rmdir transaction goes to disk before returning to
3206          * the user.
3207          */
3208         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3209                 xfs_trans_set_sync(tp);
3210         }
3211
3212         error = xfs_bmap_finish (&tp, &free_list, &committed);
3213         if (error) {
3214                 xfs_bmap_cancel(&free_list);
3215                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3216                                  XFS_TRANS_ABORT));
3217                 IRELE(cdp);
3218                 goto std_return;
3219         }
3220
3221         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3222         if (error) {
3223                 IRELE(cdp);
3224                 goto std_return;
3225         }
3226
3227
3228         /*
3229          * Let interposed file systems know about removed links.
3230          */
3231         bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3232
3233         IRELE(cdp);
3234
3235         /* Fall through to std_return with error = 0 or the errno
3236          * from xfs_trans_commit. */
3237  std_return:
3238         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3239                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3240                                         dir_vp, DM_RIGHT_NULL,
3241                                         NULL, DM_RIGHT_NULL,
3242                                         name, NULL, dm_di_mode,
3243                                         error, 0);
3244         }
3245         return error;
3246
3247  error1:
3248         xfs_bmap_cancel(&free_list);
3249         cancel_flags |= XFS_TRANS_ABORT;
3250         /* FALLTHROUGH */
3251
3252  error_return:
3253         xfs_trans_cancel(tp, cancel_flags);
3254         goto std_return;
3255 }
3256
3257
3258 /*
3259  * Read dp's entries starting at uiop->uio_offset and translate them into
3260  * bufsize bytes worth of struct dirents starting at bufbase.
3261  */
3262 STATIC int
3263 xfs_readdir(
3264         bhv_desc_t      *dir_bdp,
3265         uio_t           *uiop,
3266         cred_t          *credp,
3267         int             *eofp)
3268 {
3269         xfs_inode_t     *dp;
3270         xfs_trans_t     *tp = NULL;
3271         int             error = 0;
3272         uint            lock_mode;
3273
3274         vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3275                                                (inst_t *)__return_address);
3276         dp = XFS_BHVTOI(dir_bdp);
3277
3278         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
3279                 return XFS_ERROR(EIO);
3280
3281         lock_mode = xfs_ilock_map_shared(dp);
3282         error = xfs_dir_getdents(tp, dp, uiop, eofp);
3283         xfs_iunlock_map_shared(dp, lock_mode);
3284         return error;
3285 }
3286
3287
/*
 * xfs_symlink
 *
 * Create the symbolic link named by @dentry in directory @dir_bdp with
 * target @target_path.  Targets short enough for the inode's data fork
 * are stored inline (local format); longer targets are written to newly
 * allocated remote blocks.  On success a referenced vnode for the new
 * link is returned through *vpp.
 *
 * Returns 0 on success or an XFS errno.  Cleanup is via the usual
 * goto-label chain: error2 (drop extra inode ref), error1 (cancel bmap
 * freelist, mark transaction aborted), error_return (cancel transaction,
 * release dquots, unlock dp if it was locked but never joined), all of
 * which fall through to std_return for the DMAPI post-event.
 */
STATIC int
xfs_symlink(
        bhv_desc_t              *dir_bdp,
        bhv_vname_t             *dentry,
        bhv_vattr_t             *vap,
        char                    *target_path,
        bhv_vnode_t             **vpp,
        cred_t                  *credp)
{
        xfs_trans_t             *tp;
        xfs_mount_t             *mp;
        xfs_inode_t             *dp;            /* parent directory inode */
        xfs_inode_t             *ip;            /* new symlink inode */
        int                     error;
        int                     pathlen;        /* strlen of target_path */
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               dp_joined_to_trans;
        bhv_vnode_t             *dir_vp;
        uint                    cancel_flags;
        int                     committed;
        xfs_fileoff_t           first_fsb;
        xfs_filblks_t           fs_blocks;      /* remote blocks needed, or 0 */
        int                     nmaps;
        xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
        xfs_daddr_t             d;
        char                    *cur_chunk;
        int                     byte_cnt;
        int                     n;
        xfs_buf_t               *bp;
        xfs_prid_t              prid;
        struct xfs_dquot        *udqp, *gdqp;
        uint                    resblks;
        char                    *link_name = VNAME(dentry);
        int                     link_namelen;

        *vpp = NULL;
        dir_vp = BHV_TO_VNODE(dir_bdp);
        dp = XFS_BHVTOI(dir_bdp);
        dp_joined_to_trans = B_FALSE;
        error = 0;
        ip = NULL;
        tp = NULL;

        vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);

        mp = dp->i_mount;

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        link_namelen = VNAMELEN(dentry);

        /*
         * Check component lengths of the target path name.
         */
        pathlen = strlen(target_path);
        if (pathlen >= MAXPATHLEN)      /* total string too long */
                return XFS_ERROR(ENAMETOOLONG);
        if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
                int len, total;
                char *path;

                /* Walk the path component by component; 'total' tracks
                 * overall progress, 'len' the current component length. */
                for (total = 0, path = target_path; total < pathlen;) {
                        /*
                         * Skip any slashes.
                         */
                        while(*path == '/') {
                                total++;
                                path++;
                        }

                        /*
                         * Count up to the next slash or end of path.
                         * Error out if the component is bigger than MAXNAMELEN.
                         */
                        for(len = 0; *path != '/' && total < pathlen;total++, path++) {
                                if (++len >= MAXNAMELEN) {
                                        error = ENAMETOOLONG;
                                        return error;
                                }
                        }
                }
        }

        /* DMAPI pre-event: give interposed filesystems a chance to veto. */
        if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
                                        DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
                                        link_name, target_path, 0, 0, 0);
                if (error)
                        return error;
        }

        /* Return through std_return after this point. */

        udqp = gdqp = NULL;
        /* Project id: inherit from parent, honor caller's request, or default. */
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
        else if (vap->va_mask & XFS_AT_PROJID)
                prid = (xfs_prid_t)vap->va_projid;
        else
                prid = (xfs_prid_t)dfltprid;

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
                        current_fsuid(credp), current_fsgid(credp), prid,
                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;

        tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        /*
         * The symlink will fit into the inode data fork?
         * There can't be any attributes so we get the whole variable part.
         */
        if (pathlen <= XFS_LITINO(mp))
                fs_blocks = 0;
        else
                fs_blocks = XFS_B_TO_FSB(mp, pathlen);
        resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
        error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
        /* On ENOSPC retry without a block reservation, but only for the
         * inline case where no remote blocks are required. */
        if (error == ENOSPC && fs_blocks == 0) {
                resblks = 0;
                error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
                                XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
        }
        if (error) {
                cancel_flags = 0;
                /* dp was never locked; clearing it keeps error_return
                 * from unlocking an inode we don't hold. */
                dp = NULL;
                goto error_return;
        }

        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);

        /*
         * Check whether the directory allows new symlinks or not.
         */
        if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
                error = XFS_ERROR(EPERM);
                goto error_return;
        }

        /*
         * Reserve disk quota : blocks and inode.
         */
        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
        if (error)
                goto error_return;

        /*
         * Check for ability to enter directory entry, if no space reserved.
         */
        if (resblks == 0 &&
            (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
                goto error_return;
        /*
         * Initialize the bmap freelist prior to calling either
         * bmapi or the directory create code.
         */
        XFS_BMAP_INIT(&free_list, &first_block);

        /*
         * Allocate an inode for the symlink.
         */
        error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
                               1, 0, credp, prid, resblks > 0, &ip, NULL);
        if (error) {
                if (error == ENOSPC)
                        goto error_return;
                goto error1;
        }
        ITRACE(ip);

        /* Joining dp transfers lock ownership to the transaction, which
         * will drop a vnode reference at commit; take one to compensate. */
        VN_HOLD(dir_vp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        dp_joined_to_trans = B_TRUE;

        /*
         * Also attach the dquot(s) to it, if applicable.
         */
        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);

        if (resblks)
                resblks -= XFS_IALLOC_SPACE_RES(mp);
        /*
         * If the symlink will fit into the inode, write it inline.
         */
        if (pathlen <= XFS_IFORK_DSIZE(ip)) {
                xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
                memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
                ip->i_d.di_size = pathlen;

                /*
                 * The inode was initially created in extent format.
                 */
                ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
                ip->i_df.if_flags |= XFS_IFINLINE;

                ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
                xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);

        } else {
                /* Remote symlink: allocate blocks and copy the target
                 * path into logged buffers, one mapping at a time. */
                first_fsb = 0;
                nmaps = SYMLINK_MAPS;

                error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
                                  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
                                  &first_block, resblks, mval, &nmaps,
                                  &free_list, NULL);
                if (error) {
                        goto error1;
                }

                if (resblks)
                        resblks -= fs_blocks;
                ip->i_d.di_size = pathlen;
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

                cur_chunk = target_path;
                for (n = 0; n < nmaps; n++) {
                        d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
                        byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
                        bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
                                               BTOBB(byte_cnt), 0);
                        ASSERT(bp && !XFS_BUF_GETERROR(bp));
                        /* Last chunk may be shorter than a full mapping. */
                        if (pathlen < byte_cnt) {
                                byte_cnt = pathlen;
                        }
                        pathlen -= byte_cnt;

                        memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
                        cur_chunk += byte_cnt;

                        xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
                }
        }

        /*
         * Create the directory entry for the symlink.
         */
        error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
                                   &first_block, &free_list, resblks);
        if (error)
                goto error1;
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

        /*
         * Bump the in memory version number of the parent directory
         * so that other processes accessing it will recognize that
         * the directory has changed.
         */
        dp->i_gen++;

        /*
         * If this is a synchronous mount, make sure that the
         * symlink transaction goes to disk before returning to
         * the user.
         */
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
                xfs_trans_set_sync(tp);
        }

        /*
         * xfs_trans_commit normally decrements the vnode ref count
         * when it unlocks the inode. Since we want to return the
         * vnode to the caller, we bump the vnode ref count now.
         */
        IHOLD(ip);

        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error) {
                goto error2;
        }
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        /* Fall through to std_return with error = 0 or errno from
         * xfs_trans_commit     */
std_return:
        if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
                             DM_EVENT_POSTSYMLINK)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
                                        dir_vp, DM_RIGHT_NULL,
                                        error ? NULL : XFS_ITOV(ip),
                                        DM_RIGHT_NULL, link_name, target_path,
                                        0, error, 0);
        }

        if (!error) {
                bhv_vnode_t *vp;

                ASSERT(ip);
                vp = XFS_ITOV(ip);
                *vpp = vp;
        }
        return error;

 error2:
        IRELE(ip);
 error1:
        xfs_bmap_cancel(&free_list);
        cancel_flags |= XFS_TRANS_ABORT;
 error_return:
        xfs_trans_cancel(tp, cancel_flags);
        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        /* Only unlock dp if we locked it but never joined it to the
         * transaction; once joined, xfs_trans_cancel handles the lock. */
        if (!dp_joined_to_trans && (dp != NULL)) {
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
        }

        goto std_return;
}
3607
3608
3609 /*
3610  * xfs_fid2
3611  *
3612  * A fid routine that takes a pointer to a previously allocated
3613  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3614  */
3615 STATIC int
3616 xfs_fid2(
3617         bhv_desc_t      *bdp,
3618         fid_t           *fidp)
3619 {
3620         xfs_inode_t     *ip;
3621         xfs_fid2_t      *xfid;
3622
3623         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3624                                        (inst_t *)__return_address);
3625         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3626
3627         xfid = (xfs_fid2_t *)fidp;
3628         ip = XFS_BHVTOI(bdp);
3629         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3630         xfid->fid_pad = 0;
3631         /*
3632          * use memcpy because the inode is a long long and there's no
3633          * assurance that xfid->fid_ino is properly aligned.
3634          */
3635         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3636         xfid->fid_gen = ip->i_d.di_gen;
3637
3638         return 0;
3639 }
3640
3641
3642 /*
3643  * xfs_rwlock
3644  */
3645 int
3646 xfs_rwlock(
3647         bhv_desc_t      *bdp,
3648         bhv_vrwlock_t   locktype)
3649 {
3650         xfs_inode_t     *ip;
3651         bhv_vnode_t     *vp;
3652
3653         vp = BHV_TO_VNODE(bdp);
3654         if (VN_ISDIR(vp))
3655                 return 1;
3656         ip = XFS_BHVTOI(bdp);
3657         if (locktype == VRWLOCK_WRITE) {
3658                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3659         } else if (locktype == VRWLOCK_TRY_READ) {
3660                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3661         } else if (locktype == VRWLOCK_TRY_WRITE) {
3662                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3663         } else {
3664                 ASSERT((locktype == VRWLOCK_READ) ||
3665                        (locktype == VRWLOCK_WRITE_DIRECT));
3666                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3667         }
3668
3669         return 1;
3670 }
3671
3672
3673 /*
3674  * xfs_rwunlock
3675  */
3676 void
3677 xfs_rwunlock(
3678         bhv_desc_t      *bdp,
3679         bhv_vrwlock_t   locktype)
3680 {
3681         xfs_inode_t     *ip;
3682         bhv_vnode_t     *vp;
3683
3684         vp = BHV_TO_VNODE(bdp);
3685         if (VN_ISDIR(vp))
3686                 return;
3687         ip = XFS_BHVTOI(bdp);
3688         if (locktype == VRWLOCK_WRITE) {
3689                 /*
3690                  * In the write case, we may have added a new entry to
3691                  * the reference cache.  This might store a pointer to
3692                  * an inode to be released in this inode.  If it is there,
3693                  * clear the pointer and release the inode after unlocking
3694                  * this one.
3695                  */
3696                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3697         } else {
3698                 ASSERT((locktype == VRWLOCK_READ) ||
3699                        (locktype == VRWLOCK_WRITE_DIRECT));
3700                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3701         }
3702         return;
3703 }
3704
/*
 * xfs_inode_flush
 *
 * Push a dirty inode toward disk.  Behavior depends on @flags:
 *   FLUSH_LOG   - if the inode has been logged past the last log sync
 *                 point, force the log up to its last LSN instead of
 *                 writing the inode itself.
 *   FLUSH_INODE - flush the inode buffer (sync if FLUSH_SYNC is also
 *                 set), backing off with EAGAIN if the inode is pinned
 *                 or its locks are contended.
 *
 * Returns 0, EAGAIN on contention, or an errno from the flush/force.
 */
STATIC int
xfs_inode_flush(
        bhv_desc_t      *bdp,
        int             flags)
{
        xfs_inode_t     *ip;
        xfs_mount_t     *mp;
        xfs_inode_log_item_t *iip;
        int             error = 0;

        ip = XFS_BHVTOI(bdp);
        mp = ip->i_mount;
        iip = ip->i_itemp;

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        /*
         * Bypass inodes which have already been cleaned by
         * the inode flush clustering code inside xfs_iflush
         */
        if ((ip->i_update_core == 0) &&
            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
                return 0;

        if (flags & FLUSH_LOG) {
                if (iip && iip->ili_last_lsn) {
                        xlog_t          *log = mp->m_log;
                        xfs_lsn_t       sync_lsn;
                        int             s, log_flags = XFS_LOG_FORCE;

                        /* Snapshot the last-synced LSN under the grant lock. */
                        s = GRANT_LOCK(log);
                        sync_lsn = log->l_last_sync_lsn;
                        GRANT_UNLOCK(log, s);

                        /* Already on disk via a prior log sync: nothing to do. */
                        if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
                                return 0;

                        if (flags & FLUSH_SYNC)
                                log_flags |= XFS_LOG_SYNC;
                        return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
                }
        }

        /*
         * We make this non-blocking if the inode is contended,
         * return EAGAIN to indicate to the caller that they
         * did not succeed. This prevents the flush path from
         * blocking on inodes inside another operation right
         * now, they get caught later by xfs_sync.
         */
        if (flags & FLUSH_INODE) {
                int     flush_flags;

                if (xfs_ipincount(ip))
                        return EAGAIN;

                if (flags & FLUSH_SYNC) {
                        /* Synchronous: willing to wait for both locks. */
                        xfs_ilock(ip, XFS_ILOCK_SHARED);
                        xfs_iflock(ip);
                } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
                        /* Re-check the pin count after getting the ilock. */
                        if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
                                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                                return EAGAIN;
                        }
                } else {
                        return EAGAIN;
                }

                if (flags & FLUSH_SYNC)
                        flush_flags = XFS_IFLUSH_SYNC;
                else
                        flush_flags = XFS_IFLUSH_ASYNC;

                /* xfs_iflush drops the flush lock; we drop the ilock. */
                error = xfs_iflush(ip, flush_flags);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
        }

        return error;
}
3785
3786 int
3787 xfs_set_dmattrs (
3788         bhv_desc_t      *bdp,
3789         u_int           evmask,
3790         u_int16_t       state,
3791         cred_t          *credp)
3792 {
3793         xfs_inode_t     *ip;
3794         xfs_trans_t     *tp;
3795         xfs_mount_t     *mp;
3796         int             error;
3797
3798         if (!capable(CAP_SYS_ADMIN))
3799                 return XFS_ERROR(EPERM);
3800
3801         ip = XFS_BHVTOI(bdp);
3802         mp = ip->i_mount;
3803
3804         if (XFS_FORCED_SHUTDOWN(mp))
3805                 return XFS_ERROR(EIO);
3806
3807         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3808         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3809         if (error) {
3810                 xfs_trans_cancel(tp, 0);
3811                 return error;
3812         }
3813         xfs_ilock(ip, XFS_ILOCK_EXCL);
3814         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3815
3816         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3817         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3818
3819         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3820         IHOLD(ip);
3821         error = xfs_trans_commit(tp, 0);
3822
3823         return error;
3824 }
3825
/*
 * xfs_reclaim
 *
 * Called when the VFS tears down the vnode.  If the inode is clean the
 * XFS side is reclaimed immediately; otherwise the link between the
 * linux inode and the xfs inode is broken and the xfs inode is queued
 * on the mount's delete list for later cleanup by xfs_finish_reclaim.
 */
STATIC int
xfs_reclaim(
        bhv_desc_t      *bdp)
{
        xfs_inode_t     *ip;
        bhv_vnode_t     *vp;

        vp = BHV_TO_VNODE(bdp);
        ip = XFS_BHVTOI(bdp);

        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

        ASSERT(!VN_MAPPED(vp));

        /* bad inode, get out here ASAP */
        if (VN_BAD(vp)) {
                xfs_ireclaim(ip);
                return 0;
        }

        /* Wait for any in-flight I/O on the vnode to drain first. */
        vn_iowait(vp);

        ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);

        /*
         * Make sure the atime in the XFS inode is correct before freeing the
         * Linux inode.
         */
        xfs_synchronize_atime(ip);

        /*
         * If we have nothing to flush with this inode then complete the
         * teardown now, otherwise break the link between the xfs inode and the
         * linux inode and clean up the xfs inode later. This avoids flushing
         * the inode to disk during the delete operation itself.
         *
         * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
         * first to ensure that xfs_iunpin() will never see an xfs inode
         * that has a linux inode being reclaimed. Synchronisation is provided
         * by the i_flags_lock.
         */
        if (!ip->i_update_core && (ip->i_itemp == NULL)) {
                /* Clean inode: take the locks xfs_finish_reclaim expects
                 * (it is called with locked == 1) and reclaim right away. */
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_iflock(ip);
                return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
        } else {
                xfs_mount_t     *mp = ip->i_mount;

                /* Protect sync and unpin from us */
                XFS_MOUNT_ILOCK(mp);
                spin_lock(&ip->i_flags_lock);
                __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
                /* Detach from the vnode's behavior chain while still
                 * holding i_flags_lock, as described above. */
                vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
                spin_unlock(&ip->i_flags_lock);
                list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
                XFS_MOUNT_IUNLOCK(mp);
        }
        return 0;
}
3885
/*
 * xfs_finish_reclaim
 *
 * Complete the reclaim of an xfs inode: flush it if dirty, then free it
 * with xfs_ireclaim.  @locked indicates the caller already holds both
 * XFS_ILOCK_EXCL and the flush lock.  @sync_mode is passed through to
 * xfs_iflush.  Returns 1 if another thread is already reclaiming the
 * inode (caller should back off), 0 once the inode has been reclaimed.
 */
int
xfs_finish_reclaim(
        xfs_inode_t     *ip,
        int             locked,
        int             sync_mode)
{
        xfs_ihash_t     *ih = ip->i_hash;
        bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
        int             error;

        if (vp && VN_BAD(vp))
                goto reclaim;

        /* The hash lock here protects a thread in xfs_iget_core from
         * racing with us on linking the inode back with a vnode.
         * Once we have the XFS_IRECLAIM flag set it will not touch
         * us.
         */
        write_lock(&ih->ih_lock);
        spin_lock(&ip->i_flags_lock);
        if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
            (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
                /* Someone else is reclaiming it, or xfs_iget_core has
                 * revived it; drop any locks we were handed and bail. */
                spin_unlock(&ip->i_flags_lock);
                write_unlock(&ih->ih_lock);
                if (locked) {
                        xfs_ifunlock(ip);
                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
                }
                return 1;
        }
        __xfs_iflags_set(ip, XFS_IRECLAIM);
        spin_unlock(&ip->i_flags_lock);
        write_unlock(&ih->ih_lock);

        /*
         * If the inode is still dirty, then flush it out.  If the inode
         * is not in the AIL, then it will be OK to flush it delwri as
         * long as xfs_iflush() does not keep any references to the inode.
         * We leave that decision up to xfs_iflush() since it has the
         * knowledge of whether it's OK to simply do a delwri flush of
         * the inode or whether we need to wait until the inode is
         * pulled from the AIL.
         * We get the flush lock regardless, though, just to make sure
         * we don't free it while it is being flushed.
         */
        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                if (!locked) {
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                        xfs_iflock(ip);
                }

                if (ip->i_update_core ||
                    ((ip->i_itemp != NULL) &&
                     (ip->i_itemp->ili_format.ilf_fields != 0))) {
                        error = xfs_iflush(ip, sync_mode);
                        /*
                         * If we hit an error, typically because of filesystem
                         * shutdown, we don't need to let vn_reclaim to know
                         * because we're gonna reclaim the inode anyway.
                         */
                        if (error) {
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                goto reclaim;
                        }
                        xfs_iflock(ip); /* synchronize with xfs_iflush_done */
                }

                ASSERT(ip->i_update_core == 0);
                ASSERT(ip->i_itemp == NULL ||
                       ip->i_itemp->ili_format.ilf_fields == 0);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        } else if (locked) {
                /*
                 * We are not interested in doing an iflush if we're
                 * in the process of shutting down the filesystem forcibly.
                 * So, just reclaim the inode.
                 */
                xfs_ifunlock(ip);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }

 reclaim:
        xfs_ireclaim(ip);
        return 0;
}
3971
/*
 * xfs_finish_reclaim_all
 *
 * Drain the mount's m_del_inodes list, reclaiming each queued inode via
 * xfs_finish_reclaim.  With @noblock set, inodes whose locks cannot be
 * taken immediately (or that are pinned) are skipped rather than waited
 * on.  Always returns 0.
 *
 * Lock protocol: the mount ilock is taken at the top of each outer pass
 * and is released *inside* the list walk just before calling
 * xfs_finish_reclaim (which may sleep).  Only a pass that purges nothing
 * exits the outer loop, so exactly one lock is still held at the final
 * XFS_MOUNT_IUNLOCK below.
 */
int
xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
{
        int             purged;
        xfs_inode_t     *ip, *n;
        int             done = 0;

        while (!done) {
                purged = 0;
                XFS_MOUNT_ILOCK(mp);
                list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
                        if (noblock) {
                                /* Non-blocking mode: skip inodes we cannot
                                 * lock or flush-lock right now. */
                                if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
                                        continue;
                                if (xfs_ipincount(ip) ||
                                    !xfs_iflock_nowait(ip)) {
                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                        continue;
                                }
                        }
                        /* Drop the mount lock before the (possibly
                         * sleeping) reclaim; restart the scan afterwards
                         * since the list may have changed. */
                        XFS_MOUNT_IUNLOCK(mp);
                        if (xfs_finish_reclaim(ip, noblock,
                                        XFS_IFLUSH_DELWRI_ELSE_ASYNC))
                                delay(1);
                        purged = 1;
                        break;
                }

                done = !purged;
        }

        /* Released here: the final pass left the mount lock held. */
        XFS_MOUNT_IUNLOCK(mp);
        return 0;
}
4006
/*
 * xfs_alloc_file_space()
 *      This routine allocates disk space for the given file.
 *
 *      If alloc_type == 0, this request is for an ALLOCSP type
 *      request which will change the file size.  In this case, no
 *      DMAPI event will be generated by the call.  A TRUNCATE event
 *      will be generated later by xfs_setattr.
 *
 *      If alloc_type != 0, this request is for a RESVSP type
 *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
 *      lower block boundary byte address is less than the file's
 *      length.
 *
 * RETURNS:
 *       0 on success
 *      errno on error
 *
 */
STATIC int
xfs_alloc_file_space(
        xfs_inode_t             *ip,
        xfs_off_t               offset,
        xfs_off_t               len,
        int                     alloc_type,
        int                     attr_flags)
{
        xfs_mount_t             *mp = ip->i_mount;
        xfs_off_t               count;
        xfs_filblks_t           allocated_fsb;  /* blocks mapped this pass */
        xfs_filblks_t           allocatesize_fsb;       /* blocks still to map */
        xfs_extlen_t            extsz, temp;
        xfs_fileoff_t           startoffset_fsb;
        xfs_fsblock_t           firstfsb;
        int                     nimaps;
        int                     bmapi_flag;
        int                     quota_flag;
        int                     rt;             /* realtime inode? */
        xfs_trans_t             *tp;
        xfs_bmbt_irec_t         imaps[1], *imapp;
        xfs_bmap_free_t         free_list;
        uint                    qblocks, resblks, resrtextents;
        int                     committed;
        int                     error;

        vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        /* Attach quota dquots before we start reserving blocks. */
        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
                return error;

        if (len <= 0)
                return XFS_ERROR(EINVAL);

        rt = XFS_IS_REALTIME_INODE(ip);
        extsz = xfs_get_extsz_hint(ip);

        count = len;
        imapp = &imaps[0];
        nimaps = 1;
        bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
        startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
        allocatesize_fsb = XFS_B_TO_FSB(mp, count);

        /*      Generate a DMAPI event if needed.       */
        if (alloc_type != 0 && offset < ip->i_size &&
                        (attr_flags&ATTR_DMI) == 0  &&
                        DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
                xfs_off_t           end_dmi_offset;

                /* Only report the part of the range inside current EOF. */
                end_dmi_offset = offset+len;
                if (end_dmi_offset > ip->i_size)
                        end_dmi_offset = ip->i_size;
                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
                        offset, end_dmi_offset - offset,
                        0, NULL);
                if (error)
                        return error;
        }

        /*
         * Allocate file space until done or until there is an error.
         * One transaction per iteration; each pass may map only part
         * of the remaining range.
         */
retry:
        while (allocatesize_fsb && !error) {
                xfs_fileoff_t   s, e;

                /*
                 * Determine space reservations for data/realtime.
                 * With an extent size hint, round [s, e) out to hint
                 * boundaries so the reservation covers the aligned
                 * allocation xfs_bmapi may perform.
                 */
                if (unlikely(extsz)) {
                        s = startoffset_fsb;
                        do_div(s, extsz);
                        s *= extsz;
                        e = startoffset_fsb + allocatesize_fsb;
                        if ((temp = do_mod(startoffset_fsb, extsz)))
                                e += temp;
                        if ((temp = do_mod(e, extsz)))
                                e += extsz - temp;
                } else {
                        s = 0;
                        e = allocatesize_fsb;
                }

                if (unlikely(rt)) {
                        /* Realtime data comes from rt extents; the block
                         * reservation only covers bmap btree changes. */
                        resrtextents = qblocks = (uint)(e - s);
                        resrtextents /= mp->m_sb.sb_rextsize;
                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
                        quota_flag = XFS_QMOPT_RES_RTBLKS;
                } else {
                        resrtextents = 0;
                        resblks = qblocks = \
                                XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
                        quota_flag = XFS_QMOPT_RES_REGBLKS;
                }

                /*
                 * Allocate and setup the transaction.
                 */
                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                error = xfs_trans_reserve(tp, resblks,
                                          XFS_WRITE_LOG_RES(mp), resrtextents,
                                          XFS_TRANS_PERM_LOG_RES,
                                          XFS_WRITE_LOG_COUNT);
                /*
                 * Check for running out of space
                 */
                if (error) {
                        /*
                         * Free the transaction structure.
                         */
                        ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
                                                      qblocks, 0, quota_flag);
                if (error)
                        goto error1;

                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * Issue the xfs_bmapi() call to allocate the blocks
                 */
                XFS_BMAP_INIT(&free_list, &firstfsb);
                error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
                                  allocatesize_fsb, bmapi_flag,
                                  &firstfsb, 0, imapp, &nimaps,
                                  &free_list, NULL);
                if (error) {
                        goto error0;
                }

                /*
                 * Complete the transaction
                 */
                error = xfs_bmap_finish(&tp, &free_list, &committed);
                if (error) {
                        goto error0;
                }

                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                if (error) {
                        break;
                }

                allocated_fsb = imapp->br_blockcount;

                /* No mapping returned means the allocator found no space. */
                if (nimaps == 0) {
                        error = XFS_ERROR(ENOSPC);
                        break;
                }

                /* Advance past what this pass mapped; loop for the rest. */
                startoffset_fsb += allocated_fsb;
                allocatesize_fsb -= allocated_fsb;
        }
dmapi_enospc_check:
        /* Give a DMAPI application a chance to free up space and retry. */
        if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
            DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {

                error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
                                XFS_ITOV(ip), DM_RIGHT_NULL,
                                XFS_ITOV(ip), DM_RIGHT_NULL,
                                NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
                if (error == 0)
                        goto retry;     /* Maybe DMAPI app. has made space */
                /* else fall through with error from XFS_SEND_DATA */
        }

        return error;

error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
        xfs_bmap_cancel(&free_list);
        XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);

error1: /* Just cancel transaction */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        goto dmapi_enospc_check;
}
4213
4214 /*
4215  * Zero file bytes between startoff and endoff inclusive.
4216  * The iolock is held exclusive and no blocks are buffered.
4217  */
4218 STATIC int
4219 xfs_zero_remaining_bytes(
4220         xfs_inode_t             *ip,
4221         xfs_off_t               startoff,
4222         xfs_off_t               endoff)
4223 {
4224         xfs_bmbt_irec_t         imap;
4225         xfs_fileoff_t           offset_fsb;
4226         xfs_off_t               lastoffset;
4227         xfs_off_t               offset;
4228         xfs_buf_t               *bp;
4229         xfs_mount_t             *mp = ip->i_mount;
4230         int                     nimap;
4231         int                     error = 0;
4232
4233         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4234                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4235                                 mp->m_rtdev_targp : mp->m_ddev_targp);
4236
4237         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4238                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4239                 nimap = 1;
4240                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4241                         NULL, 0, &imap, &nimap, NULL, NULL);
4242                 if (error || nimap < 1)
4243                         break;
4244                 ASSERT(imap.br_blockcount >= 1);
4245                 ASSERT(imap.br_startoff == offset_fsb);
4246                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4247                 if (lastoffset > endoff)
4248                         lastoffset = endoff;
4249                 if (imap.br_startblock == HOLESTARTBLOCK)
4250                         continue;
4251                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4252                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4253                         continue;
4254                 XFS_BUF_UNDONE(bp);
4255                 XFS_BUF_UNWRITE(bp);
4256                 XFS_BUF_READ(bp);
4257                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4258                 xfsbdstrat(mp, bp);
4259                 if ((error = xfs_iowait(bp))) {
4260                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4261                                           mp, bp, XFS_BUF_ADDR(bp));
4262                         break;
4263                 }
4264                 memset(XFS_BUF_PTR(bp) +
4265                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4266                       0, lastoffset - offset + 1);
4267                 XFS_BUF_UNDONE(bp);
4268                 XFS_BUF_UNREAD(bp);
4269                 XFS_BUF_WRITE(bp);
4270                 xfsbdstrat(mp, bp);
4271                 if ((error = xfs_iowait(bp))) {
4272                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4273                                           mp, bp, XFS_BUF_ADDR(bp));
4274                         break;
4275                 }
4276         }
4277         xfs_buf_free(bp);
4278         return error;
4279 }
4280
/*
 * xfs_free_file_space()
 *      This routine frees disk space for the given file.
 *
 *      This routine is only called by xfs_change_file_space
 *      for an UNRESVSP type call.
 *
 * RETURNS:
 *       0 on success
 *      errno on error
 *
 */
STATIC int
xfs_free_file_space(
        xfs_inode_t             *ip,
        xfs_off_t               offset,
        xfs_off_t               len,
        int                     attr_flags)
{
        bhv_vnode_t             *vp;
        int                     committed;
        int                     done;
        xfs_off_t               end_dmi_offset;
        xfs_fileoff_t           endoffset_fsb;  /* first fsb NOT freed */
        int                     error;
        xfs_fsblock_t           firstfsb;
        xfs_bmap_free_t         free_list;
        xfs_bmbt_irec_t         imap;
        xfs_off_t               ioffset;
        xfs_extlen_t            mod=0;          /* rt extent remainder */
        xfs_mount_t             *mp;
        int                     nimap;
        uint                    resblks;
        uint                    rounding;
        int                     rt;             /* realtime inode? */
        xfs_fileoff_t           startoffset_fsb;        /* first fsb freed */
        xfs_trans_t             *tp;
        int                     need_iolock = 1;

        vp = XFS_ITOV(ip);
        mp = ip->i_mount;

        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
                return error;

        error = 0;
        if (len <= 0)   /* if nothing being freed */
                return error;
        rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
        /* Round the start up and the end down to whole blocks. */
        startoffset_fsb = XFS_B_TO_FSB(mp, offset);
        end_dmi_offset = offset + len;
        endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);

        /* Tell any interested DMAPI application before touching data. */
        if (offset < ip->i_size &&
            (attr_flags & ATTR_DMI) == 0 &&
            DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
                if (end_dmi_offset > ip->i_size)
                        end_dmi_offset = ip->i_size;
                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
                                offset, end_dmi_offset - offset,
                                AT_DELAY_FLAG(attr_flags), NULL);
                if (error)
                        return error;
        }

        /* ATTR_NOLOCK means the caller already holds the iolock. */
        if (attr_flags & ATTR_NOLOCK)
                need_iolock = 0;
        if (need_iolock) {
                xfs_ilock(ip, XFS_IOLOCK_EXCL);
                vn_iowait(vp);  /* wait for the completion of any pending DIOs */
        }

        /* Toss cached pages over the affected range (page-aligned). */
        rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
        ioffset = offset & ~(rounding - 1);

        if (VN_CACHED(vp) != 0) {
                xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
                                ctooff(offtoct(ioffset)), -1);
                error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
                                -1, FI_REMAPF_LOCKED);
                if (error)
                        goto out_unlock_iolock;
        }

        /*
         * Need to zero the stuff we're not freeing, on disk.
         * If its a realtime file & can't use unwritten extents then we
         * actually need to zero the extent edges.  Otherwise xfs_bunmapi
         * will take care of it for us.
         */
        if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
                /* Push the start up to the next rt extent boundary if
                 * the first allocated block is mid-extent. */
                nimap = 1;
                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
                if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
                        xfs_daddr_t     block;

                        ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
                        block = imap.br_startblock;
                        mod = do_div(block, mp->m_sb.sb_rextsize);
                        if (mod)
                                startoffset_fsb += mp->m_sb.sb_rextsize - mod;
                }
                nimap = 1;
                error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
                if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
                        ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
                        /*
                         * NOTE(review): 'mod' still holds the remainder
                         * computed for the *start* block above (or 0 if
                         * that path was not taken); the increment below
                         * appears to rely on that value to round the end
                         * down to a realtime extent boundary -- confirm
                         * this is the intended alignment calculation.
                         */
                        mod++;
                        if (mod && (mod != mp->m_sb.sb_rextsize))
                                endoffset_fsb -= mod;
                }
        }
        /* done != 0 means there are no whole blocks to bunmapi below. */
        if ((done = (endoffset_fsb <= startoffset_fsb)))
                /*
                 * One contiguous piece to clear
                 */
                error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
        else {
                /*
                 * Some full blocks, possibly two pieces to clear
                 */
                if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
                        error = xfs_zero_remaining_bytes(ip, offset,
                                XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
                if (!error &&
                    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
                        error = xfs_zero_remaining_bytes(ip,
                                XFS_FSB_TO_B(mp, endoffset_fsb),
                                offset + len - 1);
        }

        /*
         * free file space until done or until there is an error
         */
        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
        while (!error && !done) {

                /*
                 * allocate and setup the transaction
                 */
                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                error = xfs_trans_reserve(tp,
                                          resblks,
                                          XFS_WRITE_LOG_RES(mp),
                                          0,
                                          XFS_TRANS_PERM_LOG_RES,
                                          XFS_WRITE_LOG_COUNT);

                /*
                 * check for running out of space
                 */
                if (error) {
                        /*
                         * Free the transaction structure.
                         */
                        ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
                                ip->i_udquot, ip->i_gdquot, resblks, 0,
                                XFS_QMOPT_RES_REGBLKS);
                if (error)
                        goto error1;

                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * issue the bunmapi() call to free the blocks
                 */
                XFS_BMAP_INIT(&free_list, &firstfsb);
                error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
                                  endoffset_fsb - startoffset_fsb,
                                  0, 2, &firstfsb, &free_list, NULL, &done);
                if (error) {
                        goto error0;
                }

                /*
                 * complete the transaction
                 */
                error = xfs_bmap_finish(&tp, &free_list, &committed);
                if (error) {
                        goto error0;
                }

                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }

 out_unlock_iolock:
        if (need_iolock)
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return error;

 error0:
        xfs_bmap_cancel(&free_list);
 error1:
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        /* Drop the iolock here too only if we took it above. */
        xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
                    XFS_ILOCK_EXCL);
        return error;
}
4495
/*
 * xfs_change_file_space()
 *      This routine allocates or frees disk space for the given file.
 *      The user specified parameters are checked for alignment and size
 *      limitations.
 *
 * RETURNS:
 *       0 on success
 *      errno on error
 *
 */
int
xfs_change_file_space(
        bhv_desc_t      *bdp,
        int             cmd,            /* XFS_IOC_{RESVSP,UNRESVSP,ALLOCSP,FREESP}[64] */
        xfs_flock64_t   *bf,            /* user-supplied range (whence/start/len) */
        xfs_off_t       offset,         /* current file position, for SEEK_CUR */
        cred_t          *credp,
        int             attr_flags)
{
        int             clrprealloc;
        int             error;
        xfs_fsize_t     fsize;
        xfs_inode_t     *ip;
        xfs_mount_t     *mp;
        int             setprealloc;
        xfs_off_t       startoffset;
        xfs_off_t       llen;
        xfs_trans_t     *tp;
        bhv_vattr_t     va;
        bhv_vnode_t     *vp;

        vp = BHV_TO_VNODE(bdp);
        vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

        ip = XFS_BHVTOI(bdp);
        mp = ip->i_mount;

        /*
         * must be a regular file and have write permission
         */
        if (!VN_ISREG(vp))
                return XFS_ERROR(EINVAL);

        xfs_ilock(ip, XFS_ILOCK_SHARED);

        if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return error;
        }

        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        /* Normalize l_start to an absolute file offset. */
        switch (bf->l_whence) {
        case 0: /*SEEK_SET*/
                break;
        case 1: /*SEEK_CUR*/
                bf->l_start += offset;
                break;
        case 2: /*SEEK_END*/
                bf->l_start += ip->i_size;
                break;
        default:
                return XFS_ERROR(EINVAL);
        }

        /*
         * llen is length-1 for positive lengths (so start+llen is the
         * last byte); non-positive l_len passes through unchanged for
         * the range checks below.
         */
        llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;

        if (   (bf->l_start < 0)
            || (bf->l_start > XFS_MAXIOFFSET(mp))
            || (bf->l_start + llen < 0)
            || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
                return XFS_ERROR(EINVAL);

        bf->l_whence = 0;

        startoffset = bf->l_start;
        fsize = ip->i_size;

        /*
         * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
         * file space.
         * These calls do NOT zero the data space allocated to the file,
         * nor do they change the file size.
         *
         * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
         * space.
         * These calls cause the new file data to be zeroed and the file
         * size to be changed.
         */
        setprealloc = clrprealloc = 0;

        switch (cmd) {
        case XFS_IOC_RESVSP:
        case XFS_IOC_RESVSP64:
                error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
                                                                1, attr_flags);
                if (error)
                        return error;
                setprealloc = 1;
                break;

        case XFS_IOC_UNRESVSP:
        case XFS_IOC_UNRESVSP64:
                if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
                                                                attr_flags)))
                        return error;
                break;

        case XFS_IOC_ALLOCSP:
        case XFS_IOC_ALLOCSP64:
        case XFS_IOC_FREESP:
        case XFS_IOC_FREESP64:
                /*
                 * ALLOCSP/FREESP set the file size to startoffset: grow
                 * by allocating the gap first, then let xfs_setattr()
                 * change the size (truncating, for a shrink).
                 */
                if (startoffset > fsize) {
                        error = xfs_alloc_file_space(ip, fsize,
                                        startoffset - fsize, 0, attr_flags);
                        if (error)
                                break;
                }

                va.va_mask = XFS_AT_SIZE;
                va.va_size = startoffset;

                error = xfs_setattr(bdp, &va, attr_flags, credp);

                if (error)
                        return error;

                clrprealloc = 1;
                break;

        default:
                ASSERT(0);
                return XFS_ERROR(EINVAL);
        }

        /*
         * update the inode timestamp, mode, and prealloc flag bits
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);

        if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
                                      0, 0, 0))) {
                /* ASSERT(0); */
                xfs_trans_cancel(tp, 0);
                return error;
        }

        xfs_ilock(ip, XFS_ILOCK_EXCL);

        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_ihold(tp, ip);

        if ((attr_flags & ATTR_DMI) == 0) {
                /* Clear the setuid bit, as a write would. */
                ip->i_d.di_mode &= ~S_ISUID;

                /*
                 * Note that we don't have to worry about mandatory
                 * file locking being disabled here because we only
                 * clear the S_ISGID bit if the Group execute bit is
                 * on, but if it was on then mandatory locking wouldn't
                 * have been enabled.
                 */
                if (ip->i_d.di_mode & S_IXGRP)
                        ip->i_d.di_mode &= ~S_ISGID;

                xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        }
        if (setprealloc)
                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
        else if (clrprealloc)
                ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        xfs_trans_set_sync(tp);

        error = xfs_trans_commit(tp, 0);

        xfs_iunlock(ip, XFS_ILOCK_EXCL);

        return error;
}
4678
/*
 * Vnode operations table for XFS: wires the behavior-layer vnode
 * operations to their XFS implementations.  Entries assigned fs_* /
 * fs_noval fall back to the generic default behavior.
 */
bhv_vnodeops_t xfs_vnodeops = {
        BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
        .vop_open               = xfs_open,
        .vop_close              = xfs_close,
        .vop_read               = xfs_read,
#ifdef HAVE_SPLICE
        .vop_splice_read        = xfs_splice_read,
        .vop_splice_write       = xfs_splice_write,
#endif
        .vop_write              = xfs_write,
        .vop_ioctl              = xfs_ioctl,
        .vop_getattr            = xfs_getattr,
        .vop_setattr            = xfs_setattr,
        .vop_access             = xfs_access,
        .vop_lookup             = xfs_lookup,
        .vop_create             = xfs_create,
        .vop_remove             = xfs_remove,
        .vop_link               = xfs_link,
        .vop_rename             = xfs_rename,
        .vop_mkdir              = xfs_mkdir,
        .vop_rmdir              = xfs_rmdir,
        .vop_readdir            = xfs_readdir,
        .vop_symlink            = xfs_symlink,
        .vop_readlink           = xfs_readlink,
        .vop_fsync              = xfs_fsync,
        .vop_inactive           = xfs_inactive,
        .vop_fid2               = xfs_fid2,
        .vop_rwlock             = xfs_rwlock,
        .vop_rwunlock           = xfs_rwunlock,
        .vop_bmap               = xfs_bmap,
        .vop_reclaim            = xfs_reclaim,
        .vop_attr_get           = xfs_attr_get,
        .vop_attr_set           = xfs_attr_set,
        .vop_attr_remove        = xfs_attr_remove,
        .vop_attr_list          = xfs_attr_list,
        .vop_link_removed       = (vop_link_removed_t)fs_noval,
        .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
        .vop_tosspages          = fs_tosspages,
        .vop_flushinval_pages   = fs_flushinval_pages,
        .vop_flush_pages        = fs_flush_pages,
        .vop_release            = xfs_release,
        .vop_iflush             = xfs_inode_flush,
};