]> git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - fs/xfs/xfs_log_recover.c
xfs: Use kmem_free() instead of free()
[karo-tx-linux.git] / fs / xfs / xfs_log_recover.c
index 7c0c1fdc728b4ff6e18da1a4f0b46edde2337e4e..39797490a1f1996e3f92f51efb532f15a75da8fa 100644 (file)
@@ -1585,6 +1585,7 @@ xlog_recover_add_to_trans(
                "bad number of regions (%d) in inode log format",
                                  in_f->ilf_size);
                        ASSERT(0);
+                       kmem_free(ptr);
                        return XFS_ERROR(EIO);
                }
 
@@ -1970,6 +1971,13 @@ xlog_recover_do_inode_buffer(
  * magic number.  If we don't recognise the magic number in the buffer, then
  * return a LSN of -1 so that the caller knows it was an unrecognised block and
  * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's LSN and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
  */
 static xfs_lsn_t
 xlog_recover_get_buf_lsn(
@@ -1980,6 +1988,8 @@ xlog_recover_get_buf_lsn(
        __uint16_t              magic16;
        __uint16_t              magicda;
        void                    *blk = bp->b_addr;
+       uuid_t                  *uuid;
+       xfs_lsn_t               lsn = -1;
 
        /* v4 filesystems always recover immediately */
        if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -1992,43 +2002,79 @@ xlog_recover_get_buf_lsn(
        case XFS_ABTB_MAGIC:
        case XFS_ABTC_MAGIC:
        case XFS_IBT_CRC_MAGIC:
-       case XFS_IBT_MAGIC:
-               return be64_to_cpu(
-                               ((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn);
+       case XFS_IBT_MAGIC: {
+               struct xfs_btree_block *btb = blk;
+
+               lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+               uuid = &btb->bb_u.s.bb_uuid;
+               break;
+       }
        case XFS_BMAP_CRC_MAGIC:
-       case XFS_BMAP_MAGIC:
-               return be64_to_cpu(
-                               ((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn);
+       case XFS_BMAP_MAGIC: {
+               struct xfs_btree_block *btb = blk;
+
+               lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+               uuid = &btb->bb_u.l.bb_uuid;
+               break;
+       }
        case XFS_AGF_MAGIC:
-               return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+               lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+               uuid = &((struct xfs_agf *)blk)->agf_uuid;
+               break;
        case XFS_AGFL_MAGIC:
-               return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+               lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+               uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+               break;
        case XFS_AGI_MAGIC:
-               return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+               lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+               uuid = &((struct xfs_agi *)blk)->agi_uuid;
+               break;
        case XFS_SYMLINK_MAGIC:
-               return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+               lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+               uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+               break;
        case XFS_DIR3_BLOCK_MAGIC:
        case XFS_DIR3_DATA_MAGIC:
        case XFS_DIR3_FREE_MAGIC:
-               return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+               lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+               uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+               break;
        case XFS_ATTR3_RMT_MAGIC:
-               return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+               lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+               uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
+               break;
        case XFS_SB_MAGIC:
-               return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn);
+               lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+               uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+               break;
        default:
                break;
        }
 
+       if (lsn != (xfs_lsn_t)-1) {
+               if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+                       goto recover_immediately;
+               return lsn;
+       }
+
        magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
        switch (magicda) {
        case XFS_DIR3_LEAF1_MAGIC:
        case XFS_DIR3_LEAFN_MAGIC:
        case XFS_DA3_NODE_MAGIC:
-               return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+               lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+               uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+               break;
        default:
                break;
        }
 
+       if (lsn != (xfs_lsn_t)-1) {
+               if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+                       goto recover_immediately;
+               return lsn;
+       }
+
        /*
         * We do individual object checks on dquot and inode buffers as they
         * have their own individual LSN records. Also, we could have a stale
@@ -2629,6 +2675,82 @@ out_release:
        return error;
 }
 
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one.  This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here but need to
+ * gather all the buffers we modify for writeback so we pass the buffer_list
+ * instead for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+       struct xfs_mount        *mp,
+       struct xfs_dinode       *dip,
+       struct xfs_inode_log_format *in_f,
+       struct list_head        *buffer_list)
+{
+       struct xfs_inode        *ip;
+       int                     error;
+
+       ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+       ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+       if (!ip)
+               return ENOMEM;
+
+       /* instantiate the inode */
+       xfs_dinode_from_disk(&ip->i_d, dip);
+       ASSERT(ip->i_d.di_version >= 3);
+
+       error = xfs_iformat_fork(ip, dip);
+       if (error)
+               goto out_free_ip;
+
+
+       if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+               ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+               error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+                                             ip->i_ino, buffer_list);
+               if (error)
+                       goto out_free_ip;
+       }
+
+       if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+               ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+               error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+                                             ip->i_ino, buffer_list);
+               if (error)
+                       goto out_free_ip;
+       }
+
+out_free_ip:
+       xfs_inode_free(ip);
+       return error;
+}
+
 STATIC int
 xlog_recover_inode_pass2(
        struct xlog                     *log,
@@ -2681,8 +2803,7 @@ xlog_recover_inode_pass2(
        error = bp->b_error;
        if (error) {
                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
-               xfs_buf_relse(bp);
-               goto error;
+               goto out_release;
        }
        ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
        dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2692,30 +2813,31 @@ xlog_recover_inode_pass2(
         * like an inode!
         */
        if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
-               xfs_buf_relse(bp);
                xfs_alert(mp,
        "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
                        __func__, dip, bp, in_f->ilf_ino);
                XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
        dicp = item->ri_buf[1].i_addr;
        if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
-               xfs_buf_relse(bp);
                xfs_alert(mp,
                        "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
                        __func__, item, in_f->ilf_ino);
                XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
 
        /*
         * If the inode has an LSN in it, recover the inode only if it's less
-        * than the lsn of the transaction we are replaying.
+        * than the lsn of the transaction we are replaying. Note: we still
+        * need to replay an owner change even though the inode is more recent
+        * than the transaction as there is no guarantee that all the btree
+        * blocks are more recent than this transaction, too.
         */
        if (dip->di_version >= 3) {
                xfs_lsn_t       lsn = be64_to_cpu(dip->di_lsn);
@@ -2723,7 +2845,7 @@ xlog_recover_inode_pass2(
                if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
                        trace_xfs_log_recover_inode_skip(log, in_f);
                        error = 0;
-                       goto out_release;
+                       goto out_owner_change;
                }
        }
 
@@ -2745,10 +2867,9 @@ xlog_recover_inode_pass2(
                    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
                        /* do nothing */
                } else {
-                       xfs_buf_relse(bp);
                        trace_xfs_log_recover_inode_skip(log, in_f);
                        error = 0;
-                       goto error;
+                       goto out_release;
                }
        }
 
@@ -2760,13 +2881,12 @@ xlog_recover_inode_pass2(
                    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
                                         XFS_ERRLEVEL_LOW, mp, dicp);
-                       xfs_buf_relse(bp);
                        xfs_alert(mp,
                "%s: Bad regular inode log record, rec ptr 0x%p, "
                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
                                __func__, item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
-                       goto error;
+                       goto out_release;
                }
        } else if (unlikely(S_ISDIR(dicp->di_mode))) {
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2774,19 +2894,17 @@ xlog_recover_inode_pass2(
                    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
                                             XFS_ERRLEVEL_LOW, mp, dicp);
-                       xfs_buf_relse(bp);
                        xfs_alert(mp,
                "%s: Bad dir inode log record, rec ptr 0x%p, "
                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
                                __func__, item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
-                       goto error;
+                       goto out_release;
                }
        }
        if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
-               xfs_buf_relse(bp);
                xfs_alert(mp,
        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
        "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2794,29 +2912,27 @@ xlog_recover_inode_pass2(
                        dicp->di_nextents + dicp->di_anextents,
                        dicp->di_nblocks);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
-               xfs_buf_relse(bp);
                xfs_alert(mp,
        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
        "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
                        item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
        isize = xfs_icdinode_size(dicp->di_version);
        if (unlikely(item->ri_buf[1].i_len > isize)) {
                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
-               xfs_buf_relse(bp);
                xfs_alert(mp,
                        "%s: Bad inode log record length %d, rec ptr 0x%p",
                        __func__, item->ri_buf[1].i_len, item);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
 
        /* The core is in in-core format */
@@ -2842,7 +2958,7 @@ xlog_recover_inode_pass2(
        }
 
        if (in_f->ilf_size == 2)
-               goto write_inode_buffer;
+               goto out_owner_change;
        len = item->ri_buf[2].i_len;
        src = item->ri_buf[2].i_addr;
        ASSERT(in_f->ilf_size <= 4);
@@ -2903,13 +3019,15 @@ xlog_recover_inode_pass2(
                default:
                        xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
                        ASSERT(0);
-                       xfs_buf_relse(bp);
                        error = EIO;
-                       goto error;
+                       goto out_release;
                }
        }
 
-write_inode_buffer:
+out_owner_change:
+       if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+               error = xfs_recover_inode_owner_change(mp, dip, in_f,
+                                                      buffer_list);
        /* re-generate the checksum. */
        xfs_dinode_calc_crc(log->l_mp, dip);