]> git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - fs/xfs/xfs_log_recover.c
xfs: Use kmem_free() instead of free()
[karo-tx-linux.git] / fs / xfs / xfs_log_recover.c
index 7681b19aa5dc565a9807005ff3f1d6428a22f016..39797490a1f1996e3f92f51efb532f15a75da8fa 100644 (file)
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -41,7 +41,6 @@
 #include "xfs_extfree_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
-#include "xfs_utils.h"
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2_format.h"
-#include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr_remote.h"
 
+#define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
+
 STATIC int
 xlog_find_zeroed(
        struct xlog     *,
@@ -607,7 +608,7 @@ out:
 
 /*
  * Head is defined to be the point of the log where the next log write
- * write could go.  This means that incomplete LR writes at the end are
+ * could go.  This means that incomplete LR writes at the end are
  * eliminated when calculating the head.  We aren't guaranteed that previous
  * LR have complete transactions.  We only know that a cycle number of
  * current cycle number -1 won't be present in the log if we start writing
@@ -963,6 +964,7 @@ xlog_find_tail(
        }
        if (!found) {
                xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+               xlog_put_bp(bp);
                ASSERT(0);
                return XFS_ERROR(EIO);
        }
@@ -1144,7 +1146,8 @@ xlog_find_zeroed(
                 */
                xfs_warn(log->l_mp,
                        "Log inconsistent or not a log (last==0, first!=1)");
-               return XFS_ERROR(EINVAL);
+               error = XFS_ERROR(EINVAL);
+               goto bp_err;
        }
 
        /* we have a partially zeroed log */
@@ -1582,6 +1585,7 @@ xlog_recover_add_to_trans(
                "bad number of regions (%d) in inode log format",
                                  in_f->ilf_size);
                        ASSERT(0);
+                       kmem_free(ptr);
                        return XFS_ERROR(EIO);
                }
 
@@ -1766,19 +1770,11 @@ xlog_recover_buffer_pass1(
 
 /*
  * Check to see whether the buffer being recovered has a corresponding
- * entry in the buffer cancel record table.  If it does then return 1
- * so that it will be cancelled, otherwise return 0.  If the buffer is
- * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
- * the refcount on the entry in the table and remove it from the table
- * if this is the last reference.
- *
- * We remove the cancel record from the table when we encounter its
- * last occurrence in the log so that if the same buffer is re-used
- * again after its last cancellation we actually replay the changes
- * made at that point.
+ * entry in the buffer cancel record table. If it is, return the cancel
+ * buffer structure to the caller.
  */
-STATIC int
-xlog_check_buffer_cancelled(
+STATIC struct xfs_buf_cancel *
+xlog_peek_buffer_cancelled(
        struct xlog             *log,
        xfs_daddr_t             blkno,
        uint                    len,
@@ -1787,22 +1783,16 @@ xlog_check_buffer_cancelled(
        struct list_head        *bucket;
        struct xfs_buf_cancel   *bcp;
 
-       if (log->l_buf_cancel_table == NULL) {
-               /*
-                * There is nothing in the table built in pass one,
-                * so this buffer must not be cancelled.
-                */
+       if (!log->l_buf_cancel_table) {
+               /* empty table means no cancelled buffers in the log */
                ASSERT(!(flags & XFS_BLF_CANCEL));
-               return 0;
+               return NULL;
        }
 
-       /*
-        * Search for an entry in the  cancel table that matches our buffer.
-        */
        bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
        list_for_each_entry(bcp, bucket, bc_list) {
                if (bcp->bc_blkno == blkno && bcp->bc_len == len)
-                       goto found;
+                       return bcp;
        }
 
        /*
@@ -1810,9 +1800,32 @@ xlog_check_buffer_cancelled(
         * that the buffer is NOT cancelled.
         */
        ASSERT(!(flags & XFS_BLF_CANCEL));
-       return 0;
+       return NULL;
+}
+
+/*
+ * If the buffer is being cancelled then return 1 so that it will be cancelled,
+ * otherwise return 0.  If the buffer is actually a buffer cancel item
+ * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
+ * table and remove it from the table if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its last
+ * occurrence in the log so that if the same buffer is re-used again after its
+ * last cancellation we actually replay the changes made at that point.
+ */
+STATIC int
+xlog_check_buffer_cancelled(
+       struct xlog             *log,
+       xfs_daddr_t             blkno,
+       uint                    len,
+       ushort                  flags)
+{
+       struct xfs_buf_cancel   *bcp;
+
+       bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
+       if (!bcp)
+               return 0;
 
-found:
        /*
         * We've go a match, so return 1 so that the recovery of this buffer
         * is cancelled.  If this buffer is actually a buffer cancel log
@@ -1946,6 +1959,149 @@ xlog_recover_do_inode_buffer(
        return 0;
 }
 
+/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
+ * extract the LSN of the existing object in the buffer based on it's current
+ * magic number.  If we don't recognise the magic number in the buffer, then
+ * return a LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's LSN and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       __uint32_t              magic32;
+       __uint16_t              magic16;
+       __uint16_t              magicda;
+       void                    *blk = bp->b_addr;
+       uuid_t                  *uuid;
+       xfs_lsn_t               lsn = -1;
+
+       /* v4 filesystems always recover immediately */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               goto recover_immediately;
+
+       magic32 = be32_to_cpu(*(__be32 *)blk);
+       switch (magic32) {
+       case XFS_ABTB_CRC_MAGIC:
+       case XFS_ABTC_CRC_MAGIC:
+       case XFS_ABTB_MAGIC:
+       case XFS_ABTC_MAGIC:
+       case XFS_IBT_CRC_MAGIC:
+       case XFS_IBT_MAGIC: {
+               struct xfs_btree_block *btb = blk;
+
+               lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+               uuid = &btb->bb_u.s.bb_uuid;
+               break;
+       }
+       case XFS_BMAP_CRC_MAGIC:
+       case XFS_BMAP_MAGIC: {
+               struct xfs_btree_block *btb = blk;
+
+               lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+               uuid = &btb->bb_u.l.bb_uuid;
+               break;
+       }
+       case XFS_AGF_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+               uuid = &((struct xfs_agf *)blk)->agf_uuid;
+               break;
+       case XFS_AGFL_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+               uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+               break;
+       case XFS_AGI_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+               uuid = &((struct xfs_agi *)blk)->agi_uuid;
+               break;
+       case XFS_SYMLINK_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+               uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+               break;
+       case XFS_DIR3_BLOCK_MAGIC:
+       case XFS_DIR3_DATA_MAGIC:
+       case XFS_DIR3_FREE_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+               uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+               break;
+       case XFS_ATTR3_RMT_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+               uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
+               break;
+       case XFS_SB_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+               uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+               break;
+       default:
+               break;
+       }
+
+       if (lsn != (xfs_lsn_t)-1) {
+               if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+                       goto recover_immediately;
+               return lsn;
+       }
+
+       magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+       switch (magicda) {
+       case XFS_DIR3_LEAF1_MAGIC:
+       case XFS_DIR3_LEAFN_MAGIC:
+       case XFS_DA3_NODE_MAGIC:
+               lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+               uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+               break;
+       default:
+               break;
+       }
+
+       if (lsn != (xfs_lsn_t)-1) {
+               if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+                       goto recover_immediately;
+               return lsn;
+       }
+
+       /*
+        * We do individual object checks on dquot and inode buffers as they
+        * have their own individual LSN records. Also, we could have a stale
+        * buffer here, so we have to at least recognise these buffer types.
+        *
+        * A notd complexity here is inode unlinked list processing - it logs
+        * the inode directly in the buffer, but we don't know which inodes have
+        * been modified, and there is no global buffer LSN. Hence we need to
+        * recover all inode buffer types immediately. This problem will be
+        * fixed by logical logging of the unlinked list modifications.
+        */
+       magic16 = be16_to_cpu(*(__be16 *)blk);
+       switch (magic16) {
+       case XFS_DQUOT_MAGIC:
+       case XFS_DINODE_MAGIC:
+               goto recover_immediately;
+       default:
+               break;
+       }
+
+       /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+       return (xfs_lsn_t)-1;
+
+}
+
 /*
  * Validate the recovered buffer is of the correct type and attach the
  * appropriate buffer operations to them for writeback. Magic numbers are in a
@@ -1955,7 +2111,7 @@ xlog_recover_do_inode_buffer(
  *     inside a struct xfs_da_blkinfo at the start of the buffer.
  */
 static void
-xlog_recovery_validate_buf_type(
+xlog_recover_validate_buf_type(
        struct xfs_mount        *mp,
        struct xfs_buf          *bp,
        xfs_buf_log_format_t    *buf_f)
@@ -2234,7 +2390,7 @@ xlog_recover_do_reg_buffer(
         * just avoid the verification stage for non-crc filesystems
         */
        if (xfs_sb_version_hascrc(&mp->m_sb))
-               xlog_recovery_validate_buf_type(mp, bp, buf_f);
+               xlog_recover_validate_buf_type(mp, bp, buf_f);
 }
 
 /*
@@ -2366,7 +2522,7 @@ xfs_qm_dqcheck(
 
 /*
  * Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
  * Else, treat it as a regular buffer and do recovery.
  */
@@ -2425,20 +2581,22 @@ xlog_recover_do_dquot_buffer(
  * over the log during recovery.  During the first we build a table of
  * those buffers which have been cancelled, and during the second we
  * only replay those buffers which do not have corresponding cancel
- * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
+ * records in the table.  See xlog_recover_buffer_pass[1,2] above
  * for more details on the implementation of the table of cancel records.
  */
 STATIC int
 xlog_recover_buffer_pass2(
        struct xlog                     *log,
        struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
 {
        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        int                     error;
        uint                    buf_flags;
+       xfs_lsn_t               lsn;
 
        /*
         * In this pass we only want to recover all the buffers which have
@@ -2463,10 +2621,17 @@ xlog_recover_buffer_pass2(
        error = bp->b_error;
        if (error) {
                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
-               xfs_buf_relse(bp);
-               return error;
+               goto out_release;
        }
 
+       /*
+        * recover the buffer only if we get an LSN from it and it's less than
+        * the lsn of the transaction we are replaying.
+        */
+       lsn = xlog_recover_get_buf_lsn(mp, bp);
+       if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+               goto out_release;
+
        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
        } else if (buf_f->blf_flags &
@@ -2476,7 +2641,7 @@ xlog_recover_buffer_pass2(
                xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
        }
        if (error)
-               return XFS_ERROR(error);
+               goto out_release;
 
        /*
         * Perform delayed write on the buffer.  Asynchronous writes will be
@@ -2505,15 +2670,93 @@ xlog_recover_buffer_pass2(
                xfs_buf_delwri_queue(bp, buffer_list);
        }
 
+out_release:
        xfs_buf_relse(bp);
        return error;
 }
 
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one.  This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here but need to
+ * gather all the buffers we modify for writeback so we pass the buffer_list
+ * instead for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+       struct xfs_mount        *mp,
+       struct xfs_dinode       *dip,
+       struct xfs_inode_log_format *in_f,
+       struct list_head        *buffer_list)
+{
+       struct xfs_inode        *ip;
+       int                     error;
+
+       ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+       ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+       if (!ip)
+               return ENOMEM;
+
+       /* instantiate the inode */
+       xfs_dinode_from_disk(&ip->i_d, dip);
+       ASSERT(ip->i_d.di_version >= 3);
+
+       error = xfs_iformat_fork(ip, dip);
+       if (error)
+               goto out_free_ip;
+
+
+       if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+               ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+               error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+                                             ip->i_ino, buffer_list);
+               if (error)
+                       goto out_free_ip;
+       }
+
+       if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+               ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+               error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+                                             ip->i_ino, buffer_list);
+               if (error)
+                       goto out_free_ip;
+       }
+
+out_free_ip:
+       xfs_inode_free(ip);
+       return error;
+}
+
 STATIC int
 xlog_recover_inode_pass2(
        struct xlog                     *log,
        struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
 {
        xfs_inode_log_format_t  *in_f;
        xfs_mount_t             *mp = log->l_mp;
@@ -2560,8 +2803,7 @@ xlog_recover_inode_pass2(
        error = bp->b_error;
        if (error) {
                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
-               xfs_buf_relse(bp);
-               goto error;
+               goto out_release;
        }
        ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
        dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2571,25 +2813,40 @@ xlog_recover_inode_pass2(
         * like an inode!
         */
        if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
-               xfs_buf_relse(bp);
                xfs_alert(mp,
        "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
                        __func__, dip, bp, in_f->ilf_ino);
                XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
        dicp = item->ri_buf[1].i_addr;
        if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
-               xfs_buf_relse(bp);
                xfs_alert(mp,
                        "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
                        __func__, item, in_f->ilf_ino);
                XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
                                 XFS_ERRLEVEL_LOW, mp);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
+       }
+
+       /*
+        * If the inode has an LSN in it, recover the inode only if it's less
+        * than the lsn of the transaction we are replaying. Note: we still
+        * need to replay an owner change even though the inode is more recent
+        * than the transaction as there is no guarantee that all the btree
+        * blocks are more recent than this transaction, too.
+        */
+       if (dip->di_version >= 3) {
+               xfs_lsn_t       lsn = be64_to_cpu(dip->di_lsn);
+
+               if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+                       trace_xfs_log_recover_inode_skip(log, in_f);
+                       error = 0;
+                       goto out_owner_change;
+               }
        }
 
        /*
@@ -2610,10 +2867,9 @@ xlog_recover_inode_pass2(
                    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
                        /* do nothing */
                } else {
-                       xfs_buf_relse(bp);
                        trace_xfs_log_recover_inode_skip(log, in_f);
                        error = 0;
-                       goto error;
+                       goto out_release;
                }
        }
 
@@ -2625,13 +2881,12 @@ xlog_recover_inode_pass2(
                    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
                                         XFS_ERRLEVEL_LOW, mp, dicp);
-                       xfs_buf_relse(bp);
                        xfs_alert(mp,
                "%s: Bad regular inode log record, rec ptr 0x%p, "
                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
                                __func__, item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
-                       goto error;
+                       goto out_release;
                }
        } else if (unlikely(S_ISDIR(dicp->di_mode))) {
                if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2639,19 +2894,17 @@ xlog_recover_inode_pass2(
                    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
                                             XFS_ERRLEVEL_LOW, mp, dicp);
-                       xfs_buf_relse(bp);
                        xfs_alert(mp,
                "%s: Bad dir inode log record, rec ptr 0x%p, "
                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
                                __func__, item, dip, bp, in_f->ilf_ino);
                        error = EFSCORRUPTED;
-                       goto error;
+                       goto out_release;
                }
        }
        if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
-               xfs_buf_relse(bp);
                xfs_alert(mp,
        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
        "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2659,29 +2912,27 @@ xlog_recover_inode_pass2(
                        dicp->di_nextents + dicp->di_anextents,
                        dicp->di_nblocks);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
        if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
-               xfs_buf_relse(bp);
                xfs_alert(mp,
        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
        "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
                        item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
        isize = xfs_icdinode_size(dicp->di_version);
        if (unlikely(item->ri_buf[1].i_len > isize)) {
                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
-               xfs_buf_relse(bp);
                xfs_alert(mp,
                        "%s: Bad inode log record length %d, rec ptr 0x%p",
                        __func__, item->ri_buf[1].i_len, item);
                error = EFSCORRUPTED;
-               goto error;
+               goto out_release;
        }
 
        /* The core is in in-core format */
@@ -2707,7 +2958,7 @@ xlog_recover_inode_pass2(
        }
 
        if (in_f->ilf_size == 2)
-               goto write_inode_buffer;
+               goto out_owner_change;
        len = item->ri_buf[2].i_len;
        src = item->ri_buf[2].i_addr;
        ASSERT(in_f->ilf_size <= 4);
@@ -2768,19 +3019,23 @@ xlog_recover_inode_pass2(
                default:
                        xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
                        ASSERT(0);
-                       xfs_buf_relse(bp);
                        error = EIO;
-                       goto error;
+                       goto out_release;
                }
        }
 
-write_inode_buffer:
+out_owner_change:
+       if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+               error = xfs_recover_inode_owner_change(mp, dip, in_f,
+                                                      buffer_list);
        /* re-generate the checksum. */
        xfs_dinode_calc_crc(log->l_mp, dip);
 
        ASSERT(bp->b_target->bt_mount == mp);
        bp->b_iodone = xlog_recover_iodone;
        xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
        xfs_buf_relse(bp);
 error:
        if (need_free)
@@ -2822,7 +3077,8 @@ STATIC int
 xlog_recover_dquot_pass2(
        struct xlog                     *log,
        struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
 {
        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
@@ -2896,6 +3152,19 @@ xlog_recover_dquot_pass2(
                return XFS_ERROR(EIO);
        }
 
+       /*
+        * If the dquot has an LSN in it, recover the dquot only if it's less
+        * than the lsn of the transaction we are replaying.
+        */
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+               xfs_lsn_t       lsn = be64_to_cpu(dqb->dd_lsn);
+
+               if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+                       goto out_release;
+               }
+       }
+
        memcpy(ddq, recddq, item->ri_buf[1].i_len);
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
@@ -2906,9 +3175,10 @@ xlog_recover_dquot_pass2(
        ASSERT(bp->b_target->bt_mount == mp);
        bp->b_iodone = xlog_recover_iodone;
        xfs_buf_delwri_queue(bp, buffer_list);
-       xfs_buf_relse(bp);
 
-       return (0);
+out_release:
+       xfs_buf_relse(bp);
+       return 0;
 }
 
 /*
@@ -3116,6 +3386,106 @@ xlog_recover_free_trans(
        kmem_free(trans);
 }
 
+STATIC void
+xlog_recover_buffer_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_buf_log_format       *buf_f = item->ri_buf[0].i_addr;
+       struct xfs_mount                *mp = log->l_mp;
+
+       if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
+                       buf_f->blf_len, buf_f->blf_flags)) {
+               return;
+       }
+
+       xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
+                               buf_f->blf_len, NULL);
+}
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_inode_log_format     ilf_buf;
+       struct xfs_inode_log_format     *ilfp;
+       struct xfs_mount                *mp = log->l_mp;
+       int                     error;
+
+       if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+               ilfp = item->ri_buf[0].i_addr;
+       } else {
+               ilfp = &ilf_buf;
+               memset(ilfp, 0, sizeof(*ilfp));
+               error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
+               if (error)
+                       return;
+       }
+
+       if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
+               return;
+
+       xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
+                               ilfp->ilf_len, &xfs_inode_buf_ra_ops);
+}
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_mount        *mp = log->l_mp;
+       struct xfs_disk_dquot   *recddq;
+       struct xfs_dq_logformat *dq_f;
+       uint                    type;
+
+
+       if (mp->m_qflags == 0)
+               return;
+
+       recddq = item->ri_buf[1].i_addr;
+       if (recddq == NULL)
+               return;
+       if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+               return;
+
+       type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+       ASSERT(type);
+       if (log->l_quotaoffs_flag & type)
+               return;
+
+       dq_f = item->ri_buf[0].i_addr;
+       ASSERT(dq_f);
+       ASSERT(dq_f->qlf_len == 1);
+
+       xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
+                         XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+}
+
+STATIC void
+xlog_recover_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       switch (ITEM_TYPE(item)) {
+       case XFS_LI_BUF:
+               xlog_recover_buffer_ra_pass2(log, item);
+               break;
+       case XFS_LI_INODE:
+               xlog_recover_inode_ra_pass2(log, item);
+               break;
+       case XFS_LI_DQUOT:
+               xlog_recover_dquot_ra_pass2(log, item);
+               break;
+       case XFS_LI_EFI:
+       case XFS_LI_EFD:
+       case XFS_LI_QUOTAOFF:
+       default:
+               break;
+       }
+}
+
 STATIC int
 xlog_recover_commit_pass1(
        struct xlog                     *log,
@@ -3155,15 +3525,18 @@ xlog_recover_commit_pass2(
 
        switch (ITEM_TYPE(item)) {
        case XFS_LI_BUF:
-               return xlog_recover_buffer_pass2(log, buffer_list, item);
+               return xlog_recover_buffer_pass2(log, buffer_list, item,
+                                                trans->r_lsn);
        case XFS_LI_INODE:
-               return xlog_recover_inode_pass2(log, buffer_list, item);
+               return xlog_recover_inode_pass2(log, buffer_list, item,
+                                                trans->r_lsn);
        case XFS_LI_EFI:
                return xlog_recover_efi_pass2(log, item, trans->r_lsn);
        case XFS_LI_EFD:
                return xlog_recover_efd_pass2(log, item);
        case XFS_LI_DQUOT:
-               return xlog_recover_dquot_pass2(log, buffer_list, item);
+               return xlog_recover_dquot_pass2(log, buffer_list, item,
+                                               trans->r_lsn);
        case XFS_LI_ICREATE:
                return xlog_recover_do_icreate_pass2(log, buffer_list, item);
        case XFS_LI_QUOTAOFF:
@@ -3177,6 +3550,26 @@ xlog_recover_commit_pass2(
        }
 }
 
+STATIC int
+xlog_recover_items_pass2(
+       struct xlog                     *log,
+       struct xlog_recover             *trans,
+       struct list_head                *buffer_list,
+       struct list_head                *item_list)
+{
+       struct xlog_recover_item        *item;
+       int                             error = 0;
+
+       list_for_each_entry(item, item_list, ri_list) {
+               error = xlog_recover_commit_pass2(log, trans,
+                                         buffer_list, item);
+               if (error)
+                       return error;
+       }
+
+       return error;
+}
+
 /*
  * Perform the transaction.
  *
@@ -3189,9 +3582,16 @@ xlog_recover_commit_trans(
        struct xlog_recover     *trans,
        int                     pass)
 {
-       int                     error = 0, error2;
-       xlog_recover_item_t     *item;
-       LIST_HEAD               (buffer_list);
+       int                             error = 0;
+       int                             error2;
+       int                             items_queued = 0;
+       struct xlog_recover_item        *item;
+       struct xlog_recover_item        *next;
+       LIST_HEAD                       (buffer_list);
+       LIST_HEAD                       (ra_list);
+       LIST_HEAD                       (done_list);
+
+       #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
 
        hlist_del(&trans->r_list);
 
@@ -3199,14 +3599,22 @@ xlog_recover_commit_trans(
        if (error)
                return error;
 
-       list_for_each_entry(item, &trans->r_itemq, ri_list) {
+       list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
                switch (pass) {
                case XLOG_RECOVER_PASS1:
                        error = xlog_recover_commit_pass1(log, trans, item);
                        break;
                case XLOG_RECOVER_PASS2:
-                       error = xlog_recover_commit_pass2(log, trans,
-                                                         &buffer_list, item);
+                       xlog_recover_ra_pass2(log, item);
+                       list_move_tail(&item->ri_list, &ra_list);
+                       items_queued++;
+                       if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+                               error = xlog_recover_items_pass2(log, trans,
+                                               &buffer_list, &ra_list);
+                               list_splice_tail_init(&ra_list, &done_list);
+                               items_queued = 0;
+                       }
+
                        break;
                default:
                        ASSERT(0);
@@ -3216,9 +3624,19 @@ xlog_recover_commit_trans(
                        goto out;
        }
 
+out:
+       if (!list_empty(&ra_list)) {
+               if (!error)
+                       error = xlog_recover_items_pass2(log, trans,
+                                       &buffer_list, &ra_list);
+               list_splice_tail_init(&ra_list, &done_list);
+       }
+
+       if (!list_empty(&done_list))
+               list_splice_init(&done_list, &trans->r_itemq);
+
        xlog_recover_free_trans(trans);
 
-out:
        error2 = xfs_buf_delwri_submit(&buffer_list);
        return error ? error : error2;
 }
@@ -3376,7 +3794,7 @@ xlog_recover_process_efi(
        }
 
        tp = xfs_trans_alloc(mp, 0);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error)
                goto abort_error;
        efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
@@ -3482,8 +3900,7 @@ xlog_recover_clear_agi_bucket(
        int             error;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-       error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
-                                 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
        if (error)
                goto out_abort;