]> git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
ext4: Fix the race between read_inode_bitmap() and ext4_new_inode()
authorAneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Tue, 17 Feb 2009 15:58:34 +0000 (10:58 -0500)
committerGreg Kroah-Hartman <gregkh@suse.de>
Fri, 20 Feb 2009 22:37:11 +0000 (14:37 -0800)
(cherry picked from commit 393418676a7602e1d7d3f6e560159c65c8cbd50e)

We need to make sure we update the inode bitmap and clear
EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held, since
ext4_read_inode_bitmap() looks at EXT4_BG_INODE_UNINIT to decide
whether to initialize the inode bitmap each time it is called.
(introduced by commit c806e68f.)

ext4_read_inode_bitmap does:

spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);

and ext4_new_inode does
if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
                   ino, inode_bitmap_bh->b_data))
   ......
   ...
spin_lock(sb_bgl_lock(sbi, group));

gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
i.e., on allocation we update the bitmap then we take the sb_bgl_lock
and clear the EXT4_BG_INODE_UNINIT flag. What can happen is a
parallel ext4_read_inode_bitmap can zero out the bitmap in between
the above ext4_set_bit_atomic and spin_lock(sb_bg_lock..)

The race results in below user visible errors
EXT4-fs error (device sdb1): ext4_free_inode: bit already cleared for inode 168449
EXT4-fs warning (device sdb1): ext4_unlink: Deleting nonexistent file ...
EXT4-fs warning (device sdb1): ext4_rmdir: empty directory has too many links ...
ls: /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71: Stale NFS file handle

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
fs/ext4/ialloc.c

index 98059248430f4e6541ea662d4b341f2a612b1b45..474214e9e1246fab5ed1515eca5193e1402ae4c6 100644 (file)
@@ -566,6 +566,77 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
        return -1;
 }
 
+/*
+ * claim the inode from the inode bitmap. If the group
+ * is uninit we need to take the groups's sb_bgl_lock
+ * and clear the uninit flag. The inode bitmap update
+ * and group desc uninit flag clear should be done
+ * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * doesn't race with the ext4_claim_inode
+ */
+static int ext4_claim_inode(struct super_block *sb,
+                       struct buffer_head *inode_bitmap_bh,
+                       unsigned long ino, ext4_group_t group, int mode)
+{
+       int free = 0, retval = 0;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+       spin_lock(sb_bgl_lock(sbi, group));
+       if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+               /* not a free inode */
+               retval = 1;
+               goto err_ret;
+       }
+       ino++;
+       if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+                       ino > EXT4_INODES_PER_GROUP(sb)) {
+               spin_unlock(sb_bgl_lock(sbi, group));
+               ext4_error(sb, __func__,
+                          "reserved inode or inode > inodes count - "
+                          "block_group = %lu, inode=%lu", group,
+                          ino + group * EXT4_INODES_PER_GROUP(sb));
+               return 1;
+       }
+       /* If we didn't allocate from within the initialized part of the inode
+        * table then we need to initialize up to this inode. */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+
+               if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+                       gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+                       /* When marking the block group with
+                        * ~EXT4_BG_INODE_UNINIT we don't want to depend
+                        * on the value of bg_itable_unused even though
+                        * mke2fs could have initialized the same for us.
+                        * Instead we calculated the value below
+                        */
+
+                       free = 0;
+               } else {
+                       free = EXT4_INODES_PER_GROUP(sb) -
+                               le16_to_cpu(gdp->bg_itable_unused);
+               }
+
+               /*
+                * Check the relative inode number against the last used
+                * relative inode number in this group. if it is greater
+                * we need to  update the bg_itable_unused count
+                *
+                */
+               if (ino > free)
+                       gdp->bg_itable_unused =
+                               cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+       }
+       le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+       if (S_ISDIR(mode)) {
+               le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+       }
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+err_ret:
+       spin_unlock(sb_bgl_lock(sbi, group));
+       return retval;
+}
+
 /*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
@@ -649,8 +720,12 @@ repeat_in_this_group:
                        if (err)
                                goto fail;
 
-                       if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
-                                               ino, bitmap_bh->b_data)) {
+                       BUFFER_TRACE(bh2, "get_write_access");
+                       err = ext4_journal_get_write_access(handle, bh2);
+                       if (err)
+                               goto fail;
+                       if (!ext4_claim_inode(sb, bitmap_bh,
+                                               ino, group, mode)) {
                                /* we won it */
                                BUFFER_TRACE(bitmap_bh,
                                        "call ext4_journal_dirty_metadata");
@@ -658,10 +733,13 @@ repeat_in_this_group:
                                                                bitmap_bh);
                                if (err)
                                        goto fail;
+                               /* zero bit is inode number 1*/
+                               ino++;
                                goto got;
                        }
                        /* we lost it */
                        jbd2_journal_release_buffer(handle, bitmap_bh);
+                       jbd2_journal_release_buffer(handle, bh2);
 
                        if (++ino < EXT4_INODES_PER_GROUP(sb))
                                goto repeat_in_this_group;
@@ -681,21 +759,6 @@ repeat_in_this_group:
        goto out;
 
 got:
-       ino++;
-       if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-           ino > EXT4_INODES_PER_GROUP(sb)) {
-               ext4_error(sb, __func__,
-                          "reserved inode or inode > inodes count - "
-                          "block_group = %lu, inode=%lu", group,
-                          ino + group * EXT4_INODES_PER_GROUP(sb));
-               err = -EIO;
-               goto fail;
-       }
-
-       BUFFER_TRACE(bh2, "get_write_access");
-       err = ext4_journal_get_write_access(handle, bh2);
-       if (err) goto fail;
-
        /* We may have to initialize the block bitmap if it isn't already */
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -730,47 +793,10 @@ got:
                if (err)
                        goto fail;
        }
-
-       spin_lock(sb_bgl_lock(sbi, group));
-       /* If we didn't allocate from within the initialized part of the inode
-        * table then we need to initialize up to this inode. */
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-               if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-                       gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-
-                       /* When marking the block group with
-                        * ~EXT4_BG_INODE_UNINIT we don't want to depend
-                        * on the value of bg_itable_unused even though
-                        * mke2fs could have initialized the same for us.
-                        * Instead we calculated the value below
-                        */
-
-                       free = 0;
-               } else {
-                       free = EXT4_INODES_PER_GROUP(sb) -
-                               le16_to_cpu(gdp->bg_itable_unused);
-               }
-
-               /*
-                * Check the relative inode number against the last used
-                * relative inode number in this group. if it is greater
-                * we need to  update the bg_itable_unused count
-                *
-                */
-               if (ino > free)
-                       gdp->bg_itable_unused =
-                               cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
-       }
-
-       le16_add_cpu(&gdp->bg_free_inodes_count, -1);
-       if (S_ISDIR(mode)) {
-               le16_add_cpu(&gdp->bg_used_dirs_count, 1);
-       }
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-       spin_unlock(sb_bgl_lock(sbi, group));
-       BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
+       BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
        err = ext4_journal_dirty_metadata(handle, bh2);
-       if (err) goto fail;
+       if (err)
+               goto fail;
 
        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))