git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
bonding: prevent deadlock on slave store with alb mode (v3)
author Neil Horman <nhorman@tuxdriver.com>
Wed, 25 May 2011 08:13:01 +0000 (08:13 +0000)
committer Greg Kroah-Hartman <gregkh@suse.de>
Fri, 3 Jun 2011 01:34:02 +0000 (10:34 +0900)
[ Upstream commit 9fe0617d9b6d21f700ee9e658e1c9fe3be2fb402 ]

This soft lockup was recently reported:

[root@dell-per715-01 ~]# echo +bond5 > /sys/class/net/bonding_masters
[root@dell-per715-01 ~]# echo +eth1 > /sys/class/net/bond5/bonding/slaves
bonding: bond5: doing slave updates when interface is down.
bonding bond5: master_dev is not up in bond_enslave
[root@dell-per715-01 ~]# echo -eth1 > /sys/class/net/bond5/bonding/slaves
bonding: bond5: doing slave updates when interface is down.

BUG: soft lockup - CPU#12 stuck for 60s! [bash:6444]
CPU 12:
Modules linked in: bonding autofs4 hidp rfcomm l2cap bluetooth lockd sunrpc
be2d
Pid: 6444, comm: bash Not tainted 2.6.18-262.el5 #1
RIP: 0010:[<ffffffff80064bf0>]  [<ffffffff80064bf0>]
.text.lock.spinlock+0x26/00
RSP: 0018:ffff810113167da8  EFLAGS: 00000286
RAX: ffff810113167fd8 RBX: ffff810123a47800 RCX: 0000000000ff1025
RDX: 0000000000000000 RSI: ffff810123a47800 RDI: ffff81021b57f6f8
RBP: ffff81021b57f500 R08: 0000000000000000 R09: 000000000000000c
R10: 00000000ffffffff R11: ffff81011d41c000 R12: ffff81021b57f000
R13: 0000000000000000 R14: 0000000000000282 R15: 0000000000000282
FS:  00002b3b41ef3f50(0000) GS:ffff810123b27940(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00002b3b456dd000 CR3: 000000031fc60000 CR4: 00000000000006e0

Call Trace:
 [<ffffffff80064af9>] _spin_lock_bh+0x9/0x14
 [<ffffffff886937d7>] :bonding:tlb_clear_slave+0x22/0xa1
 [<ffffffff8869423c>] :bonding:bond_alb_deinit_slave+0xba/0xf0
 [<ffffffff8868dda6>] :bonding:bond_release+0x1b4/0x450
 [<ffffffff8006457b>] __down_write_nested+0x12/0x92
 [<ffffffff88696ae4>] :bonding:bonding_store_slaves+0x25c/0x2f7
 [<ffffffff801106f7>] sysfs_write_file+0xb9/0xe8
 [<ffffffff80016b87>] vfs_write+0xce/0x174
 [<ffffffff80017450>] sys_write+0x45/0x6e
 [<ffffffff8005d28d>] tracesys+0xd5/0xe0

It occurs because we are able to change the slave configuration of a bond while
the bond interface is down.  The bonding driver initializes some data structures
only after its ndo_open routine is called.  Among them is the initialization of
the alb tx and rx hash locks.  So if we add or remove a slave without first
opening the bond master device, we run the risk of trying to lock/unlock a
spinlock that has garbage for data in it, which results in our above soft lockup.

Note that sometimes this works, because in many cases an unlocked spinlock has
the raw_lock parameter initialized to zero (meaning that the kzalloc of the
net_device private data is equivalent to calling spin_lock_init), but that's not
true in all cases, and we aren't guaranteed that condition, so we need to pass
the relevant spinlocks through the spin_lock_init function.

Fix it by moving the spin_lock_init calls for the tx and rx hashtable locks to
the ndo_init path, so they are ready for use by the bond_store_slaves path.

Change notes:
v2) Based on conversation with Jay and Nicolas it seems that the ability to
enslave devices while the bond master is down should be safe to do.  As such
this is an outlier bug, and so instead we'll just initialize the errant spinlocks
in the init path rather than the open path, solving the problem.  We'll also
remove the warnings about the bond being down during enslave operations, since
it should be safe.

v3) Fix spelling error

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: jtluka@redhat.com
CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: nicolas.2p.debian@gmail.com
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
drivers/net/bonding/bond_alb.c
drivers/net/bonding/bond_main.c
drivers/net/bonding/bond_sysfs.c

index 5c6fba802f2b759561ee6c9c0c44bd3ea761b2c6..11ebd8f353caa8e36c71de37321de09153c7cdbd 100644 (file)
@@ -163,8 +163,6 @@ static int tlb_initialize(struct bonding *bond)
        struct tlb_client_info *new_hashtbl;
        int i;
 
-       spin_lock_init(&(bond_info->tx_hashtbl_lock));
-
        new_hashtbl = kzalloc(size, GFP_KERNEL);
        if (!new_hashtbl) {
                pr_err("%s: Error: Failed to allocate TLB hash table\n",
@@ -764,8 +762,6 @@ static int rlb_initialize(struct bonding *bond)
        int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);
        int i;
 
-       spin_lock_init(&(bond_info->rx_hashtbl_lock));
-
        new_hashtbl = kmalloc(size, GFP_KERNEL);
        if (!new_hashtbl) {
                pr_err("%s: Error: Failed to allocate RLB hash table\n",
index 163e0b06eaa5d1aba0dda6aef0765016020e2b1a..ac8dce5545a6a2092dfc2355d01b0d97a0619cab 100644 (file)
@@ -1441,12 +1441,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
                           bond_dev->name, slave_dev->name);
        }
 
-       /* bond must be initialized by bond_open() before enslaving */
-       if (!(bond_dev->flags & IFF_UP)) {
-               pr_warning("%s: master_dev is not up in bond_enslave\n",
-                          bond_dev->name);
-       }
-
        /* already enslaved */
        if (slave_dev->flags & IFF_SLAVE) {
                pr_debug("Error, Device was already enslaved\n");
@@ -5157,9 +5151,19 @@ static int bond_init(struct net_device *bond_dev)
 {
        struct bonding *bond = netdev_priv(bond_dev);
        struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);
+       struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 
        pr_debug("Begin bond_init for %s\n", bond_dev->name);
 
+       /*
+        * Initialize locks that may be required during
+        * en/deslave operations.  All of the bond_open work
+        * (of which this is part) should really be moved to
+        * a phase prior to dev_open
+        */
+       spin_lock_init(&(bond_info->tx_hashtbl_lock));
+       spin_lock_init(&(bond_info->rx_hashtbl_lock));
+
        bond->wq = create_singlethread_workqueue(bond_dev->name);
        if (!bond->wq)
                return -ENOMEM;
index 8fd0174c5380499f5a87178ca2a6024c4cc124e9..ddc316500fa8a0394beb70ba741f28ab79e56000 100644 (file)
@@ -224,12 +224,6 @@ static ssize_t bonding_store_slaves(struct device *d,
        struct net_device *dev;
        struct bonding *bond = to_bond(d);
 
-       /* Quick sanity check -- is the bond interface up? */
-       if (!(bond->dev->flags & IFF_UP)) {
-               pr_warning("%s: doing slave updates when interface is down.\n",
-                          bond->dev->name);
-       }
-
        if (!rtnl_trylock())
                return restart_syscall();