git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - drivers/net/ethernet/mellanox/mlx5/core/health.c
Merge remote-tracking branch 'arm-soc/for-next'
[karo-tx-linux.git] / drivers / net / ethernet / mellanox / mlx5 / core / health.c
index 9b81e1ceb8dec8454506a5e70f39254ba16a121e..f5deb642d0d6c0e693805e34234cbb0cb2e5ccc6 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/vmalloc.h>
+#include <linux/hardirq.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
@@ -57,6 +58,91 @@ enum {
        MLX5_HEALTH_SYNDR_HIGH_TEMP             = 0x10
 };
 
+enum { /* values compared against the 2-bit result of get_nic_interface() */
+       MLX5_NIC_IFC_FULL               = 0, /* logged as "full driver" */
+       MLX5_NIC_IFC_DISABLED           = 1, /* in_fatal() reports this as fatal */
+       MLX5_NIC_IFC_NO_DRAM_NIC        = 2 /* logged as "no dram nic" */
+};
+
+static u8 get_nic_interface(struct mlx5_core_dev *dev)
+{
+       return (ioread32be(&dev->iseg->cmdq_addr_l_sz) & 0x300) >> 8; /* bits 9:8 */
+}
+
+/* Run the completion handler for every slot whose cmd.bitmask bit is clear. */
+static void trigger_cmd_completions(struct mlx5_core_dev *dev)
+{
+       unsigned long flags;
+       u64 vector;
+
+       /* wait for pending handlers to complete */
+       synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
+
+       /* Build the slot mask from a 64-bit one: "1ul << 32" is undefined
+        * when unsigned long is 32 bits wide and cmd.log_sz is 5.
+        */
+       spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
+       vector = ~dev->cmd.bitmask & ((1ull << (1 << dev->cmd.log_sz)) - 1);
+       spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+       if (!vector)
+               return;
+
+       vector |= MLX5_TRIGGERED_CMD_COMP;
+       mlx5_core_dbg(dev, "vector 0x%llx\n", vector);
+       mlx5_cmd_comp_handler(dev, vector);
+}
+
+/* Returns 1 if the NIC interface is disabled or the health buffer's
+ * fw_ver word reads back as all ones, 0 otherwise.
+ */
+static int in_fatal(struct mlx5_core_dev *dev)
+{
+       struct health_buffer __iomem *hbuf = dev->priv.health.health;
+
+       if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED)
+               return 1;
+
+       /* fw_ver is only read when the interface is not disabled */
+       return ioread32be(&hbuf->fw_ver) == 0xffffffff;
+}
+
+/* Unless already failed: set the error state if warranted, send SYS_ERROR. */
+void mlx5_enter_error_state(struct mlx5_core_dev *dev)
+{
+       if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+               return;
+
+       mlx5_core_err(dev, "start\n");
+       if (pci_channel_offline(dev->pdev) || in_fatal(dev))
+               dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+       mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
+       mlx5_core_err(dev, "end\n");
+}
+
+/* Report the NIC interface state, then disable the device regardless. */
+static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
+{
+       u8 ifc_state = get_nic_interface(dev);
+
+       switch (ifc_state) {
+       case MLX5_NIC_IFC_DISABLED:
+               mlx5_core_warn(dev, "starting teardown\n");
+               break;
+       case MLX5_NIC_IFC_FULL:
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n");
+               break;
+       case MLX5_NIC_IFC_NO_DRAM_NIC:
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
+               break;
+       default:
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
+                              ifc_state);
+               break;
+       }
+
+       mlx5_disable_device(dev);
+}
+
 static void health_care(struct work_struct *work)
 {
        struct mlx5_core_health *health;
@@ -67,6 +153,7 @@ static void health_care(struct work_struct *work)
        priv = container_of(health, struct mlx5_priv, health);
        dev = container_of(priv, struct mlx5_core_dev, priv);
        mlx5_core_warn(dev, "handling bad device here\n");
+       mlx5_handle_bad_state(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -122,6 +209,10 @@ static void print_health_info(struct mlx5_core_dev *dev)
        u32 fw;
        int i;
 
+       /* If the syndrome is 0, the device is OK and no need to print buffer */
+       if (!ioread8(&h->synd))
+               return;
+
        for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
                dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
 
@@ -136,13 +227,29 @@ static void print_health_info(struct mlx5_core_dev *dev)
        dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
 }
 
+/* Next poll time: jiffies + MLX5_HEALTH_POLL_INTERVAL plus a random
+ * offset in the range [0, HZ).
+ */
+static unsigned long get_next_poll_jiffies(void)
+{
+       unsigned long jitter;
+
+       get_random_bytes(&jitter, sizeof(jitter));
+       return (jitter % HZ) + (jiffies + MLX5_HEALTH_POLL_INTERVAL);
+}
+
 static void poll_health(unsigned long data)
 {
        struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
        struct mlx5_core_health *health = &dev->priv.health;
-       unsigned long next;
        u32 count;
 
+       if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               trigger_cmd_completions(dev);
+               mod_timer(&health->timer, get_next_poll_jiffies());
+               return;
+       }
+
        count = ioread32be(health->health_counter);
        if (count == health->prev)
                ++health->miss_counter;
@@ -151,14 +258,16 @@ static void poll_health(unsigned long data)
 
        health->prev = count;
        if (health->miss_counter == MAX_MISSES) {
-               mlx5_core_err(dev, "device's health compromised\n");
+               dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
                print_health_info(dev);
-               queue_work(health->wq, &health->work);
        } else {
-               get_random_bytes(&next, sizeof(next));
-               next %= HZ;
-               next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
-               mod_timer(&health->timer, next);
+               mod_timer(&health->timer, get_next_poll_jiffies());
+       }
+
+       if (in_fatal(dev) && !health->sick) {
+               health->sick = true;
+               print_health_info(dev);
+               queue_work(health->wq, &health->work);
        }
 }