nvme-pci: factor out the cqe reading mechanics from __nvme_process_cq
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d52701df72457d0fa2b85a168c500fd022b8b717..d309b6c90511ce0098d05560964e76c674f11f4d 100644
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-mq-pci.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
 #include <linux/dmi.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/idr.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
-#include <linux/kdev_t.h>
-#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/moduleparam.h>
 #include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/poison.h>
-#include <linux/ptrace.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/t10-pi.h>
 #include <linux/timer.h>
 #include <linux/types.h>
@@ -66,12 +53,14 @@ static bool use_cmb_sqes = true;
 module_param(use_cmb_sqes, bool, 0644);
 MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
 
-static struct workqueue_struct *nvme_workq;
+static unsigned int max_host_mem_size_mb = 128;
+module_param(max_host_mem_size_mb, uint, 0444);
+MODULE_PARM_DESC(max_host_mem_size_mb,
+       "Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
 
 struct nvme_dev;
 struct nvme_queue;
 
-static int nvme_reset(struct nvme_dev *dev);
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
@@ -92,9 +81,8 @@ struct nvme_dev {
        int q_depth;
        u32 db_stride;
        void __iomem *bar;
-       struct work_struct reset_work;
+       unsigned long bar_mapped_size;
        struct work_struct remove_work;
-       struct timer_list watchdog_timer;
        struct mutex shutdown_lock;
        bool subsystem;
        void __iomem *cmb;
@@ -104,10 +92,18 @@ struct nvme_dev {
        u32 cmbloc;
        struct nvme_ctrl ctrl;
        struct completion ioq_wait;
+
+       /* shadow doorbell buffer support: */
        u32 *dbbuf_dbs;
        dma_addr_t dbbuf_dbs_dma_addr;
        u32 *dbbuf_eis;
        dma_addr_t dbbuf_eis_dma_addr;
+
+       /* host memory buffer support: */
+       u64 host_mem_size;
+       u32 nr_host_mem_descs;
+       struct nvme_host_mem_buf_desc *host_mem_descs;
+       void **host_mem_desc_bufs;
 };
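
Each entry in host_mem_descs follows the 16-byte Host Memory Buffer descriptor layout from the NVMe 1.2 spec: a little-endian chunk address plus a length counted in controller-page units. A standalone sketch of that layout with plain stdint types (the driver's real definition, struct nvme_host_mem_buf_desc, lives in the shared nvme header):

#include <assert.h>
#include <stdint.h>

/* One HMB descriptor as the controller consumes it: 16 bytes, no padding. */
struct hmb_desc {
	uint64_t addr;	/* little-endian DMA address of the chunk */
	uint32_t size;	/* chunk length in controller page-size units */
	uint32_t rsvd;
};

int main(void)
{
	/* The descriptor list itself is DMA-mapped and its address handed
	 * to the controller via Set Features (see nvme_set_host_mem). */
	assert(sizeof(struct hmb_desc) == 16);
	return 0;
}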
 
 static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@@ -185,8 +181,8 @@ static inline void _nvme_check_size(void)
        BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
-       BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
-       BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
+       BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
+       BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
        BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
        BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
@@ -350,19 +346,6 @@ static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_i
        nvmeq->tags = NULL;
 }
 
-static int nvme_admin_init_request(struct blk_mq_tag_set *set,
-               struct request *req, unsigned int hctx_idx,
-               unsigned int numa_node)
-{
-       struct nvme_dev *dev = set->driver_data;
-       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       struct nvme_queue *nvmeq = dev->queues[0];
-
-       BUG_ON(!nvmeq);
-       iod->nvmeq = nvmeq;
-       return 0;
-}
-
 static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
                          unsigned int hctx_idx)
 {
@@ -382,7 +365,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
 {
        struct nvme_dev *dev = set->driver_data;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+       int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
+       struct nvme_queue *nvmeq = dev->queues[queue_idx];
 
        BUG_ON(!nvmeq);
        iod->nvmeq = nvmeq;
@@ -427,7 +411,7 @@ static __le64 **iod_list(struct request *req)
        return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req));
 }
 
-static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
+static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
        int nseg = blk_rq_nr_phys_segments(rq);
@@ -436,7 +420,7 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
        if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
                iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
                if (!iod->sg)
-                       return BLK_MQ_RQ_QUEUE_BUSY;
+                       return BLK_STS_RESOURCE;
        } else {
                iod->sg = iod->inline_sg;
        }
@@ -446,7 +430,7 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
        iod->nents = 0;
        iod->length = size;
 
-       return BLK_MQ_RQ_QUEUE_OK;
+       return BLK_STS_OK;
 }
 
 static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
@@ -616,21 +600,21 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req)
        return true;
 }
 
-static int nvme_map_data(struct nvme_dev *dev, struct request *req,
+static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
                struct nvme_command *cmnd)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct request_queue *q = req->q;
        enum dma_data_direction dma_dir = rq_data_dir(req) ?
                        DMA_TO_DEVICE : DMA_FROM_DEVICE;
-       int ret = BLK_MQ_RQ_QUEUE_ERROR;
+       blk_status_t ret = BLK_STS_IOERR;
 
        sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
        iod->nents = blk_rq_map_sg(q, req, iod->sg);
        if (!iod->nents)
                goto out;
 
-       ret = BLK_MQ_RQ_QUEUE_BUSY;
+       ret = BLK_STS_RESOURCE;
        if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
                                DMA_ATTR_NO_WARN))
                goto out;
@@ -638,7 +622,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req,
        if (!nvme_setup_prps(dev, req))
                goto out_unmap;
 
-       ret = BLK_MQ_RQ_QUEUE_ERROR;
+       ret = BLK_STS_IOERR;
        if (blk_integrity_rq(req)) {
                if (blk_rq_count_integrity_sg(q, req->bio) != 1)
                        goto out_unmap;
@@ -658,7 +642,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req,
        cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma);
        if (blk_integrity_rq(req))
                cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
-       return BLK_MQ_RQ_QUEUE_OK;
+       return BLK_STS_OK;
 
 out_unmap:
        dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
@@ -688,7 +672,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 /*
  * NOTE: ns is NULL when called on the admin queue.
  */
-static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
                         const struct blk_mq_queue_data *bd)
 {
        struct nvme_ns *ns = hctx->queue->queuedata;
@@ -696,47 +680,34 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
        struct nvme_dev *dev = nvmeq->dev;
        struct request *req = bd->rq;
        struct nvme_command cmnd;
-       int ret = BLK_MQ_RQ_QUEUE_OK;
-
-       /*
-        * If formated with metadata, require the block layer provide a buffer
-        * unless this namespace is formated such that the metadata can be
-        * stripped/generated by the controller with PRACT=1.
-        */
-       if (ns && ns->ms && !blk_integrity_rq(req)) {
-               if (!(ns->pi_type && ns->ms == 8) &&
-                   !blk_rq_is_passthrough(req)) {
-                       blk_mq_end_request(req, -EFAULT);
-                       return BLK_MQ_RQ_QUEUE_OK;
-               }
-       }
+       blk_status_t ret;
 
        ret = nvme_setup_cmd(ns, req, &cmnd);
-       if (ret != BLK_MQ_RQ_QUEUE_OK)
+       if (ret)
                return ret;
 
        ret = nvme_init_iod(req, dev);
-       if (ret != BLK_MQ_RQ_QUEUE_OK)
+       if (ret)
                goto out_free_cmd;
 
-       if (blk_rq_nr_phys_segments(req))
+       if (blk_rq_nr_phys_segments(req)) {
                ret = nvme_map_data(dev, req, &cmnd);
-
-       if (ret != BLK_MQ_RQ_QUEUE_OK)
-               goto out_cleanup_iod;
+               if (ret)
+                       goto out_cleanup_iod;
+       }
 
        blk_mq_start_request(req);
 
        spin_lock_irq(&nvmeq->q_lock);
        if (unlikely(nvmeq->cq_vector < 0)) {
-               ret = BLK_MQ_RQ_QUEUE_ERROR;
+               ret = BLK_STS_IOERR;
                spin_unlock_irq(&nvmeq->q_lock);
                goto out_cleanup_iod;
        }
        __nvme_submit_cmd(nvmeq, &cmnd);
        nvme_process_cq(nvmeq);
        spin_unlock_irq(&nvmeq->q_lock);
-       return BLK_MQ_RQ_QUEUE_OK;
+       return BLK_STS_OK;
 out_cleanup_iod:
        nvme_free_iod(dev, req);
 out_free_cmd:
@@ -759,60 +730,80 @@ static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
        return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
 }
 
-static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
+static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
 {
-       u16 head, phase;
+       u16 head = nvmeq->cq_head;
 
-       head = nvmeq->cq_head;
-       phase = nvmeq->cq_phase;
+       if (likely(nvmeq->cq_vector >= 0)) {
+               if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
+                                                     nvmeq->dbbuf_cq_ei))
+                       writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+       }
+}
 
-       while (nvme_cqe_valid(nvmeq, head, phase)) {
-               struct nvme_completion cqe = nvmeq->cqes[head];
-               struct request *req;
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+               struct nvme_completion *cqe)
+{
+       struct request *req;
 
-               if (++head == nvmeq->q_depth) {
-                       head = 0;
-                       phase = !phase;
-               }
+       if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
+               dev_warn(nvmeq->dev->ctrl.device,
+                       "invalid id %d completed on queue %d\n",
+                       cqe->command_id, le16_to_cpu(cqe->sq_id));
+               return;
+       }
 
-               if (tag && *tag == cqe.command_id)
-                       *tag = -1;
+       /*
+        * AEN requests are special as they don't time out and can
+        * survive any kind of queue freeze and often don't respond to
+        * aborts.  We don't even bother to allocate a struct request
+        * for them but rather special case them here.
+        */
+       if (unlikely(nvmeq->qid == 0 &&
+                       cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) {
+               nvme_complete_async_event(&nvmeq->dev->ctrl,
+                               cqe->status, &cqe->result);
+               return;
+       }
 
-               if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
-                       dev_warn(nvmeq->dev->ctrl.device,
-                               "invalid id %d completed on queue %d\n",
-                               cqe.command_id, le16_to_cpu(cqe.sq_id));
-                       continue;
-               }
+       req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
+       nvme_end_request(req, cqe->status, cqe->result);
+}
 
-               /*
-                * AEN requests are special as they don't time out and can
-                * survive any kind of queue freeze and often don't respond to
-                * aborts.  We don't even bother to allocate a struct request
-                * for them but rather special case them here.
-                */
-               if (unlikely(nvmeq->qid == 0 &&
-                               cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
-                       nvme_complete_async_event(&nvmeq->dev->ctrl,
-                                       cqe.status, &cqe.result);
-                       continue;
-               }
+static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
+               struct nvme_completion *cqe)
+{
+       if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
+               *cqe = nvmeq->cqes[nvmeq->cq_head];
 
-               req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
-               nvme_end_request(req, cqe.status, cqe.result);
+               if (++nvmeq->cq_head == nvmeq->q_depth) {
+                       nvmeq->cq_head = 0;
+                       nvmeq->cq_phase = !nvmeq->cq_phase;
+               }
+               return true;
        }
+       return false;
+}
 
-       if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
-               return;
+static void __nvme_process_cq(struct nvme_queue *nvmeq, int *tag)
+{
+       struct nvme_completion cqe;
+       int consumed = 0;
 
-       if (likely(nvmeq->cq_vector >= 0))
-               if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
-                                                     nvmeq->dbbuf_cq_ei))
-                       writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
-       nvmeq->cq_head = head;
-       nvmeq->cq_phase = phase;
+       while (nvme_read_cqe(nvmeq, &cqe)) {
+               nvme_handle_cqe(nvmeq, &cqe);
+               consumed++;
 
-       nvmeq->cqe_seen = 1;
+               if (tag && *tag == cqe.command_id) {
+                       *tag = -1;
+                       break;
+               }
+       }
+
+       if (consumed) {
+               nvme_ring_cq_doorbell(nvmeq);
+               nvmeq->cqe_seen = 1;
+       }
 }
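
The rewrite splits the old monolithic loop into three helpers: nvme_read_cqe() pops one valid entry and advances head and phase, nvme_handle_cqe() dispatches it (including the AEN special case), and nvme_ring_cq_doorbell() writes the new head back once per batch instead of on every pass. Underneath is NVMe's phase-tag protocol: a CQE is new only while its phase bit matches the phase the consumer expects, and the expected phase flips each time the head wraps. A minimal userspace model of that consumer side (queue depth, types, and names are illustrative, not the driver's):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define Q_DEPTH 8

struct cqe {
	uint16_t status;	/* bit 0 is the phase tag, as in NVMe */
	uint16_t id;
};

struct cq {
	struct cqe entries[Q_DEPTH];
	uint16_t head;		/* consumer index */
	uint8_t phase;		/* expected phase: starts at 1, flips on wrap */
};

/* Mirrors nvme_read_cqe(): copy out one entry iff its phase matches. */
static bool cq_read(struct cq *q, struct cqe *out)
{
	if ((q->entries[q->head].status & 1) != q->phase)
		return false;		/* nothing new */
	*out = q->entries[q->head];
	if (++q->head == Q_DEPTH) {
		q->head = 0;
		q->phase = !q->phase;	/* producer uses the other bit next lap */
	}
	return true;
}

int main(void)
{
	struct cq q = { .phase = 1 };
	struct cqe c;
	int consumed = 0;

	/* The producer (controller) posts two completions with phase = 1. */
	q.entries[0] = (struct cqe){ .status = 1, .id = 42 };
	q.entries[1] = (struct cqe){ .status = 1, .id = 43 };

	while (cq_read(&q, &c)) {	/* the __nvme_process_cq() loop shape */
		printf("completed id %d\n", c.id);
		consumed++;
	}
	/* Only after draining is the doorbell rung once, with the new head. */
	printf("consumed %d, head now %d\n", consumed, q.head);
	return 0;
}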
 
 static void nvme_process_cq(struct nvme_queue *nvmeq)
@@ -939,7 +930,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
        return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-static void abort_endio(struct request *req, int error)
+static void abort_endio(struct request *req, blk_status_t error)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_queue *nvmeq = iod->nvmeq;
@@ -950,6 +941,51 @@ static void abort_endio(struct request *req, int error)
        blk_mq_free_request(req);
 }
 
+static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+{
+
+       /* If true, indicates loss of adapter communication, possibly by a
+        * NVMe Subsystem reset.
+        */
+       bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+
+       /* If there is a reset ongoing, we shouldn't reset again. */
+       if (dev->ctrl.state == NVME_CTRL_RESETTING)
+               return false;
+
+       /* We shouldn't reset unless the controller is on fatal error state
+        * _or_ if we lost the communication with it.
+        */
+       if (!(csts & NVME_CSTS_CFS) && !nssro)
+               return false;
+
+       /* If PCI error recovery process is happening, we cannot reset or
+        * the recovery mechanism will surely fail.
+        */
+       if (pci_channel_offline(to_pci_dev(dev->dev)))
+               return false;
+
+       return true;
+}
+
+static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
+{
+       /* Read a config register to help see what died. */
+       u16 pci_status;
+       int result;
+
+       result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
+                                     &pci_status);
+       if (result == PCIBIOS_SUCCESSFUL)
+               dev_warn(dev->ctrl.device,
+                        "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
+                        csts, pci_status);
+       else
+               dev_warn(dev->ctrl.device,
+                        "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
+                        csts, result);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -957,6 +993,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
        struct nvme_dev *dev = nvmeq->dev;
        struct request *abort_req;
        struct nvme_command cmd;
+       u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+       /*
+        * Reset immediately if the controller has failed
+        */
+       if (nvme_should_reset(dev, csts)) {
+               nvme_warn_reset(dev, csts);
+               nvme_dev_disable(dev, false);
+               nvme_reset_ctrl(&dev->ctrl);
+               return BLK_EH_HANDLED;
+       }
 
        /*
         * Did we miss an interrupt?
@@ -993,7 +1040,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
                         "I/O %d QID %d timeout, reset controller\n",
                         req->tag, nvmeq->qid);
                nvme_dev_disable(dev, false);
-               nvme_reset(dev);
+               nvme_reset_ctrl(&dev->ctrl);
 
                /*
                 * Mark the request as handled, since the inline shutdown
@@ -1247,7 +1294,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
        .complete       = nvme_pci_complete_rq,
        .init_hctx      = nvme_admin_init_hctx,
        .exit_hctx      = nvme_admin_exit_hctx,
-       .init_request   = nvme_admin_init_request,
+       .init_request   = nvme_init_request,
        .timeout        = nvme_timeout,
 };
 
@@ -1311,6 +1358,32 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
        return 0;
 }
 
+static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
+{
+       return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
+}
+
+static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
+{
+       struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+       if (size <= dev->bar_mapped_size)
+               return 0;
+       if (size > pci_resource_len(pdev, 0))
+               return -ENOMEM;
+       if (dev->bar)
+               iounmap(dev->bar);
+       dev->bar = ioremap(pci_resource_start(pdev, 0), size);
+       if (!dev->bar) {
+               dev->bar_mapped_size = 0;
+               return -ENOMEM;
+       }
+       dev->bar_mapped_size = size;
+       dev->dbs = dev->bar + NVME_REG_DBS;
+
+       return 0;
+}
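
db_bar_size() is the amount of BAR0 that actually needs mapping: the fixed register file up to the doorbells (NVME_REG_DBS is 0x1000) plus one 8-byte submission/completion doorbell pair per queue (admin plus nr_io_queues), spaced by the doorbell stride from CAP.DSTRD. nvme_remap_bar() then grows the mapping lazily, remapping only when a larger span is needed. A quick userspace check of the arithmetic (strides and queue counts are illustrative):

#include <stdio.h>

#define NVME_REG_DBS 0x1000UL	/* doorbells start 4 KiB into BAR0 */

static unsigned long db_bar_size(unsigned int db_stride,
				 unsigned int nr_io_queues)
{
	/* admin queue + nr_io_queues, each owning an 8-byte SQ-tail/CQ-head
	 * doorbell pair, stretched by the controller's doorbell stride. */
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * db_stride);
}

int main(void)
{
	printf("%lu\n", db_bar_size(1, 0));	/* admin only: 4104 */
	printf("%lu\n", db_bar_size(1, 16));	/* 16 I/O queues: 4232 */
	printf("%lu\n", db_bar_size(4, 16));	/* CAP.DSTRD = 2: 4640 */
	return 0;
}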
+
 static int nvme_configure_admin_queue(struct nvme_dev *dev)
 {
        int result;
@@ -1318,6 +1391,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
        u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
        struct nvme_queue *nvmeq;
 
+       result = nvme_remap_bar(dev, db_bar_size(dev, 0));
+       if (result < 0)
+               return result;
+
        dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
                                                NVME_CAP_NSSRC(cap) : 0;
 
@@ -1358,66 +1435,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
        return result;
 }
 
-static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
-{
-
-       /* If true, indicates loss of adapter communication, possibly by a
-        * NVMe Subsystem reset.
-        */
-       bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
-
-       /* If there is a reset ongoing, we shouldn't reset again. */
-       if (work_busy(&dev->reset_work))
-               return false;
-
-       /* We shouldn't reset unless the controller is on fatal error state
-        * _or_ if we lost the communication with it.
-        */
-       if (!(csts & NVME_CSTS_CFS) && !nssro)
-               return false;
-
-       /* If PCI error recovery process is happening, we cannot reset or
-        * the recovery mechanism will surely fail.
-        */
-       if (pci_channel_offline(to_pci_dev(dev->dev)))
-               return false;
-
-       return true;
-}
-
-static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
-{
-       /* Read a config register to help see what died. */
-       u16 pci_status;
-       int result;
-
-       result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
-                                     &pci_status);
-       if (result == PCIBIOS_SUCCESSFUL)
-               dev_warn(dev->ctrl.device,
-                        "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
-                        csts, pci_status);
-       else
-               dev_warn(dev->ctrl.device,
-                        "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
-                        csts, result);
-}
-
-static void nvme_watchdog_timer(unsigned long data)
-{
-       struct nvme_dev *dev = (struct nvme_dev *)data;
-       u32 csts = readl(dev->bar + NVME_REG_CSTS);
-
-       /* Skip controllers under certain specific conditions. */
-       if (nvme_should_reset(dev, csts)) {
-               if (!nvme_reset(dev))
-                       nvme_warn_reset(dev, csts);
-               return;
-       }
-
-       mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
-}
-
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
        unsigned i, max;
@@ -1514,16 +1531,168 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
        }
 }
 
-static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
+static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
+{
+       size_t len = dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs);
+       struct nvme_command c;
+       u64 dma_addr;
+       int ret;
+
+       dma_addr = dma_map_single(dev->dev, dev->host_mem_descs, len,
+                       DMA_TO_DEVICE);
+       if (dma_mapping_error(dev->dev, dma_addr))
+               return -ENOMEM;
+
+       memset(&c, 0, sizeof(c));
+       c.features.opcode       = nvme_admin_set_features;
+       c.features.fid          = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
+       c.features.dword11      = cpu_to_le32(bits);
+       c.features.dword12      = cpu_to_le32(dev->host_mem_size >>
+                                             ilog2(dev->ctrl.page_size));
+       c.features.dword13      = cpu_to_le32(lower_32_bits(dma_addr));
+       c.features.dword14      = cpu_to_le32(upper_32_bits(dma_addr));
+       c.features.dword15      = cpu_to_le32(dev->nr_host_mem_descs);
+
+       ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
+       if (ret) {
+               dev_warn(dev->ctrl.device,
+                        "failed to set host mem (err %d, flags %#x).\n",
+                        ret, bits);
+       }
+       dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE);
+       return ret;
+}
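
Everything the controller needs to know about the HMB travels in the feature dwords of this one Set Features command: dword11 carries the enable/return flags, dword12 the total size in controller-page units, dword13/14 the low and high halves of the descriptor-list DMA address, and dword15 the descriptor count. A worked packing of those fields, assuming a 4 KiB controller page size and a hypothetical DMA address:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t host_mem_size = 128ULL << 20;	 /* 128 MiB buffer */
	uint64_t desc_list_dma = 0x1f0000000ULL; /* hypothetical address */
	unsigned int page_shift = 12;		 /* ilog2(4096) */

	uint32_t dword12 = (uint32_t)(host_mem_size >> page_shift);
	uint32_t dword13 = (uint32_t)desc_list_dma;
	uint32_t dword14 = (uint32_t)(desc_list_dma >> 32);

	/* 32768 pages, address split 0x1:0xf0000000 */
	printf("dword12=%u dword13=%#x dword14=%#x\n",
	       dword12, dword13, dword14);
	return 0;
}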
+
+static void nvme_free_host_mem(struct nvme_dev *dev)
 {
-       return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
+       int i;
+
+       for (i = 0; i < dev->nr_host_mem_descs; i++) {
+               struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
+               size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
+
+               dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i],
+                               le64_to_cpu(desc->addr));
+       }
+
+       kfree(dev->host_mem_desc_bufs);
+       dev->host_mem_desc_bufs = NULL;
+       kfree(dev->host_mem_descs);
+       dev->host_mem_descs = NULL;
+}
+
+static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
+{
+       struct nvme_host_mem_buf_desc *descs;
+       u32 chunk_size, max_entries;
+       int i = 0;
+       void **bufs;
+       u64 size, tmp;
+
+       /* start big and work our way down */
+       chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER);
+retry:
+       tmp = (preferred + chunk_size - 1);
+       do_div(tmp, chunk_size);
+       max_entries = tmp;
+       descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL);
+       if (!descs)
+               goto out;
+
+       bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
+       if (!bufs)
+               goto out_free_descs;
+
+       for (size = 0; size < preferred; size += chunk_size) {
+               u32 len = min_t(u64, chunk_size, preferred - size);
+               dma_addr_t dma_addr;
+
+               bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
+                               DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
+               if (!bufs[i])
+                       break;
+
+               descs[i].addr = cpu_to_le64(dma_addr);
+               descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
+               i++;
+       }
+
+       if (!size || (min && size < min)) {
+               dev_warn(dev->ctrl.device,
+                       "failed to allocate host memory buffer.\n");
+               goto out_free_bufs;
+       }
+
+       dev_info(dev->ctrl.device,
+               "allocated %lld MiB host memory buffer.\n",
+               size >> ilog2(SZ_1M));
+       dev->nr_host_mem_descs = i;
+       dev->host_mem_size = size;
+       dev->host_mem_descs = descs;
+       dev->host_mem_desc_bufs = bufs;
+       return 0;
+
+out_free_bufs:
+       while (--i >= 0) {
+               size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
+
+               dma_free_coherent(dev->dev, size, bufs[i],
+                               le64_to_cpu(descs[i].addr));
+       }
+
+       kfree(bufs);
+out_free_descs:
+       kfree(descs);
+out:
+       /* try a smaller chunk size if we failed early */
+       if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) {
+               chunk_size /= 2;
+               goto retry;
+       }
+       dev->host_mem_descs = NULL;
+       return -ENOMEM;
+}
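
nvme_alloc_host_mem() starts with the biggest chunk the page allocator can hand out (PAGE_SIZE << MAX_ORDER, capped at the preferred size) and, if it cannot reach the controller's minimum, frees everything and retries with half the chunk size, trading a longer descriptor list for smaller allocations. A standalone sketch of the same back-off pattern, with malloc() standing in for dma_alloc_attrs() and illustrative sizes:

#include <stdio.h>
#include <stdlib.h>

#define MAX_CHUNK (4UL << 20)	/* pretend the allocator caps out at 4 MiB */
#define PAGE_SZ   4096UL

static int alloc_buffer_set(size_t min, size_t preferred)
{
	size_t chunk = preferred < MAX_CHUNK ? preferred : MAX_CHUNK;

	while (chunk >= PAGE_SZ * 2) {
		void *bufs[64] = { 0 };
		size_t total = 0, n = 0;

		while (total < preferred && n < 64) {
			bufs[n] = malloc(chunk);
			if (!bufs[n])
				break;	/* this chunk size is too big now */
			total += chunk;
			n++;
		}

		if (total > 0 && total >= min) {
			printf("got %zu bytes in %zu chunks of %zu\n",
			       total, n, chunk);
			while (n)
				free(bufs[--n]);
			return 0;
		}

		while (n)		/* give everything back, retry smaller */
			free(bufs[--n]);
		chunk /= 2;
	}
	return -1;			/* could not satisfy the minimum */
}

int main(void)
{
	/* e.g. need at least 1 MiB, would like 8 MiB */
	return alloc_buffer_set(1UL << 20, 8UL << 20) ? 1 : 0;
}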
+
+static void nvme_setup_host_mem(struct nvme_dev *dev)
+{
+       u64 max = (u64)max_host_mem_size_mb * SZ_1M;
+       u64 preferred = (u64)dev->ctrl.hmpre * 4096;
+       u64 min = (u64)dev->ctrl.hmmin * 4096;
+       u32 enable_bits = NVME_HOST_MEM_ENABLE;
+
+       preferred = min(preferred, max);
+       if (min > max) {
+               dev_warn(dev->ctrl.device,
+                       "min host memory (%lld MiB) above limit (%d MiB).\n",
+                       min >> ilog2(SZ_1M), max_host_mem_size_mb);
+               nvme_free_host_mem(dev);
+               return;
+       }
+
+       /*
+        * If we already have a buffer allocated check if we can reuse it.
+        */
+       if (dev->host_mem_descs) {
+               if (dev->host_mem_size >= min)
+                       enable_bits |= NVME_HOST_MEM_RETURN;
+               else
+                       nvme_free_host_mem(dev);
+       }
+
+       if (!dev->host_mem_descs) {
+               if (nvme_alloc_host_mem(dev, min, preferred))
+                       return;
+       }
+
+       if (nvme_set_host_mem(dev, enable_bits))
+               nvme_free_host_mem(dev);
 }
 
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
        struct nvme_queue *adminq = dev->queues[0];
        struct pci_dev *pdev = to_pci_dev(dev->dev);
-       int result, nr_io_queues, size;
+       int result, nr_io_queues;
+       unsigned long size;
 
        nr_io_queues = num_online_cpus();
        result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
@@ -1542,20 +1711,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
                        nvme_release_cmb(dev);
        }
 
-       size = db_bar_size(dev, nr_io_queues);
-       if (size > 8192) {
-               iounmap(dev->bar);
-               do {
-                       dev->bar = ioremap(pci_resource_start(pdev, 0), size);
-                       if (dev->bar)
-                               break;
-                       if (!--nr_io_queues)
-                               return -ENOMEM;
-                       size = db_bar_size(dev, nr_io_queues);
-               } while (1);
-               dev->dbs = dev->bar + 4096;
-               adminq->q_db = dev->dbs;
-       }
+       do {
+               size = db_bar_size(dev, nr_io_queues);
+               result = nvme_remap_bar(dev, size);
+               if (!result)
+                       break;
+               if (!--nr_io_queues)
+                       return -ENOMEM;
+       } while (1);
+       adminq->q_db = dev->dbs;
 
        /* Deregister the admin queue's interrupt */
        pci_free_irq(pdev, 0, adminq);
@@ -1586,7 +1750,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
        return nvme_create_io_queues(dev);
 }
 
-static void nvme_del_queue_end(struct request *req, int error)
+static void nvme_del_queue_end(struct request *req, blk_status_t error)
 {
        struct nvme_queue *nvmeq = req->end_io_data;
 
@@ -1594,7 +1758,7 @@ static void nvme_del_queue_end(struct request *req, int error)
        complete(&nvmeq->dev->ioq_wait);
 }
 
-static void nvme_del_cq_end(struct request *req, int error)
+static void nvme_del_cq_end(struct request *req, blk_status_t error)
 {
        struct nvme_queue *nvmeq = req->end_io_data;
 
@@ -1799,8 +1963,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
        bool dead = true;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-       del_timer_sync(&dev->watchdog_timer);
-
        mutex_lock(&dev->shutdown_lock);
        if (pci_is_enabled(pdev)) {
                u32 csts = readl(dev->bar + NVME_REG_CSTS);
@@ -1815,8 +1977,20 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
         * Give the controller a chance to complete all entered requests if
         * doing a safe shutdown.
         */
-       if (!dead && shutdown)
-               nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
+       if (!dead) {
+               if (shutdown)
+                       nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
+
+               /*
+                * If the controller is still alive tell it to stop using the
+                * host memory buffer.  In theory the shutdown / reset should
+                * make sure that it doesn't access the host memory anymore,
+                * but I'd rather be safe than sorry.
+                */
+               if (dev->host_mem_descs)
+                       nvme_set_host_mem(dev, 0);
+       }
        nvme_stop_queues(&dev->ctrl);
 
        queues = dev->online_queues - 1;
@@ -1899,11 +2073,12 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
 
 static void nvme_reset_work(struct work_struct *work)
 {
-       struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
+       struct nvme_dev *dev =
+               container_of(work, struct nvme_dev, ctrl.reset_work);
        bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
        int result = -ENODEV;
 
-       if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
+       if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
                goto out;
 
        /*
@@ -1913,9 +2088,6 @@ static void nvme_reset_work(struct work_struct *work)
        if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
                nvme_dev_disable(dev, false);
 
-       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
-               goto out;
-
        result = nvme_pci_enable(dev);
        if (result)
                goto out;
@@ -1951,6 +2123,9 @@ static void nvme_reset_work(struct work_struct *work)
                                 "unable to allocate dma for dbbuf\n");
        }
 
+       if (dev->ctrl.hmpre)
+               nvme_setup_host_mem(dev);
+
        result = nvme_setup_io_queues(dev);
        if (result)
                goto out;
@@ -1964,8 +2139,6 @@ static void nvme_reset_work(struct work_struct *work)
        if (dev->online_queues > 1)
                nvme_queue_async_events(&dev->ctrl);
 
-       mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
-
        /*
         * Keep the controller around but remove all namespaces if we don't have
         * any working I/O queue.
@@ -2005,17 +2178,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
        nvme_put_ctrl(&dev->ctrl);
 }
 
-static int nvme_reset(struct nvme_dev *dev)
-{
-       if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
-               return -ENODEV;
-       if (work_busy(&dev->reset_work))
-               return -ENODEV;
-       if (!queue_work(nvme_workq, &dev->reset_work))
-               return -EBUSY;
-       return 0;
-}
-
 static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
 {
        *val = readl(to_nvme_dev(ctrl)->bar + off);
@@ -2034,16 +2196,6 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
        return 0;
 }
 
-static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
-{
-       struct nvme_dev *dev = to_nvme_dev(ctrl);
-       int ret = nvme_reset(dev);
-
-       if (!ret)
-               flush_work(&dev->reset_work);
-       return ret;
-}
-
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
        .name                   = "pcie",
        .module                 = THIS_MODULE,
@@ -2051,7 +2203,6 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
        .reg_read32             = nvme_pci_reg_read32,
        .reg_write32            = nvme_pci_reg_write32,
        .reg_read64             = nvme_pci_reg_read64,
-       .reset_ctrl             = nvme_pci_reset_ctrl,
        .free_ctrl              = nvme_pci_free_ctrl,
        .submit_async_event     = nvme_pci_submit_async_event,
 };
@@ -2063,8 +2214,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
        if (pci_request_mem_regions(pdev, "nvme"))
                return -ENODEV;
 
-       dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
-       if (!dev->bar)
+       if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
                goto release;
 
        return 0;
@@ -2118,10 +2268,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (result)
                goto free;
 
-       INIT_WORK(&dev->reset_work, nvme_reset_work);
+       INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
        INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
-       setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
-               (unsigned long)dev);
        mutex_init(&dev->shutdown_lock);
        init_completion(&dev->ioq_wait);
 
@@ -2136,9 +2284,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (result)
                goto release_pools;
 
+       nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
-       queue_work(nvme_workq, &dev->reset_work);
+       queue_work(nvme_wq, &dev->ctrl.reset_work);
        return 0;
 
  release_pools:
@@ -2159,7 +2308,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
        if (prepare)
                nvme_dev_disable(dev, false);
        else
-               nvme_reset(dev);
+               nvme_reset_ctrl(&dev->ctrl);
 }
 
 static void nvme_shutdown(struct pci_dev *pdev)
@@ -2179,6 +2328,7 @@ static void nvme_remove(struct pci_dev *pdev)
 
        nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 
+       cancel_work_sync(&dev->ctrl.reset_work);
        pci_set_drvdata(pdev, NULL);
 
        if (!pci_device_is_present(pdev)) {
@@ -2186,9 +2336,10 @@ static void nvme_remove(struct pci_dev *pdev)
                nvme_dev_disable(dev, false);
        }
 
-       flush_work(&dev->reset_work);
+       flush_work(&dev->ctrl.reset_work);
        nvme_uninit_ctrl(&dev->ctrl);
        nvme_dev_disable(dev, true);
+       nvme_free_host_mem(dev);
        nvme_dev_remove_admin(dev);
        nvme_free_queues(dev, 0);
        nvme_release_prp_pools(dev);
@@ -2229,7 +2380,7 @@ static int nvme_resume(struct device *dev)
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);
 
-       nvme_reset(ndev);
+       nvme_reset_ctrl(&ndev->ctrl);
        return 0;
 }
 #endif
@@ -2268,7 +2419,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
 
        dev_info(dev->ctrl.device, "restart after slot reset\n");
        pci_restore_state(pdev);
-       nvme_reset(dev);
+       nvme_reset_ctrl(&dev->ctrl);
        return PCI_ERS_RESULT_RECOVERED;
 }
 
@@ -2324,22 +2475,12 @@ static struct pci_driver nvme_driver = {
 
 static int __init nvme_init(void)
 {
-       int result;
-
-       nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
-       if (!nvme_workq)
-               return -ENOMEM;
-
-       result = pci_register_driver(&nvme_driver);
-       if (result)
-               destroy_workqueue(nvme_workq);
-       return result;
+       return pci_register_driver(&nvme_driver);
 }
 
 static void __exit nvme_exit(void)
 {
        pci_unregister_driver(&nvme_driver);
-       destroy_workqueue(nvme_workq);
        _nvme_check_size();
 }