block: introduce new block status code type

[karo-tx-linux.git] / drivers / nvme / host / core.c
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c

index eeb409c287b8ed304bdafea7804619dfb39d15b5..07e95c7d837a6e96a58786cb9b3ecfc8262cde64 100644 (file)
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -49,10 +49,9 @@ unsigned char shutdown_timeout = 5;
  module_param(shutdown_timeout, byte, 0644);
  MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
  
-unsigned int nvme_max_retries = 5;
-module_param_named(max_retries, nvme_max_retries, uint, 0644);
+static u8 nvme_max_retries = 5;
+module_param_named(max_retries, nvme_max_retries, byte, 0644);
  MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-EXPORT_SYMBOL_GPL(nvme_max_retries);
  
  static int nvme_char_major;
  module_param(nvme_char_major, int, 0);
@@ -62,11 +61,58 @@ module_param(default_ps_max_latency_us, ulong, 0644);
  MODULE_PARM_DESC(default_ps_max_latency_us,
                  "max power saving latency for new devices; use PM QOS to change per device");
  
+static bool force_apst;
+module_param(force_apst, bool, 0644);
+MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+
  static LIST_HEAD(nvme_ctrl_list);
  static DEFINE_SPINLOCK(dev_list_lock);
  
  static struct class *nvme_class;
  
+static blk_status_t nvme_error_status(struct request *req)
+{
+       switch (nvme_req(req)->status & 0x7ff) {
+       case NVME_SC_SUCCESS:
+               return BLK_STS_OK;
+       case NVME_SC_CAP_EXCEEDED:
+               return BLK_STS_NOSPC;
+       case NVME_SC_ONCS_NOT_SUPPORTED:
+               return BLK_STS_NOTSUPP;
+       case NVME_SC_WRITE_FAULT:
+       case NVME_SC_READ_ERROR:
+       case NVME_SC_UNWRITTEN_BLOCK:
+               return BLK_STS_MEDIUM;
+       default:
+               return BLK_STS_IOERR;
+       }
+}
+
+static inline bool nvme_req_needs_retry(struct request *req)
+{
+       if (blk_noretry_request(req))
+               return false;
+       if (nvme_req(req)->status & NVME_SC_DNR)
+               return false;
+       if (jiffies - req->start_time >= req->timeout)
+               return false;
+       if (nvme_req(req)->retries >= nvme_max_retries)
+               return false;
+       return true;
+}
+
+void nvme_complete_rq(struct request *req)
+{
+       if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
+               nvme_req(req)->retries++;
+               blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
+               return;
+       }
+
+       blk_mq_end_request(req, nvme_error_status(req));
+}
+EXPORT_SYMBOL_GPL(nvme_complete_rq);
+
  void nvme_cancel_request(struct request *req, void *data, bool reserved)
  {
         int status;
@@ -80,7 +126,9 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
         status = NVME_SC_ABORT_REQ;
         if (blk_queue_dying(req->q))
                 status |= NVME_SC_DNR;
-       blk_mq_complete_request(req, status);
+       nvme_req(req)->status = status;
+       blk_mq_complete_request(req);
+
  }
  EXPORT_SYMBOL_GPL(nvme_cancel_request);
  
@@ -205,12 +253,6 @@ fail:
         return NULL;
  }
  
-void nvme_requeue_req(struct request *req)
-{
-       blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
-}
-EXPORT_SYMBOL_GPL(nvme_requeue_req);
-
  struct request *nvme_alloc_request(struct request_queue *q,
                 struct nvme_command *cmd, unsigned int flags, int qid)
  {
@@ -327,6 +369,12 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
  {
         int ret = BLK_MQ_RQ_QUEUE_OK;
  
+       if (!(req->rq_flags & RQF_DONTPREP)) {
+               nvme_req(req)->retries = 0;
+               nvme_req(req)->flags = 0;
+               req->rq_flags |= RQF_DONTPREP;
+       }
+
         switch (req_op(req)) {
         case REQ_OP_DRV_IN:
         case REQ_OP_DRV_OUT:
@@ -335,6 +383,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
         case REQ_OP_FLUSH:
                 nvme_setup_flush(ns, cmd);
                 break;
+       case REQ_OP_WRITE_ZEROES:
+               /* currently only aliased to deallocate for a few ctrls: */
         case REQ_OP_DISCARD:
                 ret = nvme_setup_discard(ns, req, cmd);
                 break;
@@ -378,7 +428,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
         blk_execute_rq(req->q, NULL, req, at_head);
         if (result)
                 *result = nvme_req(req)->result;
-       ret = req->errors;
+       if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+               ret = -EINTR;
+       else
+               ret = nvme_req(req)->status;
   out:
         blk_mq_free_request(req);
         return ret;
@@ -463,7 +516,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
         }
   submit:
         blk_execute_rq(req->q, disk, req, 0);
-       ret = req->errors;
+       if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+               ret = -EINTR;
+       else
+               ret = nvme_req(req)->status;
         if (result)
                 *result = le32_to_cpu(nvme_req(req)->result.u32);
         if (meta && !ret && !write) {
@@ -491,15 +547,16 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
                         result, timeout);
  }
  
-static void nvme_keep_alive_end_io(struct request *rq, int error)
+static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
  {
         struct nvme_ctrl *ctrl = rq->end_io_data;
  
         blk_mq_free_request(rq);
  
-       if (error) {
+       if (status) {
                 dev_err(ctrl->device,
-                       "failed nvme_keep_alive_end_io error=%d\n", error);
+                       "failed nvme_keep_alive_end_io error=%d\n",
+                               status);
                 return;
         }
  
@@ -861,6 +918,29 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
  }
  
  #ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
+               u16 bs)
+{
+       struct nvme_ns *ns = disk->private_data;
+       u16 old_ms = ns->ms;
+       u8 pi_type = 0;
+
+       ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+       ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+
+       /* PI implementation requires metadata equal t10 pi tuple size */
+       if (ns->ms == sizeof(struct t10_pi_tuple))
+               pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+
+       if (blk_get_integrity(disk) &&
+           (ns->pi_type != pi_type || ns->ms != old_ms ||
+            bs != queue_logical_block_size(disk->queue) ||
+            (ns->ms && ns->ext)))
+               blk_integrity_unregister(disk);
+
+       ns->pi_type = pi_type;
+}
+
  static void nvme_init_integrity(struct nvme_ns *ns)
  {
         struct blk_integrity integrity;
@@ -887,6 +967,10 @@ static void nvme_init_integrity(struct nvme_ns *ns)
         blk_queue_max_integrity_segments(ns->queue, 1);
  }
  #else
+static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
+               u16 bs)
+{
+}
  static void nvme_init_integrity(struct nvme_ns *ns)
  {
  }
@@ -900,16 +984,14 @@ static void nvme_config_discard(struct nvme_ns *ns)
         BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
                         NVME_DSM_MAX_RANGES);
  
-       if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
-               ns->queue->limits.discard_zeroes_data = 1;
-       else
-               ns->queue->limits.discard_zeroes_data = 0;
-
         ns->queue->limits.discard_alignment = logical_block_size;
         ns->queue->limits.discard_granularity = logical_block_size;
         blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
         blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+
+       if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+               blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
  }
  
  static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
@@ -935,37 +1017,22 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
  static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
  {
         struct nvme_ns *ns = disk->private_data;
-       u8 lbaf, pi_type;
-       u16 old_ms;
-       unsigned short bs;
-
-       old_ms = ns->ms;
-       lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
-       ns->lba_shift = id->lbaf[lbaf].ds;
-       ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
-       ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+       u16 bs;
  
         /*
          * If identify namespace failed, use default 512 byte block size so
          * block layer can use before failing read/write for 0 capacity.
          */
+       ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
         if (ns->lba_shift == 0)
                 ns->lba_shift = 9;
         bs = 1 << ns->lba_shift;
-       /* XXX: PI implementation requires metadata equal t10 pi tuple size */
-       pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
-                                       id->dps & NVME_NS_DPS_PI_MASK : 0;
  
         blk_mq_freeze_queue(disk->queue);
-       if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
-                               ns->ms != old_ms ||
-                               bs != queue_logical_block_size(disk->queue) ||
-                               (ns->ms && ns->ext)))
-               blk_integrity_unregister(disk);
  
-       ns->pi_type = pi_type;
+       if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
+               nvme_prep_integrity(disk, id, bs);
         blk_queue_logical_block_size(ns->queue, bs);
-
         if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
                 nvme_init_integrity(ns);
         if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
@@ -1267,7 +1334,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
          * heuristic: we are willing to spend at most 2% of the time
          * transitioning between power states.  Therefore, when running
          * in any given state, we will enter the next lower-power
-        * non-operational state after waiting 100 * (enlat + exlat)
+        * non-operational state after waiting 50 * (enlat + exlat)
          * microseconds, as long as that state's total latency is under
          * the requested maximum latency.
          *
@@ -1278,6 +1345,8 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
  
         unsigned apste;
         struct nvme_feat_auto_pst *table;
+       u64 max_lat_us = 0;
+       int max_ps = -1;
         int ret;
  
         /*
@@ -1299,6 +1368,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
         if (ctrl->ps_max_latency_us == 0) {
                 /* Turn off APST. */
                 apste = 0;
+               dev_dbg(ctrl->device, "APST disabled\n");
         } else {
                 __le64 target = cpu_to_le64(0);
                 int state;
@@ -1348,9 +1418,22 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
  
                         target = cpu_to_le64((state << 3) |
                                              (transition_ms << 8));
+
+                       if (max_ps == -1)
+                               max_ps = state;
+
+                       if (total_latency_us > max_lat_us)
+                               max_lat_us = total_latency_us;
                 }
  
                 apste = 1;
+
+               if (max_ps == -1) {
+                       dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
+               } else {
+                       dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
+                               max_ps, max_lat_us, (int)sizeof(*table), table);
+               }
         }
  
         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
@@ -1488,6 +1571,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
                 }
         }
  
+       if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
+               dev_warn(ctrl->dev, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
+               ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
+       }
+
         ctrl->oacs = le16_to_cpu(id->oacs);
         ctrl->vid = le16_to_cpu(id->vid);
         ctrl->oncs = le16_to_cpup(&id->oncs);
@@ -1510,10 +1598,19 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
  
         ctrl->npss = id->npss;
         prev_apsta = ctrl->apsta;
-       ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+       if (ctrl->quirks & NVME_QUIRK_NO_APST) {
+               if (force_apst && id->apsta) {
+                       dev_warn(ctrl->dev, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
+                       ctrl->apsta = 1;
+               } else {
+                       ctrl->apsta = 0;
+               }
+       } else {
+               ctrl->apsta = id->apsta;
+       }
         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
  
-       if (ctrl->ops->is_fabrics) {
+       if (ctrl->ops->flags & NVME_F_FABRICS) {
                 ctrl->icdoff = le16_to_cpu(id->icdoff);
                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
@@ -2006,7 +2103,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
                 if (ns->ndev)
                         nvme_nvm_unregister_sysfs(ns);
                 del_gendisk(ns->disk);
-               blk_mq_abort_requeue_list(ns->queue);
                 blk_cleanup_queue(ns->queue);
         }
  
@@ -2344,8 +2440,16 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
                         continue;
                 revalidate_disk(ns->disk);
                 blk_set_queue_dying(ns->queue);
-               blk_mq_abort_requeue_list(ns->queue);
-               blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+               /*
+                * Forcibly start all queues to avoid having stuck requests.
+                * Note that we must ensure the queues are not stopped
+                * when the final removal happens.
+                */
+               blk_mq_start_hw_queues(ns->queue);
+
+               /* draining requests in requeue list */
+               blk_mq_kick_requeue_list(ns->queue);
         }
         mutex_unlock(&ctrl->namespaces_mutex);
  }
@@ -2393,7 +2497,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
  
         mutex_lock(&ctrl->namespaces_mutex);
         list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_freeze_queue_start(ns->queue);
+               blk_freeze_queue_start(ns->queue);
         mutex_unlock(&ctrl->namespaces_mutex);
  }
  EXPORT_SYMBOL_GPL(nvme_start_freeze);