block: introduce new block status code type
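The hunks below stop reporting NVMe completions through req->errors: a
transport now stores the raw NVMe status code in nvme_req(req)->status and
completes the request, and nvme_complete_rq() either requeues it or ends it
with a blk_status_t produced by nvme_error_status(). A minimal sketch of that
calling convention follows; handle_completion() and its arguments are
hypothetical, and only nvme_req(), its status/result fields and
blk_mq_complete_request() come from the diff itself.

        /*
         * Illustrative only: how a transport's completion handler is expected
         * to hand a finished command to the core after this change.
         */
        static void handle_completion(struct request *req, u16 nvme_status,
                                      union nvme_result result)
        {
                nvme_req(req)->status = nvme_status;    /* NVMe status, DNR bit included when set */
                nvme_req(req)->result = result;
                blk_mq_complete_request(req);           /* core then runs nvme_complete_rq() */
        }

nvme_cancel_request() in the hunk at -80,7 follows the same pattern, using
NVME_SC_ABORT_REQ (ORed with NVME_SC_DNR for dying queues) as the status.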
index eeb409c287b8ed304bdafea7804619dfb39d15b5..07e95c7d837a6e96a58786cb9b3ecfc8262cde64 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -49,10 +49,9 @@ unsigned char shutdown_timeout = 5;
 module_param(shutdown_timeout, byte, 0644);
 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
 
-unsigned int nvme_max_retries = 5;
-module_param_named(max_retries, nvme_max_retries, uint, 0644);
+static u8 nvme_max_retries = 5;
+module_param_named(max_retries, nvme_max_retries, byte, 0644);
 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-EXPORT_SYMBOL_GPL(nvme_max_retries);
 
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
@@ -62,11 +61,58 @@ module_param(default_ps_max_latency_us, ulong, 0644);
 MODULE_PARM_DESC(default_ps_max_latency_us,
                 "max power saving latency for new devices; use PM QOS to change per device");
 
+static bool force_apst;
+module_param(force_apst, bool, 0644);
+MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+
 static LIST_HEAD(nvme_ctrl_list);
 static DEFINE_SPINLOCK(dev_list_lock);
 
 static struct class *nvme_class;
 
+static blk_status_t nvme_error_status(struct request *req)
+{
+       switch (nvme_req(req)->status & 0x7ff) {
+       case NVME_SC_SUCCESS:
+               return BLK_STS_OK;
+       case NVME_SC_CAP_EXCEEDED:
+               return BLK_STS_NOSPC;
+       case NVME_SC_ONCS_NOT_SUPPORTED:
+               return BLK_STS_NOTSUPP;
+       case NVME_SC_WRITE_FAULT:
+       case NVME_SC_READ_ERROR:
+       case NVME_SC_UNWRITTEN_BLOCK:
+               return BLK_STS_MEDIUM;
+       default:
+               return BLK_STS_IOERR;
+       }
+}
+
+static inline bool nvme_req_needs_retry(struct request *req)
+{
+       if (blk_noretry_request(req))
+               return false;
+       if (nvme_req(req)->status & NVME_SC_DNR)
+               return false;
+       if (jiffies - req->start_time >= req->timeout)
+               return false;
+       if (nvme_req(req)->retries >= nvme_max_retries)
+               return false;
+       return true;
+}
+
+void nvme_complete_rq(struct request *req)
+{
+       if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
+               nvme_req(req)->retries++;
+               blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
+               return;
+       }
+
+       blk_mq_end_request(req, nvme_error_status(req));
+}
+EXPORT_SYMBOL_GPL(nvme_complete_rq);
+
 void nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
        int status;
@@ -80,7 +126,9 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
        status = NVME_SC_ABORT_REQ;
        if (blk_queue_dying(req->q))
                status |= NVME_SC_DNR;
-       blk_mq_complete_request(req, status);
+       nvme_req(req)->status = status;
+       blk_mq_complete_request(req);
+
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);
 
@@ -205,12 +253,6 @@ fail:
        return NULL;
 }
 
-void nvme_requeue_req(struct request *req)
-{
-       blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
-}
-EXPORT_SYMBOL_GPL(nvme_requeue_req);
-
 struct request *nvme_alloc_request(struct request_queue *q,
                struct nvme_command *cmd, unsigned int flags, int qid)
 {
@@ -327,6 +369,12 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 {
        int ret = BLK_MQ_RQ_QUEUE_OK;
 
+       if (!(req->rq_flags & RQF_DONTPREP)) {
+               nvme_req(req)->retries = 0;
+               nvme_req(req)->flags = 0;
+               req->rq_flags |= RQF_DONTPREP;
+       }
+
        switch (req_op(req)) {
        case REQ_OP_DRV_IN:
        case REQ_OP_DRV_OUT:
@@ -335,6 +383,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
        case REQ_OP_FLUSH:
                nvme_setup_flush(ns, cmd);
                break;
+       case REQ_OP_WRITE_ZEROES:
+               /* currently only aliased to deallocate for a few ctrls: */
        case REQ_OP_DISCARD:
                ret = nvme_setup_discard(ns, req, cmd);
                break;
@@ -378,7 +428,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
        blk_execute_rq(req->q, NULL, req, at_head);
        if (result)
                *result = nvme_req(req)->result;
-       ret = req->errors;
+       if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+               ret = -EINTR;
+       else
+               ret = nvme_req(req)->status;
  out:
        blk_mq_free_request(req);
        return ret;
@@ -463,7 +516,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
        }
  submit:
        blk_execute_rq(req->q, disk, req, 0);
-       ret = req->errors;
+       if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+               ret = -EINTR;
+       else
+               ret = nvme_req(req)->status;
        if (result)
                *result = le32_to_cpu(nvme_req(req)->result.u32);
        if (meta && !ret && !write) {
@@ -491,15 +547,16 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
                        result, timeout);
 }
 
-static void nvme_keep_alive_end_io(struct request *rq, int error)
+static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
 {
        struct nvme_ctrl *ctrl = rq->end_io_data;
 
        blk_mq_free_request(rq);
 
-       if (error) {
+       if (status) {
                dev_err(ctrl->device,
-                       "failed nvme_keep_alive_end_io error=%d\n", error);
+                       "failed nvme_keep_alive_end_io error=%d\n",
+                               status);
                return;
        }
 
@@ -861,6 +918,29 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
+               u16 bs)
+{
+       struct nvme_ns *ns = disk->private_data;
+       u16 old_ms = ns->ms;
+       u8 pi_type = 0;
+
+       ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+       ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+
+       /* PI implementation requires metadata equal t10 pi tuple size */
+       if (ns->ms == sizeof(struct t10_pi_tuple))
+               pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+
+       if (blk_get_integrity(disk) &&
+           (ns->pi_type != pi_type || ns->ms != old_ms ||
+            bs != queue_logical_block_size(disk->queue) ||
+            (ns->ms && ns->ext)))
+               blk_integrity_unregister(disk);
+
+       ns->pi_type = pi_type;
+}
+
 static void nvme_init_integrity(struct nvme_ns *ns)
 {
        struct blk_integrity integrity;
@@ -887,6 +967,10 @@ static void nvme_init_integrity(struct nvme_ns *ns)
        blk_queue_max_integrity_segments(ns->queue, 1);
 }
 #else
+static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
+               u16 bs)
+{
+}
 static void nvme_init_integrity(struct nvme_ns *ns)
 {
 }
@@ -900,16 +984,14 @@ static void nvme_config_discard(struct nvme_ns *ns)
        BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
                        NVME_DSM_MAX_RANGES);
 
-       if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
-               ns->queue->limits.discard_zeroes_data = 1;
-       else
-               ns->queue->limits.discard_zeroes_data = 0;
-
        ns->queue->limits.discard_alignment = logical_block_size;
        ns->queue->limits.discard_granularity = logical_block_size;
        blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
        blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+
+       if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+               blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
 }
 
 static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
@@ -935,37 +1017,22 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 {
        struct nvme_ns *ns = disk->private_data;
-       u8 lbaf, pi_type;
-       u16 old_ms;
-       unsigned short bs;
-
-       old_ms = ns->ms;
-       lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
-       ns->lba_shift = id->lbaf[lbaf].ds;
-       ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
-       ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+       u16 bs;
 
        /*
         * If identify namespace failed, use default 512 byte block size so
         * block layer can use before failing read/write for 0 capacity.
         */
+       ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
        if (ns->lba_shift == 0)
                ns->lba_shift = 9;
        bs = 1 << ns->lba_shift;
-       /* XXX: PI implementation requires metadata equal t10 pi tuple size */
-       pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
-                                       id->dps & NVME_NS_DPS_PI_MASK : 0;
 
        blk_mq_freeze_queue(disk->queue);
-       if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
-                               ns->ms != old_ms ||
-                               bs != queue_logical_block_size(disk->queue) ||
-                               (ns->ms && ns->ext)))
-               blk_integrity_unregister(disk);
 
-       ns->pi_type = pi_type;
+       if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
+               nvme_prep_integrity(disk, id, bs);
        blk_queue_logical_block_size(ns->queue, bs);
-
        if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
                nvme_init_integrity(ns);
        if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
@@ -1267,7 +1334,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
         * heuristic: we are willing to spend at most 2% of the time
         * transitioning between power states.  Therefore, when running
         * in any given state, we will enter the next lower-power
-        * non-operational state after waiting 100 * (enlat + exlat)
+        * non-operational state after waiting 50 * (enlat + exlat)
         * microseconds, as long as that state's total latency is under
         * the requested maximum latency.
         *
@@ -1278,6 +1345,8 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
 
        unsigned apste;
        struct nvme_feat_auto_pst *table;
+       u64 max_lat_us = 0;
+       int max_ps = -1;
        int ret;
 
        /*
@@ -1299,6 +1368,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
        if (ctrl->ps_max_latency_us == 0) {
                /* Turn off APST. */
                apste = 0;
+               dev_dbg(ctrl->device, "APST disabled\n");
        } else {
                __le64 target = cpu_to_le64(0);
                int state;
@@ -1348,9 +1418,22 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
 
                        target = cpu_to_le64((state << 3) |
                                             (transition_ms << 8));
+
+                       if (max_ps == -1)
+                               max_ps = state;
+
+                       if (total_latency_us > max_lat_us)
+                               max_lat_us = total_latency_us;
                }
 
                apste = 1;
+
+               if (max_ps == -1) {
+                       dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
+               } else {
+                       dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
+                               max_ps, max_lat_us, (int)sizeof(*table), table);
+               }
        }
 
        ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
@@ -1488,6 +1571,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
                }
        }
 
+       if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
+               dev_warn(ctrl->dev, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
+               ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
+       }
+
        ctrl->oacs = le16_to_cpu(id->oacs);
        ctrl->vid = le16_to_cpu(id->vid);
        ctrl->oncs = le16_to_cpup(&id->oncs);
@@ -1510,10 +1598,19 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 
        ctrl->npss = id->npss;
        prev_apsta = ctrl->apsta;
-       ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+       if (ctrl->quirks & NVME_QUIRK_NO_APST) {
+               if (force_apst && id->apsta) {
+                       dev_warn(ctrl->dev, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
+                       ctrl->apsta = 1;
+               } else {
+                       ctrl->apsta = 0;
+               }
+       } else {
+               ctrl->apsta = id->apsta;
+       }
        memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
 
-       if (ctrl->ops->is_fabrics) {
+       if (ctrl->ops->flags & NVME_F_FABRICS) {
                ctrl->icdoff = le16_to_cpu(id->icdoff);
                ctrl->ioccsz = le32_to_cpu(id->ioccsz);
                ctrl->iorcsz = le32_to_cpu(id->iorcsz);
@@ -2006,7 +2103,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
                if (ns->ndev)
                        nvme_nvm_unregister_sysfs(ns);
                del_gendisk(ns->disk);
-               blk_mq_abort_requeue_list(ns->queue);
                blk_cleanup_queue(ns->queue);
        }
 
@@ -2344,8 +2440,16 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
                        continue;
                revalidate_disk(ns->disk);
                blk_set_queue_dying(ns->queue);
-               blk_mq_abort_requeue_list(ns->queue);
-               blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+               /*
+                * Forcibly start all queues to avoid having stuck requests.
+                * Note that we must ensure the queues are not stopped
+                * when the final removal happens.
+                */
+               blk_mq_start_hw_queues(ns->queue);
+
+               /* draining requests in requeue list */
+               blk_mq_kick_requeue_list(ns->queue);
        }
        mutex_unlock(&ctrl->namespaces_mutex);
 }
@@ -2393,7 +2497,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 
        mutex_lock(&ctrl->namespaces_mutex);
        list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_freeze_queue_start(ns->queue);
+               blk_freeze_queue_start(ns->queue);
        mutex_unlock(&ctrl->namespaces_mutex);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);