module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
-unsigned int nvme_max_retries = 5;
-module_param_named(max_retries, nvme_max_retries, uint, 0644);
+static u8 nvme_max_retries = 5;
+module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-EXPORT_SYMBOL_GPL(nvme_max_retries);
static int nvme_char_major;
module_param(nvme_char_major, int, 0);
MODULE_PARM_DESC(default_ps_max_latency_us,
"max power saving latency for new devices; use PM QOS to change per device");
+static bool force_apst;
+module_param(force_apst, bool, 0644);
+MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+
static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);
static struct class *nvme_class;
+static blk_status_t nvme_error_status(struct request *req)
+{
+ switch (nvme_req(req)->status & 0x7ff) {
+ case NVME_SC_SUCCESS:
+ return BLK_STS_OK;
+ case NVME_SC_CAP_EXCEEDED:
+ return BLK_STS_NOSPC;
+ case NVME_SC_ONCS_NOT_SUPPORTED:
+ return BLK_STS_NOTSUPP;
+ case NVME_SC_WRITE_FAULT:
+ case NVME_SC_READ_ERROR:
+ case NVME_SC_UNWRITTEN_BLOCK:
+ return BLK_STS_MEDIUM;
+ default:
+ return BLK_STS_IOERR;
+ }
+}
+
+static inline bool nvme_req_needs_retry(struct request *req)
+{
+ if (blk_noretry_request(req))
+ return false;
+ if (nvme_req(req)->status & NVME_SC_DNR)
+ return false;
+ if (jiffies - req->start_time >= req->timeout)
+ return false;
+ if (nvme_req(req)->retries >= nvme_max_retries)
+ return false;
+ return true;
+}
+
+void nvme_complete_rq(struct request *req)
+{
+ if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
+ nvme_req(req)->retries++;
+ blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
+ return;
+ }
+
+ blk_mq_end_request(req, nvme_error_status(req));
+}
+EXPORT_SYMBOL_GPL(nvme_complete_rq);
+
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
int status;
status = NVME_SC_ABORT_REQ;
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
- blk_mq_complete_request(req, status);
+ nvme_req(req)->status = status;
+ blk_mq_complete_request(req);
+
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
return NULL;
}
-void nvme_requeue_req(struct request *req)
-{
- blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
-}
-EXPORT_SYMBOL_GPL(nvme_requeue_req);
-
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid)
{
{
int ret = BLK_MQ_RQ_QUEUE_OK;
+ if (!(req->rq_flags & RQF_DONTPREP)) {
+ nvme_req(req)->retries = 0;
+ nvme_req(req)->flags = 0;
+ req->rq_flags |= RQF_DONTPREP;
+ }
+
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
break;
+ case REQ_OP_WRITE_ZEROES:
+ /* currently only aliased to deallocate for a few ctrls: */
case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd);
break;
blk_execute_rq(req->q, NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
- ret = req->errors;
+ if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+ else
+ ret = nvme_req(req)->status;
out:
blk_mq_free_request(req);
return ret;
}
submit:
blk_execute_rq(req->q, disk, req, 0);
- ret = req->errors;
+ if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+ else
+ ret = nvme_req(req)->status;
if (result)
*result = le32_to_cpu(nvme_req(req)->result.u32);
if (meta && !ret && !write) {
result, timeout);
}
-static void nvme_keep_alive_end_io(struct request *rq, int error)
+static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
{
struct nvme_ctrl *ctrl = rq->end_io_data;
blk_mq_free_request(rq);
- if (error) {
+ if (status) {
dev_err(ctrl->device,
- "failed nvme_keep_alive_end_io error=%d\n", error);
+ "failed nvme_keep_alive_end_io error=%d\n",
+ status);
return;
}
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
+ u16 bs)
+{
+ struct nvme_ns *ns = disk->private_data;
+ u16 old_ms = ns->ms;
+ u8 pi_type = 0;
+
+ ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+ ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+
+ /* PI implementation requires metadata equal t10 pi tuple size */
+ if (ns->ms == sizeof(struct t10_pi_tuple))
+ pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+
+ if (blk_get_integrity(disk) &&
+ (ns->pi_type != pi_type || ns->ms != old_ms ||
+ bs != queue_logical_block_size(disk->queue) ||
+ (ns->ms && ns->ext)))
+ blk_integrity_unregister(disk);
+
+ ns->pi_type = pi_type;
+}
+
static void nvme_init_integrity(struct nvme_ns *ns)
{
struct blk_integrity integrity;
blk_queue_max_integrity_segments(ns->queue, 1);
}
#else
+static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
+ u16 bs)
+{
+}
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
- ns->queue->limits.discard_zeroes_data = 1;
- else
- ns->queue->limits.discard_zeroes_data = 0;
-
ns->queue->limits.discard_alignment = logical_block_size;
ns->queue->limits.discard_granularity = logical_block_size;
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+
+ if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+ blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
}
static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
- u8 lbaf, pi_type;
- u16 old_ms;
- unsigned short bs;
-
- old_ms = ns->ms;
- lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
- ns->lba_shift = id->lbaf[lbaf].ds;
- ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
- ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+ u16 bs;
/*
* If identify namespace failed, use default 512 byte block size so
* block layer can use before failing read/write for 0 capacity.
*/
+ ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
bs = 1 << ns->lba_shift;
- /* XXX: PI implementation requires metadata equal t10 pi tuple size */
- pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
- id->dps & NVME_NS_DPS_PI_MASK : 0;
blk_mq_freeze_queue(disk->queue);
- if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
- ns->ms != old_ms ||
- bs != queue_logical_block_size(disk->queue) ||
- (ns->ms && ns->ext)))
- blk_integrity_unregister(disk);
- ns->pi_type = pi_type;
+ if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
+ nvme_prep_integrity(disk, id, bs);
blk_queue_logical_block_size(ns->queue, bs);
-
if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
nvme_init_integrity(ns);
if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
* heuristic: we are willing to spend at most 2% of the time
* transitioning between power states. Therefore, when running
* in any given state, we will enter the next lower-power
- * non-operational state after waiting 100 * (enlat + exlat)
+ * non-operational state after waiting 50 * (enlat + exlat)
* microseconds, as long as that state's total latency is under
* the requested maximum latency.
*
unsigned apste;
struct nvme_feat_auto_pst *table;
+ u64 max_lat_us = 0;
+ int max_ps = -1;
int ret;
/*
if (ctrl->ps_max_latency_us == 0) {
/* Turn off APST. */
apste = 0;
+ dev_dbg(ctrl->device, "APST disabled\n");
} else {
__le64 target = cpu_to_le64(0);
int state;
target = cpu_to_le64((state << 3) |
(transition_ms << 8));
+
+ if (max_ps == -1)
+ max_ps = state;
+
+ if (total_latency_us > max_lat_us)
+ max_lat_us = total_latency_us;
}
apste = 1;
+
+ if (max_ps == -1) {
+ dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
+ } else {
+ dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
+ max_ps, max_lat_us, (int)sizeof(*table), table);
+ }
}
ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
}
}
+ if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
+ dev_warn(ctrl->dev, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
+ ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
+ }
+
ctrl->oacs = le16_to_cpu(id->oacs);
ctrl->vid = le16_to_cpu(id->vid);
ctrl->oncs = le16_to_cpup(&id->oncs);
ctrl->npss = id->npss;
prev_apsta = ctrl->apsta;
- ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+ if (ctrl->quirks & NVME_QUIRK_NO_APST) {
+ if (force_apst && id->apsta) {
+ dev_warn(ctrl->dev, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
+ ctrl->apsta = 1;
+ } else {
+ ctrl->apsta = 0;
+ }
+ } else {
+ ctrl->apsta = id->apsta;
+ }
memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
- if (ctrl->ops->is_fabrics) {
+ if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->icdoff = le16_to_cpu(id->icdoff);
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
if (ns->ndev)
nvme_nvm_unregister_sysfs(ns);
del_gendisk(ns->disk);
- blk_mq_abort_requeue_list(ns->queue);
blk_cleanup_queue(ns->queue);
}
continue;
revalidate_disk(ns->disk);
blk_set_queue_dying(ns->queue);
- blk_mq_abort_requeue_list(ns->queue);
- blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+ /*
+ * Forcibly start all queues to avoid having stuck requests.
+ * Note that we must ensure the queues are not stopped
+ * when the final removal happens.
+ */
+ blk_mq_start_hw_queues(ns->queue);
+
+ /* draining requests in requeue list */
+ blk_mq_kick_requeue_list(ns->queue);
}
mutex_unlock(&ctrl->namespaces_mutex);
}
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
- blk_mq_freeze_queue_start(ns->queue);
+ blk_freeze_queue_start(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);