1 /*
2  * NVM Express device driver
3  * Copyright (c) 2011-2014, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14
15 #include <linux/blkdev.h>
16 #include <linux/blk-mq.h>
17 #include <linux/delay.h>
18 #include <linux/errno.h>
19 #include <linux/hdreg.h>
20 #include <linux/kernel.h>
21 #include <linux/module.h>
22 #include <linux/list_sort.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/pr.h>
26 #include <linux/ptrace.h>
27 #include <linux/nvme_ioctl.h>
28 #include <linux/t10-pi.h>
29 #include <scsi/sg.h>
30 #include <asm/unaligned.h>
31
32 #include "nvme.h"
33 #include "fabrics.h"
34
35 #define NVME_MINORS             (1U << MINORBITS)
36
37 unsigned char admin_timeout = 60;
38 module_param(admin_timeout, byte, 0644);
39 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
40 EXPORT_SYMBOL_GPL(admin_timeout);
41
42 unsigned char nvme_io_timeout = 30;
43 module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
44 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
45 EXPORT_SYMBOL_GPL(nvme_io_timeout);
46
47 unsigned char shutdown_timeout = 5;
48 module_param(shutdown_timeout, byte, 0644);
49 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
50
51 static int nvme_major;
52 module_param(nvme_major, int, 0);
53
54 static int nvme_char_major;
55 module_param(nvme_char_major, int, 0);
56
57 static LIST_HEAD(nvme_ctrl_list);
58 static DEFINE_SPINLOCK(dev_list_lock);
59
60 static struct class *nvme_class;
61
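/*
 * Busy-tag iterator callback used by the transport drivers (via
 * blk_mq_tagset_busy_iter()) to fail all outstanding requests when a queue is
 * torn down.  Started requests are completed with an Abort status; if the
 * queue is already dying the DNR bit is set so the command is not retried.
 */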
62 void nvme_cancel_request(struct request *req, void *data, bool reserved)
63 {
64         int status;
65
66         if (!blk_mq_request_started(req))
67                 return;
68
69         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
70                                 "Cancelling I/O %d", req->tag);
71
72         status = NVME_SC_ABORT_REQ;
73         if (blk_queue_dying(req->q))
74                 status |= NVME_SC_DNR;
75         blk_mq_complete_request(req, status);
76 }
77 EXPORT_SYMBOL_GPL(nvme_cancel_request);
78
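/*
 * Controller state machine: only the old-state/new-state combinations
 * enumerated below are valid (e.g. NEW, RESETTING or RECONNECTING may go to
 * LIVE, but only DELETING may go to DEAD).  Returns true if the transition
 * was accepted and the controller state updated, false otherwise.
 */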
79 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
80                 enum nvme_ctrl_state new_state)
81 {
82         enum nvme_ctrl_state old_state = ctrl->state;
83         bool changed = false;
84
85         spin_lock_irq(&ctrl->lock);
86         switch (new_state) {
87         case NVME_CTRL_LIVE:
88                 switch (old_state) {
89                 case NVME_CTRL_NEW:
90                 case NVME_CTRL_RESETTING:
91                 case NVME_CTRL_RECONNECTING:
92                         changed = true;
93                         /* FALLTHRU */
94                 default:
95                         break;
96                 }
97                 break;
98         case NVME_CTRL_RESETTING:
99                 switch (old_state) {
100                 case NVME_CTRL_NEW:
101                 case NVME_CTRL_LIVE:
102                 case NVME_CTRL_RECONNECTING:
103                         changed = true;
104                         /* FALLTHRU */
105                 default:
106                         break;
107                 }
108                 break;
109         case NVME_CTRL_RECONNECTING:
110                 switch (old_state) {
111                 case NVME_CTRL_LIVE:
112                         changed = true;
113                         /* FALLTHRU */
114                 default:
115                         break;
116                 }
117                 break;
118         case NVME_CTRL_DELETING:
119                 switch (old_state) {
120                 case NVME_CTRL_LIVE:
121                 case NVME_CTRL_RESETTING:
122                 case NVME_CTRL_RECONNECTING:
123                         changed = true;
124                         /* FALLTHRU */
125                 default:
126                         break;
127                 }
128                 break;
129         case NVME_CTRL_DEAD:
130                 switch (old_state) {
131                 case NVME_CTRL_DELETING:
132                         changed = true;
133                         /* FALLTHRU */
134                 default:
135                         break;
136                 }
137                 break;
138         default:
139                 break;
140         }
141         spin_unlock_irq(&ctrl->lock);
142
143         if (changed)
144                 ctrl->state = new_state;
145
146         return changed;
147 }
148 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
149
150 static void nvme_free_ns(struct kref *kref)
151 {
152         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
153
154         if (ns->type == NVME_NS_LIGHTNVM)
155                 nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
156
157         spin_lock(&dev_list_lock);
158         ns->disk->private_data = NULL;
159         spin_unlock(&dev_list_lock);
160
161         put_disk(ns->disk);
162         ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
163         nvme_put_ctrl(ns->ctrl);
164         kfree(ns);
165 }
166
167 static void nvme_put_ns(struct nvme_ns *ns)
168 {
169         kref_put(&ns->kref, nvme_free_ns);
170 }
171
172 static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
173 {
174         struct nvme_ns *ns;
175
176         spin_lock(&dev_list_lock);
177         ns = disk->private_data;
178         if (ns) {
179                 if (!kref_get_unless_zero(&ns->kref))
180                         goto fail;
181                 if (!try_module_get(ns->ctrl->ops->module))
182                         goto fail_put_ns;
183         }
184         spin_unlock(&dev_list_lock);
185
186         return ns;
187
188 fail_put_ns:
189         kref_put(&ns->kref, nvme_free_ns);
190 fail:
191         spin_unlock(&dev_list_lock);
192         return NULL;
193 }
194
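/*
 * Put a request back on the requeue list and kick the list so it gets
 * redispatched, unless the queue is stopped (e.g. during a controller reset),
 * in which case the request stays parked until the queue is restarted.
 */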
195 void nvme_requeue_req(struct request *req)
196 {
197         unsigned long flags;
198
199         blk_mq_requeue_request(req);
200         spin_lock_irqsave(req->q->queue_lock, flags);
201         if (!blk_queue_stopped(req->q))
202                 blk_mq_kick_requeue_list(req->q);
203         spin_unlock_irqrestore(req->q->queue_lock, flags);
204 }
205 EXPORT_SYMBOL_GPL(nvme_requeue_req);
206
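/*
 * Allocate a block layer request carrying an NVMe command.  With NVME_QID_ANY
 * the request may run on any hardware context; otherwise it is bound to the
 * hardware context backing the given queue ID (qid N maps to hctx N - 1).
 */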
207 struct request *nvme_alloc_request(struct request_queue *q,
208                 struct nvme_command *cmd, unsigned int flags, int qid)
209 {
210         struct request *req;
211
212         if (qid == NVME_QID_ANY) {
213                 req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags);
214         } else {
215                 req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags,
216                                 qid ? qid - 1 : 0);
217         }
218         if (IS_ERR(req))
219                 return req;
220
221         req->cmd_type = REQ_TYPE_DRV_PRIV;
222         req->cmd_flags |= REQ_FAILFAST_DRIVER;
223         req->__data_len = 0;
224         req->__sector = (sector_t) -1;
225         req->bio = req->biotail = NULL;
226
227         req->cmd = (unsigned char *)cmd;
228         req->cmd_len = sizeof(struct nvme_command);
229
230         return req;
231 }
232 EXPORT_SYMBOL_GPL(nvme_alloc_request);
233
234 static inline void nvme_setup_flush(struct nvme_ns *ns,
235                 struct nvme_command *cmnd)
236 {
237         memset(cmnd, 0, sizeof(*cmnd));
238         cmnd->common.opcode = nvme_cmd_flush;
239         cmnd->common.nsid = cpu_to_le32(ns->ns_id);
240 }
241
242 static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
243                 struct nvme_command *cmnd)
244 {
245         struct nvme_dsm_range *range;
246         struct page *page;
247         int offset;
248         unsigned int nr_bytes = blk_rq_bytes(req);
249
250         range = kmalloc(sizeof(*range), GFP_ATOMIC);
251         if (!range)
252                 return BLK_MQ_RQ_QUEUE_BUSY;
253
254         range->cattr = cpu_to_le32(0);
255         range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift);
256         range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
257
258         memset(cmnd, 0, sizeof(*cmnd));
259         cmnd->dsm.opcode = nvme_cmd_dsm;
260         cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
261         cmnd->dsm.nr = 0;
262         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
263
264         req->completion_data = range;
265         page = virt_to_page(range);
266         offset = offset_in_page(range);
267         blk_add_request_payload(req, page, offset, sizeof(*range));
268
269         /*
270          * We set __data_len back to the size of the area to be discarded
271          * on disk, so that completion is reported for the full number of
272          * blocks described by the request.
273          */
274         req->__data_len = nr_bytes;
275
276         return 0;
277 }
278
279 static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
280                 struct nvme_command *cmnd)
281 {
282         u16 control = 0;
283         u32 dsmgmt = 0;
284
285         if (req->cmd_flags & REQ_FUA)
286                 control |= NVME_RW_FUA;
287         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
288                 control |= NVME_RW_LR;
289
290         if (req->cmd_flags & REQ_RAHEAD)
291                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
292
293         memset(cmnd, 0, sizeof(*cmnd));
294         cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
295         cmnd->rw.command_id = req->tag;
296         cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
297         cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
298         cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
299
300         if (ns->ms) {
301                 switch (ns->pi_type) {
302                 case NVME_NS_DPS_PI_TYPE3:
303                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
304                         break;
305                 case NVME_NS_DPS_PI_TYPE1:
306                 case NVME_NS_DPS_PI_TYPE2:
307                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
308                                         NVME_RW_PRINFO_PRCHK_REF;
309                         cmnd->rw.reftag = cpu_to_le32(
310                                         nvme_block_nr(ns, blk_rq_pos(req)));
311                         break;
312                 }
313                 if (!blk_integrity_rq(req))
314                         control |= NVME_RW_PRINFO_PRACT;
315         }
316
317         cmnd->rw.control = cpu_to_le16(control);
318         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
319 }
320
321 int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
322                 struct nvme_command *cmd)
323 {
324         int ret = 0;
325
326         if (req->cmd_type == REQ_TYPE_DRV_PRIV)
327                 memcpy(cmd, req->cmd, sizeof(*cmd));
328         else if (req_op(req) == REQ_OP_FLUSH)
329                 nvme_setup_flush(ns, cmd);
330         else if (req_op(req) == REQ_OP_DISCARD)
331                 ret = nvme_setup_discard(ns, req, cmd);
332         else
333                 nvme_setup_rw(ns, req, cmd);
334
335         return ret;
336 }
337 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
338
339 /*
340  * Returns 0 on success.  If the result is negative, it's a Linux error code;
341  * if the result is positive, it's an NVM Express status code
342  */
343 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
344                 struct nvme_completion *cqe, void *buffer, unsigned bufflen,
345                 unsigned timeout, int qid, int at_head, int flags)
346 {
347         struct request *req;
348         int ret;
349
350         req = nvme_alloc_request(q, cmd, flags, qid);
351         if (IS_ERR(req))
352                 return PTR_ERR(req);
353
354         req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
355         req->special = cqe;
356
357         if (buffer && bufflen) {
358                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
359                 if (ret)
360                         goto out;
361         }
362
363         blk_execute_rq(req->q, NULL, req, at_head);
364         ret = req->errors;
365  out:
366         blk_mq_free_request(req);
367         return ret;
368 }
369 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
370
371 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
372                 void *buffer, unsigned bufflen)
373 {
374         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
375                         NVME_QID_ANY, 0, 0);
376 }
377 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
378
379 int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
380                 void __user *ubuffer, unsigned bufflen,
381                 void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
382                 u32 *result, unsigned timeout)
383 {
384         bool write = nvme_is_write(cmd);
385         struct nvme_completion cqe;
386         struct nvme_ns *ns = q->queuedata;
387         struct gendisk *disk = ns ? ns->disk : NULL;
388         struct request *req;
389         struct bio *bio = NULL;
390         void *meta = NULL;
391         int ret;
392
393         req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
394         if (IS_ERR(req))
395                 return PTR_ERR(req);
396
397         req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
398         req->special = &cqe;
399
400         if (ubuffer && bufflen) {
401                 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
402                                 GFP_KERNEL);
403                 if (ret)
404                         goto out;
405                 bio = req->bio;
406
407                 if (!disk)
408                         goto submit;
409                 bio->bi_bdev = bdget_disk(disk, 0);
410                 if (!bio->bi_bdev) {
411                         ret = -ENODEV;
412                         goto out_unmap;
413                 }
414
415                 if (meta_buffer && meta_len) {
416                         struct bio_integrity_payload *bip;
417
418                         meta = kmalloc(meta_len, GFP_KERNEL);
419                         if (!meta) {
420                                 ret = -ENOMEM;
421                                 goto out_unmap;
422                         }
423
424                         if (write) {
425                                 if (copy_from_user(meta, meta_buffer,
426                                                 meta_len)) {
427                                         ret = -EFAULT;
428                                         goto out_free_meta;
429                                 }
430                         }
431
432                         bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
433                         if (IS_ERR(bip)) {
434                                 ret = PTR_ERR(bip);
435                                 goto out_free_meta;
436                         }
437
438                         bip->bip_iter.bi_size = meta_len;
439                         bip->bip_iter.bi_sector = meta_seed;
440
441                         ret = bio_integrity_add_page(bio, virt_to_page(meta),
442                                         meta_len, offset_in_page(meta));
443                         if (ret != meta_len) {
444                                 ret = -ENOMEM;
445                                 goto out_free_meta;
446                         }
447                 }
448         }
449  submit:
450         blk_execute_rq(req->q, disk, req, 0);
451         ret = req->errors;
452         if (result)
453                 *result = le32_to_cpu(cqe.result);
454         if (meta && !ret && !write) {
455                 if (copy_to_user(meta_buffer, meta, meta_len))
456                         ret = -EFAULT;
457         }
458  out_free_meta:
459         kfree(meta);
460  out_unmap:
461         if (bio) {
462                 if (disk && bio->bi_bdev)
463                         bdput(bio->bi_bdev);
464                 blk_rq_unmap_user(bio);
465         }
466  out:
467         blk_mq_free_request(req);
468         return ret;
469 }
470
471 int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
472                 void __user *ubuffer, unsigned bufflen, u32 *result,
473                 unsigned timeout)
474 {
475         return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
476                         result, timeout);
477 }
478
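/*
 * Keep Alive completion handler: on success re-arm the delayed work so the
 * next Keep Alive command goes out one KATO interval later; on error log the
 * failure and stop rescheduling.
 */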
479 static void nvme_keep_alive_end_io(struct request *rq, int error)
480 {
481         struct nvme_ctrl *ctrl = rq->end_io_data;
482
483         blk_mq_free_request(rq);
484
485         if (error) {
486                 dev_err(ctrl->device,
487                         "failed nvme_keep_alive_end_io error=%d\n", error);
488                 return;
489         }
490
491         schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
492 }
493
494 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
495 {
496         struct nvme_command c;
497         struct request *rq;
498
499         memset(&c, 0, sizeof(c));
500         c.common.opcode = nvme_admin_keep_alive;
501
502         rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
503                         NVME_QID_ANY);
504         if (IS_ERR(rq))
505                 return PTR_ERR(rq);
506
507         rq->timeout = ctrl->kato * HZ;
508         rq->end_io_data = ctrl;
509
510         blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
511
512         return 0;
513 }
514
515 static void nvme_keep_alive_work(struct work_struct *work)
516 {
517         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
518                         struct nvme_ctrl, ka_work);
519
520         if (nvme_keep_alive(ctrl)) {
521                 /* allocation failure, reset the controller */
522                 dev_err(ctrl->device, "keep-alive failed\n");
523                 ctrl->ops->reset_ctrl(ctrl);
524                 return;
525         }
526 }
527
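/*
 * Start/stop the periodic Keep Alive work.  A Keep Alive Timeout (kato) of
 * zero means keep-alive is not in use, so both helpers are no-ops in that
 * case; for fabrics controllers keep-alive support is mandatory (see
 * nvme_init_identify()).
 */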
528 void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
529 {
530         if (unlikely(ctrl->kato == 0))
531                 return;
532
533         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
534         schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
535 }
536 EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
537
538 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
539 {
540         if (unlikely(ctrl->kato == 0))
541                 return;
542
543         cancel_delayed_work_sync(&ctrl->ka_work);
544 }
545 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
546
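/*
 * Identify helpers: nvme_identify_ctrl() (CNS 1) and nvme_identify_ns()
 * (CNS 0) allocate the returned data structure for the caller and free it
 * again on error; nvme_identify_ns_list() (CNS 2) fills a caller supplied
 * 4k buffer with the list of active namespace IDs.
 */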
547 int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
548 {
549         struct nvme_command c = { };
550         int error;
551
552         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
553         c.identify.opcode = nvme_admin_identify;
554         c.identify.cns = cpu_to_le32(1);
555
556         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
557         if (!*id)
558                 return -ENOMEM;
559
560         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
561                         sizeof(struct nvme_id_ctrl));
562         if (error)
563                 kfree(*id);
564         return error;
565 }
566
567 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
568 {
569         struct nvme_command c = { };
570
571         c.identify.opcode = nvme_admin_identify;
572         c.identify.cns = cpu_to_le32(2);
573         c.identify.nsid = cpu_to_le32(nsid);
574         return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
575 }
576
577 int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
578                 struct nvme_id_ns **id)
579 {
580         struct nvme_command c = { };
581         int error;
582
583         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
584         c.identify.opcode = nvme_admin_identify;
585         c.identify.nsid = cpu_to_le32(nsid);
586
587         *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
588         if (!*id)
589                 return -ENOMEM;
590
591         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
592                         sizeof(struct nvme_id_ns));
593         if (error)
594                 kfree(*id);
595         return error;
596 }
597
598 int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
599                                         dma_addr_t dma_addr, u32 *result)
600 {
601         struct nvme_command c;
602         struct nvme_completion cqe;
603         int ret;
604
605         memset(&c, 0, sizeof(c));
606         c.features.opcode = nvme_admin_get_features;
607         c.features.nsid = cpu_to_le32(nsid);
608         c.features.dptr.prp1 = cpu_to_le64(dma_addr);
609         c.features.fid = cpu_to_le32(fid);
610
611         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0,
612                         NVME_QID_ANY, 0, 0);
613         if (ret >= 0)
614                 *result = le32_to_cpu(cqe.result);
615         return ret;
616 }
617
618 int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
619                                         dma_addr_t dma_addr, u32 *result)
620 {
621         struct nvme_command c;
622         struct nvme_completion cqe;
623         int ret;
624
625         memset(&c, 0, sizeof(c));
626         c.features.opcode = nvme_admin_set_features;
627         c.features.dptr.prp1 = cpu_to_le64(dma_addr);
628         c.features.fid = cpu_to_le32(fid);
629         c.features.dword11 = cpu_to_le32(dword11);
630
631         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0,
632                         NVME_QID_ANY, 0, 0);
633         if (ret >= 0)
634                 *result = le32_to_cpu(cqe.result);
635         return ret;
636 }
637
638 int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
639 {
640         struct nvme_command c = { };
641         int error;
642
643         c.common.opcode = nvme_admin_get_log_page;
644         c.common.nsid = cpu_to_le32(0xFFFFFFFF);
645         c.common.cdw10[0] = cpu_to_le32(
646                         (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
647                          NVME_LOG_SMART);
648
649         *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
650         if (!*log)
651                 return -ENOMEM;
652
653         error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
654                         sizeof(struct nvme_smart_log));
655         if (error)
656                 kfree(*log);
657         return error;
658 }
659
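/*
 * Negotiate the number of I/O queues with the controller using the Number of
 * Queues feature (the 0's based submission and completion queue counts are
 * packed into dword11).  On return *count holds the usable queue count,
 * capped at the requested value, or 0 if the controller failed the command.
 */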
660 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
661 {
662         u32 q_count = (*count - 1) | ((*count - 1) << 16);
663         u32 result;
664         int status, nr_io_queues;
665
666         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
667                         &result);
668         if (status < 0)
669                 return status;
670
671         /*
672          * Degraded controllers might return an error when setting the queue
673          * count.  We still want to be able to bring them online and offer
674          * access to the admin queue, as that might be the only way to fix them up.
675          */
676         if (status > 0) {
677                 dev_err(ctrl->dev, "Could not set queue count (%d)\n", status);
678                 *count = 0;
679         } else {
680                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
681                 *count = min(*count, nr_io_queues);
682         }
683
684         return 0;
685 }
686 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
687
688 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
689 {
690         struct nvme_user_io io;
691         struct nvme_command c;
692         unsigned length, meta_len;
693         void __user *metadata;
694
695         if (copy_from_user(&io, uio, sizeof(io)))
696                 return -EFAULT;
697         if (io.flags)
698                 return -EINVAL;
699
700         switch (io.opcode) {
701         case nvme_cmd_write:
702         case nvme_cmd_read:
703         case nvme_cmd_compare:
704                 break;
705         default:
706                 return -EINVAL;
707         }
708
709         length = (io.nblocks + 1) << ns->lba_shift;
710         meta_len = (io.nblocks + 1) * ns->ms;
711         metadata = (void __user *)(uintptr_t)io.metadata;
712
713         if (ns->ext) {
714                 length += meta_len;
715                 meta_len = 0;
716         } else if (meta_len) {
717                 if ((io.metadata & 3) || !io.metadata)
718                         return -EINVAL;
719         }
720
721         memset(&c, 0, sizeof(c));
722         c.rw.opcode = io.opcode;
723         c.rw.flags = io.flags;
724         c.rw.nsid = cpu_to_le32(ns->ns_id);
725         c.rw.slba = cpu_to_le64(io.slba);
726         c.rw.length = cpu_to_le16(io.nblocks);
727         c.rw.control = cpu_to_le16(io.control);
728         c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
729         c.rw.reftag = cpu_to_le32(io.reftag);
730         c.rw.apptag = cpu_to_le16(io.apptag);
731         c.rw.appmask = cpu_to_le16(io.appmask);
732
733         return __nvme_submit_user_cmd(ns->queue, &c,
734                         (void __user *)(uintptr_t)io.addr, length,
735                         metadata, meta_len, io.slba, NULL, 0);
736 }
737
738 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
739                         struct nvme_passthru_cmd __user *ucmd)
740 {
741         struct nvme_passthru_cmd cmd;
742         struct nvme_command c;
743         unsigned timeout = 0;
744         int status;
745
746         if (!capable(CAP_SYS_ADMIN))
747                 return -EACCES;
748         if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
749                 return -EFAULT;
750         if (cmd.flags)
751                 return -EINVAL;
752
753         memset(&c, 0, sizeof(c));
754         c.common.opcode = cmd.opcode;
755         c.common.flags = cmd.flags;
756         c.common.nsid = cpu_to_le32(cmd.nsid);
757         c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
758         c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
759         c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
760         c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
761         c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
762         c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
763         c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
764         c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
765
766         if (cmd.timeout_ms)
767                 timeout = msecs_to_jiffies(cmd.timeout_ms);
768
769         status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
770                         (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
771                         &cmd.result, timeout);
772         if (status >= 0) {
773                 if (put_user(cmd.result, &ucmd->result))
774                         return -EFAULT;
775         }
776
777         return status;
778 }
779
780 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
781                 unsigned int cmd, unsigned long arg)
782 {
783         struct nvme_ns *ns = bdev->bd_disk->private_data;
784
785         switch (cmd) {
786         case NVME_IOCTL_ID:
787                 force_successful_syscall_return();
788                 return ns->ns_id;
789         case NVME_IOCTL_ADMIN_CMD:
790                 return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
791         case NVME_IOCTL_IO_CMD:
792                 return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
793         case NVME_IOCTL_SUBMIT_IO:
794                 return nvme_submit_io(ns, (void __user *)arg);
795 #ifdef CONFIG_BLK_DEV_NVME_SCSI
796         case SG_GET_VERSION_NUM:
797                 return nvme_sg_get_version_num((void __user *)arg);
798         case SG_IO:
799                 return nvme_sg_io(ns, (void __user *)arg);
800 #endif
801         default:
802                 return -ENOTTY;
803         }
804 }
805
806 #ifdef CONFIG_COMPAT
807 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
808                         unsigned int cmd, unsigned long arg)
809 {
810         switch (cmd) {
811         case SG_IO:
812                 return -ENOIOCTLCMD;
813         }
814         return nvme_ioctl(bdev, mode, cmd, arg);
815 }
816 #else
817 #define nvme_compat_ioctl       NULL
818 #endif
819
820 static int nvme_open(struct block_device *bdev, fmode_t mode)
821 {
822         return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
823 }
824
825 static void nvme_release(struct gendisk *disk, fmode_t mode)
826 {
827         struct nvme_ns *ns = disk->private_data;
828
829         module_put(ns->ctrl->ops->module);
830         nvme_put_ns(ns);
831 }
832
833 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
834 {
835         /* some standard values */
836         geo->heads = 1 << 6;
837         geo->sectors = 1 << 5;
838         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
839         return 0;
840 }
841
842 #ifdef CONFIG_BLK_DEV_INTEGRITY
843 static void nvme_init_integrity(struct nvme_ns *ns)
844 {
845         struct blk_integrity integrity;
846
847         switch (ns->pi_type) {
848         case NVME_NS_DPS_PI_TYPE3:
849                 integrity.profile = &t10_pi_type3_crc;
850                 integrity.tag_size = sizeof(u16) + sizeof(u32);
851                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
852                 break;
853         case NVME_NS_DPS_PI_TYPE1:
854         case NVME_NS_DPS_PI_TYPE2:
855                 integrity.profile = &t10_pi_type1_crc;
856                 integrity.tag_size = sizeof(u16);
857                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
858                 break;
859         default:
860                 integrity.profile = NULL;
861                 break;
862         }
863         integrity.tuple_size = ns->ms;
864         blk_integrity_register(ns->disk, &integrity);
865         blk_queue_max_integrity_segments(ns->queue, 1);
866 }
867 #else
868 static void nvme_init_integrity(struct nvme_ns *ns)
869 {
870 }
871 #endif /* CONFIG_BLK_DEV_INTEGRITY */
872
873 static void nvme_config_discard(struct nvme_ns *ns)
874 {
875         struct nvme_ctrl *ctrl = ns->ctrl;
876         u32 logical_block_size = queue_logical_block_size(ns->queue);
877
878         if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
879                 ns->queue->limits.discard_zeroes_data = 1;
880         else
881                 ns->queue->limits.discard_zeroes_data = 0;
882
883         ns->queue->limits.discard_alignment = logical_block_size;
884         ns->queue->limits.discard_granularity = logical_block_size;
885         blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
886         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
887 }
888
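/*
 * Re-read the Identify Namespace data and refresh the cached namespace
 * parameters: LBA size, metadata and protection information settings,
 * EUI-64/NGUID, capacity and discard support.
 */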
889 static int nvme_revalidate_disk(struct gendisk *disk)
890 {
891         struct nvme_ns *ns = disk->private_data;
892         struct nvme_id_ns *id;
893         u8 lbaf, pi_type;
894         u16 old_ms;
895         unsigned short bs;
896
897         if (test_bit(NVME_NS_DEAD, &ns->flags)) {
898                 set_capacity(disk, 0);
899                 return -ENODEV;
900         }
901         if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
902                 dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n",
903                                 __func__);
904                 return -ENODEV;
905         }
906         if (id->ncap == 0) {
907                 kfree(id);
908                 return -ENODEV;
909         }
910
911         if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
912                 if (nvme_nvm_register(ns->queue, disk->disk_name)) {
913                         dev_warn(disk_to_dev(ns->disk),
914                                 "%s: LightNVM init failure\n", __func__);
915                         kfree(id);
916                         return -ENODEV;
917                 }
918                 ns->type = NVME_NS_LIGHTNVM;
919         }
920
921         if (ns->ctrl->vs >= NVME_VS(1, 1))
922                 memcpy(ns->eui, id->eui64, sizeof(ns->eui));
923         if (ns->ctrl->vs >= NVME_VS(1, 2))
924                 memcpy(ns->uuid, id->nguid, sizeof(ns->uuid));
925
926         old_ms = ns->ms;
927         lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
928         ns->lba_shift = id->lbaf[lbaf].ds;
929         ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
930         ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
931
932         /*
933          * If the namespace reports a zero LBA data size, fall back to 512
934          * byte blocks so the block layer still works for the 0 capacity disk.
935          */
936         if (ns->lba_shift == 0)
937                 ns->lba_shift = 9;
938         bs = 1 << ns->lba_shift;
939         /* XXX: PI implementation requires metadata equal t10 pi tuple size */
940         pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
941                                         id->dps & NVME_NS_DPS_PI_MASK : 0;
942
943         blk_mq_freeze_queue(disk->queue);
944         if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
945                                 ns->ms != old_ms ||
946                                 bs != queue_logical_block_size(disk->queue) ||
947                                 (ns->ms && ns->ext)))
948                 blk_integrity_unregister(disk);
949
950         ns->pi_type = pi_type;
951         blk_queue_logical_block_size(ns->queue, bs);
952
953         if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
954                 nvme_init_integrity(ns);
955         if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
956                 set_capacity(disk, 0);
957         else
958                 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
959
960         if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
961                 nvme_config_discard(ns);
962         blk_mq_unfreeze_queue(disk->queue);
963
964         kfree(id);
965         return 0;
966 }
967
968 static char nvme_pr_type(enum pr_type type)
969 {
970         switch (type) {
971         case PR_WRITE_EXCLUSIVE:
972                 return 1;
973         case PR_EXCLUSIVE_ACCESS:
974                 return 2;
975         case PR_WRITE_EXCLUSIVE_REG_ONLY:
976                 return 3;
977         case PR_EXCLUSIVE_ACCESS_REG_ONLY:
978                 return 4;
979         case PR_WRITE_EXCLUSIVE_ALL_REGS:
980                 return 5;
981         case PR_EXCLUSIVE_ACCESS_ALL_REGS:
982                 return 6;
983         default:
984                 return 0;
985         }
986 };
987
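/*
 * Persistent reservation helpers: translate the block layer pr_ops into NVMe
 * reservation commands.  The current and new reservation keys are passed in a
 * 16 byte data buffer; the action, reservation type and "ignore existing key"
 * flag are encoded in cdw10.
 */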
988 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
989                                 u64 key, u64 sa_key, u8 op)
990 {
991         struct nvme_ns *ns = bdev->bd_disk->private_data;
992         struct nvme_command c;
993         u8 data[16] = { 0, };
994
995         put_unaligned_le64(key, &data[0]);
996         put_unaligned_le64(sa_key, &data[8]);
997
998         memset(&c, 0, sizeof(c));
999         c.common.opcode = op;
1000         c.common.nsid = cpu_to_le32(ns->ns_id);
1001         c.common.cdw10[0] = cpu_to_le32(cdw10);
1002
1003         return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
1004 }
1005
1006 static int nvme_pr_register(struct block_device *bdev, u64 old,
1007                 u64 new, unsigned flags)
1008 {
1009         u32 cdw10;
1010
1011         if (flags & ~PR_FL_IGNORE_KEY)
1012                 return -EOPNOTSUPP;
1013
1014         cdw10 = old ? 2 : 0;
1015         cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
1016         cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
1017         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
1018 }
1019
1020 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
1021                 enum pr_type type, unsigned flags)
1022 {
1023         u32 cdw10;
1024
1025         if (flags & ~PR_FL_IGNORE_KEY)
1026                 return -EOPNOTSUPP;
1027
1028         cdw10 = nvme_pr_type(type) << 8;
1029         cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
1030         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
1031 }
1032
1033 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
1034                 enum pr_type type, bool abort)
1035 {
1036         u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
1037         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
1038 }
1039
1040 static int nvme_pr_clear(struct block_device *bdev, u64 key)
1041 {
1042         u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1043         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
1044 }
1045
1046 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
1047 {
1048         u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
1049         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
1050 }
1051
1052 static const struct pr_ops nvme_pr_ops = {
1053         .pr_register    = nvme_pr_register,
1054         .pr_reserve     = nvme_pr_reserve,
1055         .pr_release     = nvme_pr_release,
1056         .pr_preempt     = nvme_pr_preempt,
1057         .pr_clear       = nvme_pr_clear,
1058 };
1059
1060 static const struct block_device_operations nvme_fops = {
1061         .owner          = THIS_MODULE,
1062         .ioctl          = nvme_ioctl,
1063         .compat_ioctl   = nvme_compat_ioctl,
1064         .open           = nvme_open,
1065         .release        = nvme_release,
1066         .getgeo         = nvme_getgeo,
1067         .revalidate_disk = nvme_revalidate_disk,
1068         .pr_ops         = &nvme_pr_ops,
1069 };
1070
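/*
 * Poll CSTS.RDY until it matches the expected value (set when enabling, clear
 * when disabling), giving up after the worst case timeout advertised in
 * CAP.TO or when a fatal signal is pending.
 */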
1071 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
1072 {
1073         unsigned long timeout =
1074                 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1075         u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
1076         int ret;
1077
1078         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1079                 if ((csts & NVME_CSTS_RDY) == bit)
1080                         break;
1081
1082                 msleep(100);
1083                 if (fatal_signal_pending(current))
1084                         return -EINTR;
1085                 if (time_after(jiffies, timeout)) {
1086                         dev_err(ctrl->device,
1087                                 "Device not ready; aborting %s\n", enabled ?
1088                                                 "initialisation" : "reset");
1089                         return -ENODEV;
1090                 }
1091         }
1092
1093         return ret;
1094 }
1095
1096 /*
1097  * If the device has been passed off to us in an enabled state, just clear
1098  * the enabled bit.  The spec says we should set the 'shutdown notification
1099  * bits', but doing so may cause the device to complete commands to the
1100  * admin queue ... and we don't know what memory that might be pointing at!
1101  */
1102 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1103 {
1104         int ret;
1105
1106         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1107         ctrl->ctrl_config &= ~NVME_CC_ENABLE;
1108
1109         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1110         if (ret)
1111                 return ret;
1112         return nvme_wait_ready(ctrl, cap, false);
1113 }
1114 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
1115
1116 int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1117 {
1118         /*
1119          * Default to a 4K page size, with the intention to update this
1120          * path in the future to accommodate architectures with differing
1121          * kernel and IO page sizes.
1122          */
1123         unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
1124         int ret;
1125
1126         if (page_shift < dev_page_min) {
1127                 dev_err(ctrl->device,
1128                         "Minimum device page size %u too large for host (%u)\n",
1129                         1 << dev_page_min, 1 << page_shift);
1130                 return -ENODEV;
1131         }
1132
1133         ctrl->page_size = 1 << page_shift;
1134
1135         ctrl->ctrl_config = NVME_CC_CSS_NVM;
1136         ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1137         ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
1138         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1139         ctrl->ctrl_config |= NVME_CC_ENABLE;
1140
1141         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1142         if (ret)
1143                 return ret;
1144         return nvme_wait_ready(ctrl, cap, true);
1145 }
1146 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
1147
1148 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
1149 {
1150         unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
1151         u32 csts;
1152         int ret;
1153
1154         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1155         ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
1156
1157         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1158         if (ret)
1159                 return ret;
1160
1161         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1162                 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
1163                         break;
1164
1165                 msleep(100);
1166                 if (fatal_signal_pending(current))
1167                         return -EINTR;
1168                 if (time_after(jiffies, timeout)) {
1169                         dev_err(ctrl->device,
1170                                 "Device shutdown incomplete; abort shutdown\n");
1171                         return -ENODEV;
1172                 }
1173         }
1174
1175         return ret;
1176 }
1177 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
1178
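/*
 * Apply controller-wide limits (maximum transfer size, segment count, stripe
 * size, virt boundary and volatile write cache support) to a request queue.
 * Used for the admin queue as well as the per-namespace I/O queues.
 */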
1179 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1180                 struct request_queue *q)
1181 {
1182         bool vwc = false;
1183
1184         if (ctrl->max_hw_sectors) {
1185                 u32 max_segments =
1186                         (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
1187
1188                 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1189                 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1190         }
1191         if (ctrl->stripe_size)
1192                 blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9);
1193         blk_queue_virt_boundary(q, ctrl->page_size - 1);
1194         if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1195                 vwc = true;
1196         blk_queue_write_cache(q, vwc, vwc);
1197 }
1198
1199 /*
1200  * Initialize the cached copies of the Identify data and various controller
1201  * registers in our nvme_ctrl structure.  This should be called as soon as
1202  * the admin queue is fully up and running.
1203  */
1204 int nvme_init_identify(struct nvme_ctrl *ctrl)
1205 {
1206         struct nvme_id_ctrl *id;
1207         u64 cap;
1208         int ret, page_shift;
1209         u32 max_hw_sectors;
1210
1211         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
1212         if (ret) {
1213                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
1214                 return ret;
1215         }
1216
1217         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
1218         if (ret) {
1219                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
1220                 return ret;
1221         }
1222         page_shift = NVME_CAP_MPSMIN(cap) + 12;
1223
1224         if (ctrl->vs >= NVME_VS(1, 1))
1225                 ctrl->subsystem = NVME_CAP_NSSRC(cap);
1226
1227         ret = nvme_identify_ctrl(ctrl, &id);
1228         if (ret) {
1229                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
1230                 return -EIO;
1231         }
1232
1233         ctrl->vid = le16_to_cpu(id->vid);
1234         ctrl->oncs = le16_to_cpup(&id->oncs);
1235         atomic_set(&ctrl->abort_limit, id->acl + 1);
1236         ctrl->vwc = id->vwc;
1237         ctrl->cntlid = le16_to_cpup(&id->cntlid);
1238         memcpy(ctrl->serial, id->sn, sizeof(id->sn));
1239         memcpy(ctrl->model, id->mn, sizeof(id->mn));
1240         memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
1241         if (id->mdts)
1242                 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
1243         else
1244                 max_hw_sectors = UINT_MAX;
1245         ctrl->max_hw_sectors =
1246                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
1247
1248         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
1249                 unsigned int max_hw_sectors;
1250
1251                 ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
1252                 max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
1253                 if (ctrl->max_hw_sectors) {
1254                         ctrl->max_hw_sectors = min(max_hw_sectors,
1255                                                         ctrl->max_hw_sectors);
1256                 } else {
1257                         ctrl->max_hw_sectors = max_hw_sectors;
1258                 }
1259         }
1260
1261         nvme_set_queue_limits(ctrl, ctrl->admin_q);
1262         ctrl->sgls = le32_to_cpu(id->sgls);
1263         ctrl->kas = le16_to_cpu(id->kas);
1264
1265         if (ctrl->ops->is_fabrics) {
1266                 ctrl->icdoff = le16_to_cpu(id->icdoff);
1267                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
1268                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
1269                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
1270
1271                 /*
1272                  * In fabrics we need to verify that the cntlid matches
1273                  * the one returned by the admin Connect command.
1274                  */
1275                 if (ctrl->cntlid != le16_to_cpu(id->cntlid))
1276                         ret = -EINVAL;
1277
1278                 if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
1279                         dev_err(ctrl->dev,
1280                                 "keep-alive support is mandatory for fabrics\n");
1281                         ret = -EINVAL;
1282                 }
1283         } else {
1284                 ctrl->cntlid = le16_to_cpu(id->cntlid);
1285         }
1286
1287         kfree(id);
1288         return ret;
1289 }
1290 EXPORT_SYMBOL_GPL(nvme_init_identify);
1291
1292 static int nvme_dev_open(struct inode *inode, struct file *file)
1293 {
1294         struct nvme_ctrl *ctrl;
1295         int instance = iminor(inode);
1296         int ret = -ENODEV;
1297
1298         spin_lock(&dev_list_lock);
1299         list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
1300                 if (ctrl->instance != instance)
1301                         continue;
1302
1303                 if (!ctrl->admin_q) {
1304                         ret = -EWOULDBLOCK;
1305                         break;
1306                 }
1307                 if (!kref_get_unless_zero(&ctrl->kref))
1308                         break;
1309                 file->private_data = ctrl;
1310                 ret = 0;
1311                 break;
1312         }
1313         spin_unlock(&dev_list_lock);
1314
1315         return ret;
1316 }
1317
1318 static int nvme_dev_release(struct inode *inode, struct file *file)
1319 {
1320         nvme_put_ctrl(file->private_data);
1321         return 0;
1322 }
1323
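/*
 * Handle the deprecated NVME_IOCTL_IO_CMD on the controller character device:
 * it is only honoured when the controller has exactly one namespace, in which
 * case the command is submitted on that namespace's I/O queue.
 */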
1324 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
1325 {
1326         struct nvme_ns *ns;
1327         int ret;
1328
1329         mutex_lock(&ctrl->namespaces_mutex);
1330         if (list_empty(&ctrl->namespaces)) {
1331                 ret = -ENOTTY;
1332                 goto out_unlock;
1333         }
1334
1335         ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
1336         if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
1337                 dev_warn(ctrl->device,
1338                         "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
1339                 ret = -EINVAL;
1340                 goto out_unlock;
1341         }
1342
1343         dev_warn(ctrl->device,
1344                 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
1345         kref_get(&ns->kref);
1346         mutex_unlock(&ctrl->namespaces_mutex);
1347
1348         ret = nvme_user_cmd(ctrl, ns, argp);
1349         nvme_put_ns(ns);
1350         return ret;
1351
1352 out_unlock:
1353         mutex_unlock(&ctrl->namespaces_mutex);
1354         return ret;
1355 }
1356
1357 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
1358                 unsigned long arg)
1359 {
1360         struct nvme_ctrl *ctrl = file->private_data;
1361         void __user *argp = (void __user *)arg;
1362
1363         switch (cmd) {
1364         case NVME_IOCTL_ADMIN_CMD:
1365                 return nvme_user_cmd(ctrl, NULL, argp);
1366         case NVME_IOCTL_IO_CMD:
1367                 return nvme_dev_user_cmd(ctrl, argp);
1368         case NVME_IOCTL_RESET:
1369                 dev_warn(ctrl->device, "resetting controller\n");
1370                 return ctrl->ops->reset_ctrl(ctrl);
1371         case NVME_IOCTL_SUBSYS_RESET:
1372                 return nvme_reset_subsystem(ctrl);
1373         case NVME_IOCTL_RESCAN:
1374                 nvme_queue_scan(ctrl);
1375                 return 0;
1376         default:
1377                 return -ENOTTY;
1378         }
1379 }
1380
1381 static const struct file_operations nvme_dev_fops = {
1382         .owner          = THIS_MODULE,
1383         .open           = nvme_dev_open,
1384         .release        = nvme_dev_release,
1385         .unlocked_ioctl = nvme_dev_ioctl,
1386         .compat_ioctl   = nvme_dev_ioctl,
1387 };
1388
1389 static ssize_t nvme_sysfs_reset(struct device *dev,
1390                                 struct device_attribute *attr, const char *buf,
1391                                 size_t count)
1392 {
1393         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1394         int ret;
1395
1396         ret = ctrl->ops->reset_ctrl(ctrl);
1397         if (ret < 0)
1398                 return ret;
1399         return count;
1400 }
1401 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
1402
1403 static ssize_t nvme_sysfs_rescan(struct device *dev,
1404                                 struct device_attribute *attr, const char *buf,
1405                                 size_t count)
1406 {
1407         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1408
1409         nvme_queue_scan(ctrl);
1410         return count;
1411 }
1412 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
1413
1414 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
1415                                                                 char *buf)
1416 {
1417         struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1418         struct nvme_ctrl *ctrl = ns->ctrl;
1419         int serial_len = sizeof(ctrl->serial);
1420         int model_len = sizeof(ctrl->model);
1421
1422         if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
1423                 return sprintf(buf, "eui.%16phN\n", ns->uuid);
1424
1425         if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
1426                 return sprintf(buf, "eui.%8phN\n", ns->eui);
1427
1428         while (ctrl->serial[serial_len - 1] == ' ')
1429                 serial_len--;
1430         while (ctrl->model[model_len - 1] == ' ')
1431                 model_len--;
1432
1433         return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
1434                 serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id);
1435 }
1436 static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
1437
1438 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
1439                                                                 char *buf)
1440 {
1441         struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1442         return sprintf(buf, "%pU\n", ns->uuid);
1443 }
1444 static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
1445
1446 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
1447                                                                 char *buf)
1448 {
1449         struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1450         return sprintf(buf, "%8phd\n", ns->eui);
1451 }
1452 static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
1453
1454 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
1455                                                                 char *buf)
1456 {
1457         struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1458         return sprintf(buf, "%d\n", ns->ns_id);
1459 }
1460 static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
1461
1462 static struct attribute *nvme_ns_attrs[] = {
1463         &dev_attr_wwid.attr,
1464         &dev_attr_uuid.attr,
1465         &dev_attr_eui.attr,
1466         &dev_attr_nsid.attr,
1467         NULL,
1468 };
1469
1470 static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
1471                 struct attribute *a, int n)
1472 {
1473         struct device *dev = container_of(kobj, struct device, kobj);
1474         struct nvme_ns *ns = dev_to_disk(dev)->private_data;
1475
1476         if (a == &dev_attr_uuid.attr) {
1477                 if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
1478                         return 0;
1479         }
1480         if (a == &dev_attr_eui.attr) {
1481                 if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
1482                         return 0;
1483         }
1484         return a->mode;
1485 }
1486
1487 static const struct attribute_group nvme_ns_attr_group = {
1488         .attrs          = nvme_ns_attrs,
1489         .is_visible     = nvme_ns_attrs_are_visible,
1490 };
1491
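/*
 * Boilerplate for the read-only controller sysfs attributes: one macro for
 * fixed-size string fields (model, serial, firmware revision) and one for
 * integer fields (cntlid).
 */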
1492 #define nvme_show_str_function(field)                                           \
1493 static ssize_t  field##_show(struct device *dev,                                \
1494                             struct device_attribute *attr, char *buf)           \
1495 {                                                                               \
1496         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
1497         return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field);   \
1498 }                                                                               \
1499 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
1500
1501 #define nvme_show_int_function(field)                                           \
1502 static ssize_t  field##_show(struct device *dev,                                \
1503                             struct device_attribute *attr, char *buf)           \
1504 {                                                                               \
1505         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
1506         return sprintf(buf, "%d\n", ctrl->field);       \
1507 }                                                                               \
1508 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
1509
1510 nvme_show_str_function(model);
1511 nvme_show_str_function(serial);
1512 nvme_show_str_function(firmware_rev);
1513 nvme_show_int_function(cntlid);
1514
1515 static ssize_t nvme_sysfs_delete(struct device *dev,
1516                                 struct device_attribute *attr, const char *buf,
1517                                 size_t count)
1518 {
1519         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1520
1521         if (device_remove_file_self(dev, attr))
1522                 ctrl->ops->delete_ctrl(ctrl);
1523         return count;
1524 }
1525 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
1526
1527 static ssize_t nvme_sysfs_show_transport(struct device *dev,
1528                                          struct device_attribute *attr,
1529                                          char *buf)
1530 {
1531         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1532
1533         return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
1534 }
1535 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
1536
1537 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
1538                                          struct device_attribute *attr,
1539                                          char *buf)
1540 {
1541         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1542
1543         return snprintf(buf, PAGE_SIZE, "%s\n",
1544                         ctrl->ops->get_subsysnqn(ctrl));
1545 }
1546 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
1547
1548 static ssize_t nvme_sysfs_show_address(struct device *dev,
1549                                          struct device_attribute *attr,
1550                                          char *buf)
1551 {
1552         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1553
1554         return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
1555 }
1556 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
1557
1558 static struct attribute *nvme_dev_attrs[] = {
1559         &dev_attr_reset_controller.attr,
1560         &dev_attr_rescan_controller.attr,
1561         &dev_attr_model.attr,
1562         &dev_attr_serial.attr,
1563         &dev_attr_firmware_rev.attr,
1564         &dev_attr_cntlid.attr,
1565         &dev_attr_delete_controller.attr,
1566         &dev_attr_transport.attr,
1567         &dev_attr_subsysnqn.attr,
1568         &dev_attr_address.attr,
1569         NULL
1570 };
1571
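     /*
      * Controller attributes are hidden when the transport does not implement
      * the matching callout: delete_controller needs ->delete_ctrl, subsysnqn
      * needs ->get_subsysnqn and address needs ->get_address.
      */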
1572 #define CHECK_ATTR(ctrl, a, name)               \
1573         if ((a) == &dev_attr_##name.attr &&     \
1574             !(ctrl)->ops->get_##name)           \
1575                 return 0
1576
1577 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
1578                 struct attribute *a, int n)
1579 {
1580         struct device *dev = container_of(kobj, struct device, kobj);
1581         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1582
1583         if (a == &dev_attr_delete_controller.attr) {
1584                 if (!ctrl->ops->delete_ctrl)
1585                         return 0;
1586         }
1587
1588         CHECK_ATTR(ctrl, a, subsysnqn);
1589         CHECK_ATTR(ctrl, a, address);
1590
1591         return a->mode;
1592 }
1593
1594 static struct attribute_group nvme_dev_attrs_group = {
1595         .attrs          = nvme_dev_attrs,
1596         .is_visible     = nvme_dev_attrs_are_visible,
1597 };
1598
1599 static const struct attribute_group *nvme_dev_attr_groups[] = {
1600         &nvme_dev_attrs_group,
1601         NULL,
1602 };
1603
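     /*
      * ctrl->namespaces is kept sorted by ns_id (see the list_sort() call in
      * nvme_scan_work()), which allows nvme_find_ns() below to stop searching
      * once it passes the requested nsid.
      */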
1604 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
1605 {
1606         struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
1607         struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
1608
1609         return nsa->ns_id - nsb->ns_id;
1610 }
1611
1612 static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1613 {
1614         struct nvme_ns *ns;
1615
1616         lockdep_assert_held(&ctrl->namespaces_mutex);
1617
1618         list_for_each_entry(ns, &ctrl->namespaces, list) {
1619                 if (ns->ns_id == nsid)
1620                         return ns;
1621                 if (ns->ns_id > nsid)
1622                         break;
1623         }
1624         return NULL;
1625 }
1626
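     /*
      * Allocate and register a new namespace: grab a per-controller instance
      * number, create a blk-mq queue on the controller's I/O tagset, attach a
      * gendisk and, once the disk revalidates, add the namespace to
      * ctrl->namespaces and publish the disk together with the identification
      * sysfs group.  LightNVM namespaces are added to the list but are not
      * registered via add_disk() here.
      */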
1627 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1628 {
1629         struct nvme_ns *ns;
1630         struct gendisk *disk;
1631         int node = dev_to_node(ctrl->dev);
1632
1633         lockdep_assert_held(&ctrl->namespaces_mutex);
1634
1635         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
1636         if (!ns)
1637                 return;
1638
1639         ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
1640         if (ns->instance < 0)
1641                 goto out_free_ns;
1642
1643         ns->queue = blk_mq_init_queue(ctrl->tagset);
1644         if (IS_ERR(ns->queue))
1645                 goto out_release_instance;
1646         queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
1647         ns->queue->queuedata = ns;
1648         ns->ctrl = ctrl;
1649
1650         disk = alloc_disk_node(0, node);
1651         if (!disk)
1652                 goto out_free_queue;
1653
1654         kref_init(&ns->kref);
1655         ns->ns_id = nsid;
1656         ns->disk = disk;
1657         ns->lba_shift = 9; /* default to 512-byte blocks until the disk is validated */
1658
1659
1660         blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1661         nvme_set_queue_limits(ctrl, ns->queue);
1662
1663         disk->major = nvme_major;
1664         disk->first_minor = 0;
1665         disk->fops = &nvme_fops;
1666         disk->private_data = ns;
1667         disk->queue = ns->queue;
1668         disk->driverfs_dev = ctrl->device;
1669         disk->flags = GENHD_FL_EXT_DEVT;
1670         sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
1671
1672         if (nvme_revalidate_disk(ns->disk))
1673                 goto out_free_disk;
1674
1675         list_add_tail_rcu(&ns->list, &ctrl->namespaces);
1676         kref_get(&ctrl->kref);
1677         if (ns->type == NVME_NS_LIGHTNVM)
1678                 return;
1679
1680         add_disk(ns->disk);
1681         if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
1682                                         &nvme_ns_attr_group))
1683                 pr_warn("%s: failed to create sysfs group for identification\n",
1684                         ns->disk->disk_name);
1685         return;
1686  out_free_disk:
1687         kfree(disk);
1688  out_free_queue:
1689         blk_cleanup_queue(ns->queue);
1690  out_release_instance:
1691         ida_simple_remove(&ctrl->ns_ida, ns->instance);
1692  out_free_ns:
1693         kfree(ns);
1694 }
1695
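     /*
      * Tear down a namespace.  NVME_NS_REMOVING guards against concurrent
      * removal; the gendisk, sysfs group and request queue are only cleaned up
      * if the disk was actually added.  synchronize_rcu() makes sure lockless
      * iterators such as nvme_kill_queues() no longer see the namespace before
      * the reference taken in nvme_alloc_ns() is dropped.
      */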
1696 static void nvme_ns_remove(struct nvme_ns *ns)
1697 {
1698         lockdep_assert_held(&ns->ctrl->namespaces_mutex);
1699
1700         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
1701                 return;
1702
1703         if (ns->disk->flags & GENHD_FL_UP) {
1704                 if (blk_get_integrity(ns->disk))
1705                         blk_integrity_unregister(ns->disk);
1706                 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
1707                                         &nvme_ns_attr_group);
1708                 del_gendisk(ns->disk);
1709                 blk_mq_abort_requeue_list(ns->queue);
1710                 blk_cleanup_queue(ns->queue);
1711         }
1712         list_del_init(&ns->list);
1713         synchronize_rcu();
1714         nvme_put_ns(ns);
1715 }
1716
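     /*
      * Revalidate a known namespace, dropping it if revalidation fails (e.g.
      * because the namespace went away); allocate a new one for an unknown
      * nsid.
      */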
1717 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
1718 {
1719         struct nvme_ns *ns;
1720
1721         ns = nvme_find_ns(ctrl, nsid);
1722         if (ns) {
1723                 if (revalidate_disk(ns->disk))
1724                         nvme_ns_remove(ns);
1725         } else
1726                 nvme_alloc_ns(ctrl, nsid);
1727 }
1728
1729 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
1730                                         unsigned nsid)
1731 {
1732         struct nvme_ns *ns, *next;
1733
1734         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
1735                 if (ns->ns_id > nsid)
1736                         nvme_ns_remove(ns);
1737         }
1738 }
1739
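     /*
      * Scan using the Identify Namespace List command, which returns up to
      * 1024 active NSIDs (one 4KB page) per call.  Each reported NSID is
      * validated, NSIDs that fall in the gaps between reports are removed,
      * and a zero NSID terminates the list.  For example, nn = 2500 results
      * in DIV_ROUND_UP(2500, 1024) = 3 list commands.
      */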
1740 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
1741 {
1742         struct nvme_ns *ns;
1743         __le32 *ns_list;
1744         unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
1745         int ret = 0;
1746
1747         ns_list = kzalloc(0x1000, GFP_KERNEL);
1748         if (!ns_list)
1749                 return -ENOMEM;
1750
1751         for (i = 0; i < num_lists; i++) {
1752                 ret = nvme_identify_ns_list(ctrl, prev, ns_list);
1753                 if (ret)
1754                         goto free;
1755
1756                 for (j = 0; j < min(nn, 1024U); j++) {
1757                         nsid = le32_to_cpu(ns_list[j]);
1758                         if (!nsid)
1759                                 goto out;
1760
1761                         nvme_validate_ns(ctrl, nsid);
1762
1763                         while (++prev < nsid) {
1764                                 ns = nvme_find_ns(ctrl, prev);
1765                                 if (ns)
1766                                         nvme_ns_remove(ns);
1767                         }
1768                 }
1769                 nn -= j;
1770         }
1771  out:
1772         nvme_remove_invalid_namespaces(ctrl, prev);
1773  free:
1774         kfree(ns_list);
1775         return ret;
1776 }
1777
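     /*
      * Fallback scan for controllers without namespace list support: probe
      * every nsid from 1 to nn and remove anything left above nn.
      */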
1778 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
1779 {
1780         unsigned i;
1781
1782         lockdep_assert_held(&ctrl->namespaces_mutex);
1783
1784         for (i = 1; i <= nn; i++)
1785                 nvme_validate_ns(ctrl, i);
1786
1787         nvme_remove_invalid_namespaces(ctrl, nn);
1788 }
1789
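     /*
      * Namespace scan worker: read Identify Controller for the namespace
      * count, prefer the namespace list scan on NVMe 1.1+ controllers without
      * the IDENTIFY_CNS quirk, fall back to the sequential scan otherwise,
      * and keep ctrl->namespaces sorted by ns_id when done.
      */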
1790 static void nvme_scan_work(struct work_struct *work)
1791 {
1792         struct nvme_ctrl *ctrl =
1793                 container_of(work, struct nvme_ctrl, scan_work);
1794         struct nvme_id_ctrl *id;
1795         unsigned nn;
1796
1797         if (ctrl->state != NVME_CTRL_LIVE)
1798                 return;
1799
1800         if (nvme_identify_ctrl(ctrl, &id))
1801                 return;
1802
1803         mutex_lock(&ctrl->namespaces_mutex);
1804         nn = le32_to_cpu(id->nn);
1805         if (ctrl->vs >= NVME_VS(1, 1) &&
1806             !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
1807                 if (!nvme_scan_ns_list(ctrl, nn))
1808                         goto done;
1809         }
1810         nvme_scan_ns_sequential(ctrl, nn);
1811  done:
1812         list_sort(NULL, &ctrl->namespaces, ns_cmp);
1813         mutex_unlock(&ctrl->namespaces_mutex);
1814         kfree(id);
1815
1816         if (ctrl->ops->post_scan)
1817                 ctrl->ops->post_scan(ctrl);
1818 }
1819
1820 void nvme_queue_scan(struct nvme_ctrl *ctrl)
1821 {
1822         /*
1823          * Do not queue new scan work when a controller is reset during
1824          * removal.
1825          */
1826         if (ctrl->state == NVME_CTRL_LIVE)
1827                 schedule_work(&ctrl->scan_work);
1828 }
1829 EXPORT_SYMBOL_GPL(nvme_queue_scan);
1830
1831 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
1832 {
1833         struct nvme_ns *ns, *next;
1834
1835         /*
1836          * The dead state indicates the controller was not gracefully
1837          * disconnected. In that case, we won't be able to flush any data while
1838          * removing the namespaces' disks; fail all the queues now to avoid
1839          * potentially having to clean up the failed sync later.
1840          */
1841         if (ctrl->state == NVME_CTRL_DEAD)
1842                 nvme_kill_queues(ctrl);
1843
1844         mutex_lock(&ctrl->namespaces_mutex);
1845         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
1846                 nvme_ns_remove(ns);
1847         mutex_unlock(&ctrl->namespaces_mutex);
1848 }
1849 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
1850
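     /*
      * Re-arm Asynchronous Event Requests: consume ctrl->event_limit under
      * ctrl->lock, but drop the lock around each ->submit_async_event()
      * callout into the transport.
      */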
1851 static void nvme_async_event_work(struct work_struct *work)
1852 {
1853         struct nvme_ctrl *ctrl =
1854                 container_of(work, struct nvme_ctrl, async_event_work);
1855
1856         spin_lock_irq(&ctrl->lock);
1857         while (ctrl->event_limit > 0) {
1858                 int aer_idx = --ctrl->event_limit;
1859
1860                 spin_unlock_irq(&ctrl->lock);
1861                 ctrl->ops->submit_async_event(ctrl, aer_idx);
1862                 spin_lock_irq(&ctrl->lock);
1863         }
1864         spin_unlock_irq(&ctrl->lock);
1865 }
1866
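     /*
      * Called by the transports when an AER completes.  A successful or
      * aborted completion replenishes one AER slot; on a namespace-changed
      * notice the namespaces are rescanned.  The 0xff07 mask selects the
      * event type and event information fields of the completion result.
      */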
1867 void nvme_complete_async_event(struct nvme_ctrl *ctrl,
1868                 struct nvme_completion *cqe)
1869 {
1870         u16 status = le16_to_cpu(cqe->status) >> 1;
1871         u32 result = le32_to_cpu(cqe->result);
1872
1873         if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) {
1874                 ++ctrl->event_limit;
1875                 schedule_work(&ctrl->async_event_work);
1876         }
1877
1878         if (status != NVME_SC_SUCCESS)
1879                 return;
1880
1881         switch (result & 0xff07) {
1882         case NVME_AER_NOTICE_NS_CHANGED:
1883                 dev_info(ctrl->device, "rescanning\n");
1884                 nvme_queue_scan(ctrl);
1885                 break;
1886         default:
1887                 dev_warn(ctrl->device, "async event result %08x\n", result);
1888         }
1889 }
1890 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
1891
1892 void nvme_queue_async_events(struct nvme_ctrl *ctrl)
1893 {
1894         ctrl->event_limit = NVME_NR_AERS;
1895         schedule_work(&ctrl->async_event_work);
1896 }
1897 EXPORT_SYMBOL_GPL(nvme_queue_async_events);
1898
1899 static DEFINE_IDA(nvme_instance_ida);
1900
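     /*
      * Allocate a unique controller instance number (the N in /dev/nvmeN)
      * using the classic ida_pre_get()/ida_get_new() retry loop under
      * dev_list_lock.
      */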
1901 static int nvme_set_instance(struct nvme_ctrl *ctrl)
1902 {
1903         int instance, error;
1904
1905         do {
1906                 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
1907                         return -ENODEV;
1908
1909                 spin_lock(&dev_list_lock);
1910                 error = ida_get_new(&nvme_instance_ida, &instance);
1911                 spin_unlock(&dev_list_lock);
1912         } while (error == -EAGAIN);
1913
1914         if (error)
1915                 return -ENODEV;
1916
1917         ctrl->instance = instance;
1918         return 0;
1919 }
1920
1921 static void nvme_release_instance(struct nvme_ctrl *ctrl)
1922 {
1923         spin_lock(&dev_list_lock);
1924         ida_remove(&nvme_instance_ida, ctrl->instance);
1925         spin_unlock(&dev_list_lock);
1926 }
1927
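     /*
      * Stop the async event and scan workers, remove all namespaces, destroy
      * the controller character device and take the controller off the global
      * list.  The memory itself is only released from nvme_free_ctrl() once
      * the last reference is dropped via nvme_put_ctrl().
      */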
1928 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
1929 {
1930         flush_work(&ctrl->async_event_work);
1931         flush_work(&ctrl->scan_work);
1932         nvme_remove_namespaces(ctrl);
1933
1934         device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
1935
1936         spin_lock(&dev_list_lock);
1937         list_del(&ctrl->node);
1938         spin_unlock(&dev_list_lock);
1939 }
1940 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
1941
1942 static void nvme_free_ctrl(struct kref *kref)
1943 {
1944         struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
1945
1946         put_device(ctrl->device);
1947         nvme_release_instance(ctrl);
1948         ida_destroy(&ctrl->ns_ida);
1949
1950         ctrl->ops->free_ctrl(ctrl);
1951 }
1952
1953 void nvme_put_ctrl(struct nvme_ctrl *ctrl)
1954 {
1955         kref_put(&ctrl->kref, nvme_free_ctrl);
1956 }
1957 EXPORT_SYMBOL_GPL(nvme_put_ctrl);
1958
1959 /*
1960  * Initialize an NVMe controller structure.  This needs to be called during
1961  * the earliest initialization so that we have the initialized structure
1962  * around during probing.
1963  */
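     /*
      * A transport driver is expected to do roughly the following during
      * probe and removal (illustrative sketch only, error handling omitted):
      *
      *      ret = nvme_init_ctrl(&ctrl->ctrl, dev, &my_ctrl_ops, quirks);
      *      ...transport specific setup, bring the controller to
      *      NVME_CTRL_LIVE...
      *      nvme_queue_scan(&ctrl->ctrl);
      *
      *      ...and on removal:
      *      nvme_uninit_ctrl(&ctrl->ctrl);
      *      nvme_put_ctrl(&ctrl->ctrl);
      *
      * where "ctrl" and "my_ctrl_ops" stand in for the transport's private
      * controller structure and its struct nvme_ctrl_ops.
      */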
1964 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
1965                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
1966 {
1967         int ret;
1968
1969         ctrl->state = NVME_CTRL_NEW;
1970         spin_lock_init(&ctrl->lock);
1971         INIT_LIST_HEAD(&ctrl->namespaces);
1972         mutex_init(&ctrl->namespaces_mutex);
1973         kref_init(&ctrl->kref);
1974         ctrl->dev = dev;
1975         ctrl->ops = ops;
1976         ctrl->quirks = quirks;
1977         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
1978         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
1979
1980         ret = nvme_set_instance(ctrl);
1981         if (ret)
1982                 goto out;
1983
1984         ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
1985                                 MKDEV(nvme_char_major, ctrl->instance),
1986                                 ctrl, nvme_dev_attr_groups,
1987                                 "nvme%d", ctrl->instance);
1988         if (IS_ERR(ctrl->device)) {
1989                 ret = PTR_ERR(ctrl->device);
1990                 goto out_release_instance;
1991         }
1992         get_device(ctrl->device);
1993         ida_init(&ctrl->ns_ida);
1994
1995         spin_lock(&dev_list_lock);
1996         list_add_tail(&ctrl->node, &nvme_ctrl_list);
1997         spin_unlock(&dev_list_lock);
1998
1999         return 0;
2000 out_release_instance:
2001         nvme_release_instance(ctrl);
2002 out:
2003         return ret;
2004 }
2005 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
2006
2007 /**
2008  * nvme_kill_queues(): Ends all namespace queues
2009  * @ctrl: the dead controller whose namespace queues need to be ended
2010  *
2011  * Call this function when the driver determines it is unable to get the
2012  * controller into a state capable of servicing I/O.
2013  */
2014 void nvme_kill_queues(struct nvme_ctrl *ctrl)
2015 {
2016         struct nvme_ns *ns;
2017
2018         rcu_read_lock();
2019         list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
2020                 if (!kref_get_unless_zero(&ns->kref))
2021                         continue;
2022
2023                 /*
2024                  * Revalidating a dead namespace sets capacity to 0. This will
2025                  * end buffered writers dirtying pages that can't be synced.
2026                  */
2027                 if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
2028                         revalidate_disk(ns->disk);
2029
2030                 blk_set_queue_dying(ns->queue);
2031                 blk_mq_abort_requeue_list(ns->queue);
2032                 blk_mq_start_stopped_hw_queues(ns->queue, true);
2033
2034                 nvme_put_ns(ns);
2035         }
2036         rcu_read_unlock();
2037 }
2038 EXPORT_SYMBOL_GPL(nvme_kill_queues);
2039
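     /*
      * Quiesce I/O on every namespace: mark each request queue stopped,
      * cancel its pending requeue work and stop the hardware queues.
      * nvme_start_queues() below is the inverse.
      */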
2040 void nvme_stop_queues(struct nvme_ctrl *ctrl)
2041 {
2042         struct nvme_ns *ns;
2043
2044         rcu_read_lock();
2045         list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
2046                 spin_lock_irq(ns->queue->queue_lock);
2047                 queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
2048                 spin_unlock_irq(ns->queue->queue_lock);
2049
2050                 blk_mq_cancel_requeue_work(ns->queue);
2051                 blk_mq_stop_hw_queues(ns->queue);
2052         }
2053         rcu_read_unlock();
2054 }
2055 EXPORT_SYMBOL_GPL(nvme_stop_queues);
2056
2057 void nvme_start_queues(struct nvme_ctrl *ctrl)
2058 {
2059         struct nvme_ns *ns;
2060
2061         rcu_read_lock();
2062         list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
2063                 queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
2064                 blk_mq_start_stopped_hw_queues(ns->queue, true);
2065                 blk_mq_kick_requeue_list(ns->queue);
2066         }
2067         rcu_read_unlock();
2068 }
2069 EXPORT_SYMBOL_GPL(nvme_start_queues);
2070
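     /*
      * Module init: register the block major, the controller character device
      * major and the "nvme" device class.  Because nvme_major and
      * nvme_char_major default to 0, the kernel normally assigns both majors
      * dynamically and the positive return value is recorded.
      */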
2071 int __init nvme_core_init(void)
2072 {
2073         int result;
2074
2075         result = register_blkdev(nvme_major, "nvme");
2076         if (result < 0)
2077                 return result;
2078         else if (result > 0)
2079                 nvme_major = result;
2080
2081         result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
2082                                                         &nvme_dev_fops);
2083         if (result < 0)
2084                 goto unregister_blkdev;
2085         else if (result > 0)
2086                 nvme_char_major = result;
2087
2088         nvme_class = class_create(THIS_MODULE, "nvme");
2089         if (IS_ERR(nvme_class)) {
2090                 result = PTR_ERR(nvme_class);
2091                 goto unregister_chrdev;
2092         }
2093
2094         return 0;
2095
2096  unregister_chrdev:
2097         __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
2098  unregister_blkdev:
2099         unregister_blkdev(nvme_major, "nvme");
2100         return result;
2101 }
2102
2103 void nvme_core_exit(void)
2104 {
2105         class_destroy(nvme_class);
2106         __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
2107         unregister_blkdev(nvme_major, "nvme");
2108 }
2109
2110 MODULE_LICENSE("GPL");
2111 MODULE_VERSION("1.0");
2112 module_init(nvme_core_init);
2113 module_exit(nvme_core_exit);