1 /*
2  * QEMU NVM Express Controller
3  *
4  * Copyright (c) 2012, Intel Corporation
5  *
6  * Written by Keith Busch <[email protected]>
7  *
8  * This code is licensed under the GNU GPL v2 or later.
9  */
10
11 /**
12  * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
13  *
14  *  http://www.nvmexpress.org/resources/
15  */
16
17 /**
18  * Usage: add options:
19  *      -drive file=<file>,if=none,id=<drive_id>
20  *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
21  *              cmb_size_mb=<cmb_size_mb[optional]>, \
22  *              num_queues=<N[optional]>
23  *
24  * Note that cmb_size_mb denotes the size of the CMB in megabytes. The CMB is
25  * assumed to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
26  */
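
/*
 * For example (illustrative only; the image path, drive id and serial number
 * below are placeholders):
 *
 *      -drive file=nvme.img,if=none,id=nvm
 *      -device nvme,drive=nvm,serial=deadbeef,cmb_size_mb=16
 */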
27
28 #include "qemu/osdep.h"
29 #include "qemu/units.h"
30 #include "hw/block/block.h"
31 #include "hw/hw.h"
32 #include "hw/pci/msix.h"
33 #include "hw/pci/pci.h"
34 #include "sysemu/sysemu.h"
35 #include "qapi/error.h"
36 #include "qapi/visitor.h"
37 #include "sysemu/block-backend.h"
38
39 #include "qemu/log.h"
40 #include "qemu/cutils.h"
41 #include "trace.h"
42 #include "nvme.h"
43
44 #define NVME_GUEST_ERR(trace, fmt, ...) \
45     do { \
46         (trace_##trace)(__VA_ARGS__); \
47         qemu_log_mask(LOG_GUEST_ERROR, #trace \
48             " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
49     } while (0)
50
51 static void nvme_process_sq(void *opaque);
52
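/*
 * Read 'size' bytes at guest address 'addr'.  Reads that fall inside the
 * Controller Memory Buffer are served directly from the CMB backing buffer;
 * everything else goes through PCI DMA.
 */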
53 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
54 {
55     if (n->cmbsz && addr >= n->ctrl_mem.addr &&
56                 addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
57         memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
58     } else {
59         pci_dma_read(&n->parent_obj, addr, buf, size);
60     }
61 }
62
63 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
64 {
65     return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
66 }
67
68 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
69 {
70     return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
71 }
72
73 static void nvme_inc_cq_tail(NvmeCQueue *cq)
74 {
75     cq->tail++;
76     if (cq->tail >= cq->size) {
77         cq->tail = 0;
78         cq->phase = !cq->phase;
79     }
80 }
81
82 static void nvme_inc_sq_head(NvmeSQueue *sq)
83 {
84     sq->head = (sq->head + 1) % sq->size;
85 }
86
87 static uint8_t nvme_cq_full(NvmeCQueue *cq)
88 {
89     return (cq->tail + 1) % cq->size == cq->head;
90 }
91
92 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
93 {
94     return sq->head == sq->tail;
95 }
96
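/*
 * Update the pin-based interrupt level.  With MSI-X enabled this is a no-op;
 * otherwise INTx is asserted while any unmasked per-CQ bit in irq_status is
 * set and deasserted once none remain.
 */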
97 static void nvme_irq_check(NvmeCtrl *n)
98 {
99     if (msix_enabled(&(n->parent_obj))) {
100         return;
101     }
102     if (~n->bar.intms & n->irq_status) {
103         pci_irq_assert(&n->parent_obj);
104     } else {
105         pci_irq_deassert(&n->parent_obj);
106     }
107 }
108
109 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
110 {
111     if (cq->irq_enabled) {
112         if (msix_enabled(&(n->parent_obj))) {
113             trace_nvme_irq_msix(cq->vector);
114             msix_notify(&(n->parent_obj), cq->vector);
115         } else {
116             trace_nvme_irq_pin();
117             assert(cq->cqid < 64);
118             n->irq_status |= 1 << cq->cqid;
119             nvme_irq_check(n);
120         }
121     } else {
122         trace_nvme_irq_masked();
123     }
124 }
125
126 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
127 {
128     if (cq->irq_enabled) {
129         if (msix_enabled(&(n->parent_obj))) {
130             return;
131         } else {
132             assert(cq->cqid < 64);
133             n->irq_status &= ~(1 << cq->cqid);
134             nvme_irq_check(n);
135         }
136     }
137 }
138
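/*
 * Map the PRP1/PRP2 pair of a command onto either a scatter-gather list
 * (for addresses in guest RAM) or an I/O vector (for addresses inside the
 * CMB).  When the transfer spans more than two pages, PRP2 points to a PRP
 * list whose last entry may chain to a further list page.
 */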
139 static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
140                              uint64_t prp2, uint32_t len, NvmeCtrl *n)
141 {
142     hwaddr trans_len = n->page_size - (prp1 % n->page_size);
143     trans_len = MIN(len, trans_len);
144     int num_prps = (len >> n->page_bits) + 1;
145
146     if (unlikely(!prp1)) {
147         trace_nvme_err_invalid_prp();
148         return NVME_INVALID_FIELD | NVME_DNR;
149     } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
150                prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
151         qsg->nsg = 0;
152         qemu_iovec_init(iov, num_prps);
153         qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
154     } else {
155         pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
156         qemu_sglist_add(qsg, prp1, trans_len);
157     }
158     len -= trans_len;
159     if (len) {
160         if (unlikely(!prp2)) {
161             trace_nvme_err_invalid_prp2_missing();
162             goto unmap;
163         }
164         if (len > n->page_size) {
165             uint64_t prp_list[n->max_prp_ents];
166             uint32_t nents, prp_trans;
167             int i = 0;
168
169             nents = (len + n->page_size - 1) >> n->page_bits;
170             prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
171             nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
172             while (len != 0) {
173                 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
174
175                 if (i == n->max_prp_ents - 1 && len > n->page_size) {
176                     if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
177                         trace_nvme_err_invalid_prplist_ent(prp_ent);
178                         goto unmap;
179                     }
180
181                     i = 0;
182                     nents = (len + n->page_size - 1) >> n->page_bits;
183                     prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
184                     nvme_addr_read(n, prp_ent, (void *)prp_list,
185                         prp_trans);
186                     prp_ent = le64_to_cpu(prp_list[i]);
187                 }
188
189                 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
190                     trace_nvme_err_invalid_prplist_ent(prp_ent);
191                     goto unmap;
192                 }
193
194                 trans_len = MIN(len, n->page_size);
195                 if (qsg->nsg) {
196                     qemu_sglist_add(qsg, prp_ent, trans_len);
197                 } else {
198                     qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
199                 }
200                 len -= trans_len;
201                 i++;
202             }
203         } else {
204             if (unlikely(prp2 & (n->page_size - 1))) {
205                 trace_nvme_err_invalid_prp2_align(prp2);
206                 goto unmap;
207             }
208             if (qsg->nsg) {
209                 qemu_sglist_add(qsg, prp2, len);
210             } else {
211                 qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], len);
212             }
213         }
214     }
215     return NVME_SUCCESS;
216
217  unmap:
218     qemu_sglist_destroy(qsg);
219     return NVME_INVALID_FIELD | NVME_DNR;
220 }
221
222 static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
223     uint64_t prp1, uint64_t prp2)
224 {
225     QEMUSGList qsg;
226     QEMUIOVector iov;
227     uint16_t status = NVME_SUCCESS;
228
229     trace_nvme_dma_read(prp1, prp2);
230
231     if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
232         return NVME_INVALID_FIELD | NVME_DNR;
233     }
234     if (qsg.nsg > 0) {
235         if (unlikely(dma_buf_read(ptr, len, &qsg))) {
236             trace_nvme_err_invalid_dma();
237             status = NVME_INVALID_FIELD | NVME_DNR;
238         }
239         qemu_sglist_destroy(&qsg);
240     } else {
241         if (unlikely(qemu_iovec_to_buf(&iov, 0, ptr, len) != len)) {
242             trace_nvme_err_invalid_dma();
243             status = NVME_INVALID_FIELD | NVME_DNR;
244         }
245         qemu_iovec_destroy(&iov);
246     }
247     return status;
248 }
249
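/*
 * Post pending completions into the guest CQ ring until it is full, toggling
 * the phase tag on wrap-around, and raise the interrupt if any entries remain
 * for the guest to consume.  Scheduled via the per-CQ timer and also invoked
 * directly on submission queue deletion.
 */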
250 static void nvme_post_cqes(void *opaque)
251 {
252     NvmeCQueue *cq = opaque;
253     NvmeCtrl *n = cq->ctrl;
254     NvmeRequest *req, *next;
255
256     QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
257         NvmeSQueue *sq;
258         hwaddr addr;
259
260         if (nvme_cq_full(cq)) {
261             break;
262         }
263
264         QTAILQ_REMOVE(&cq->req_list, req, entry);
265         sq = req->sq;
266         req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
267         req->cqe.sq_id = cpu_to_le16(sq->sqid);
268         req->cqe.sq_head = cpu_to_le16(sq->head);
269         addr = cq->dma_addr + cq->tail * n->cqe_size;
270         nvme_inc_cq_tail(cq);
271         pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
272             sizeof(req->cqe));
273         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
274     }
275     if (cq->tail != cq->head) {
276         nvme_irq_assert(n, cq);
277     }
278 }
279
280 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
281 {
282     assert(cq->cqid == req->sq->cqid);
283     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
284     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
285     timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
286 }
287
288 static void nvme_rw_cb(void *opaque, int ret)
289 {
290     NvmeRequest *req = opaque;
291     NvmeSQueue *sq = req->sq;
292     NvmeCtrl *n = sq->ctrl;
293     NvmeCQueue *cq = n->cq[sq->cqid];
294
295     if (!ret) {
296         block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
297         req->status = NVME_SUCCESS;
298     } else {
299         block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
300         req->status = NVME_INTERNAL_DEV_ERROR;
301     }
302     if (req->has_sg) {
303         qemu_sglist_destroy(&req->qsg);
304     }
305     nvme_enqueue_req_completion(cq, req);
306 }
307
308 static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
309     NvmeRequest *req)
310 {
311     req->has_sg = false;
312     block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
313          BLOCK_ACCT_FLUSH);
314     req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
315
316     return NVME_NO_COMPLETE;
317 }
318
319 static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
320     NvmeRequest *req)
321 {
322     NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
323     const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
324     const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
325     uint64_t slba = le64_to_cpu(rw->slba);
326     uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
327     uint64_t offset = slba << data_shift;
328     uint32_t count = nlb << data_shift;
329
330     if (unlikely(slba + nlb > le64_to_cpu(ns->id_ns.nsze))) {
331         trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
332         return NVME_LBA_RANGE | NVME_DNR;
333     }
334
335     req->has_sg = false;
336     block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
337                      BLOCK_ACCT_WRITE);
338     req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
339                                         BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
340     return NVME_NO_COMPLETE;
341 }
342
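/*
 * Handle an I/O read or write: decode the LBA range, map the PRPs and submit
 * either a scatter-gather DMA transfer (guest RAM) or a vectored request on
 * the CMB-backed iovec.  Completion is signalled from nvme_rw_cb().
 */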
343 static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
344     NvmeRequest *req)
345 {
346     NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
347     uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
348     uint64_t slba = le64_to_cpu(rw->slba);
349     uint64_t prp1 = le64_to_cpu(rw->prp1);
350     uint64_t prp2 = le64_to_cpu(rw->prp2);
351
352     uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
353     uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
354     uint64_t data_size = (uint64_t)nlb << data_shift;
355     uint64_t data_offset = slba << data_shift;
356     int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
357     enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
358
359     trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
360
361     if (unlikely((slba + nlb) > le64_to_cpu(ns->id_ns.nsze))) {
362         block_acct_invalid(blk_get_stats(n->conf.blk), acct);
363         trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
364         return NVME_LBA_RANGE | NVME_DNR;
365     }
366
367     if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
368         block_acct_invalid(blk_get_stats(n->conf.blk), acct);
369         return NVME_INVALID_FIELD | NVME_DNR;
370     }
371
372     dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
373     if (req->qsg.nsg > 0) {
374         req->has_sg = true;
375         req->aiocb = is_write ?
376             dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
377                           nvme_rw_cb, req) :
378             dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
379                          nvme_rw_cb, req);
380     } else {
381         req->has_sg = false;
382         req->aiocb = is_write ?
383             blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
384                             req) :
385             blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
386                            req);
387     }
388
389     return NVME_NO_COMPLETE;
390 }
391
392 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
393 {
394     NvmeNamespace *ns;
395     uint32_t nsid = le32_to_cpu(cmd->nsid);
396
397     if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
398         trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
399         return NVME_INVALID_NSID | NVME_DNR;
400     }
401
402     ns = &n->namespaces[nsid - 1];
403     switch (cmd->opcode) {
404     case NVME_CMD_FLUSH:
405         return nvme_flush(n, ns, cmd, req);
406     case NVME_CMD_WRITE_ZEROS:
407         return nvme_write_zeros(n, ns, cmd, req);
408     case NVME_CMD_WRITE:
409     case NVME_CMD_READ:
410         return nvme_rw(n, ns, cmd, req);
411     default:
412         trace_nvme_err_invalid_opc(cmd->opcode);
413         return NVME_INVALID_OPCODE | NVME_DNR;
414     }
415 }
416
417 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
418 {
419     n->sq[sq->sqid] = NULL;
420     timer_del(sq->timer);
421     timer_free(sq->timer);
422     g_free(sq->io_req);
423     if (sq->sqid) {
424         g_free(sq);
425     }
426 }
427
428 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
429 {
430     NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
431     NvmeRequest *req, *next;
432     NvmeSQueue *sq;
433     NvmeCQueue *cq;
434     uint16_t qid = le16_to_cpu(c->qid);
435
436     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
437         trace_nvme_err_invalid_del_sq(qid);
438         return NVME_INVALID_QID | NVME_DNR;
439     }
440
441     trace_nvme_del_sq(qid);
442
443     sq = n->sq[qid];
444     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
445         req = QTAILQ_FIRST(&sq->out_req_list);
446         assert(req->aiocb);
447         blk_aio_cancel(req->aiocb);
448     }
449     if (!nvme_check_cqid(n, sq->cqid)) {
450         cq = n->cq[sq->cqid];
451         QTAILQ_REMOVE(&cq->sq_list, sq, entry);
452
453         nvme_post_cqes(cq);
454         QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
455             if (req->sq == sq) {
456                 QTAILQ_REMOVE(&cq->req_list, req, entry);
457                 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
458             }
459         }
460     }
461
462     nvme_free_sq(sq, n);
463     return NVME_SUCCESS;
464 }
465
466 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
467     uint16_t sqid, uint16_t cqid, uint16_t size)
468 {
469     int i;
470     NvmeCQueue *cq;
471
472     sq->ctrl = n;
473     sq->dma_addr = dma_addr;
474     sq->sqid = sqid;
475     sq->size = size;
476     sq->cqid = cqid;
477     sq->head = sq->tail = 0;
478     sq->io_req = g_new(NvmeRequest, sq->size);
479
480     QTAILQ_INIT(&sq->req_list);
481     QTAILQ_INIT(&sq->out_req_list);
482     for (i = 0; i < sq->size; i++) {
483         sq->io_req[i].sq = sq;
484         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
485     }
486     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
487
488     assert(n->cq[cqid]);
489     cq = n->cq[cqid];
490     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
491     n->sq[sqid] = sq;
492 }
493
494 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
495 {
496     NvmeSQueue *sq;
497     NvmeCreateSq *c = (NvmeCreateSq *)cmd;
498
499     uint16_t cqid = le16_to_cpu(c->cqid);
500     uint16_t sqid = le16_to_cpu(c->sqid);
501     uint16_t qsize = le16_to_cpu(c->qsize);
502     uint16_t qflags = le16_to_cpu(c->sq_flags);
503     uint64_t prp1 = le64_to_cpu(c->prp1);
504
505     trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
506
507     if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
508         trace_nvme_err_invalid_create_sq_cqid(cqid);
509         return NVME_INVALID_CQID | NVME_DNR;
510     }
511     if (unlikely(!sqid || sqid >= n->num_queues || n->sq[sqid] != NULL)) {
512         trace_nvme_err_invalid_create_sq_sqid(sqid);
513         return NVME_INVALID_QID | NVME_DNR;
514     }
515     if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
516         trace_nvme_err_invalid_create_sq_size(qsize);
517         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
518     }
519     if (unlikely(!prp1 || prp1 & (n->page_size - 1))) {
520         trace_nvme_err_invalid_create_sq_addr(prp1);
521         return NVME_INVALID_FIELD | NVME_DNR;
522     }
523     if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
524         trace_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
525         return NVME_INVALID_FIELD | NVME_DNR;
526     }
527     sq = g_malloc0(sizeof(*sq));
528     nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
529     return NVME_SUCCESS;
530 }
531
532 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
533 {
534     n->cq[cq->cqid] = NULL;
535     timer_del(cq->timer);
536     timer_free(cq->timer);
537     msix_vector_unuse(&n->parent_obj, cq->vector);
538     if (cq->cqid) {
539         g_free(cq);
540     }
541 }
542
543 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
544 {
545     NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
546     NvmeCQueue *cq;
547     uint16_t qid = le16_to_cpu(c->qid);
548
549     if (unlikely(!qid || nvme_check_cqid(n, qid))) {
550         trace_nvme_err_invalid_del_cq_cqid(qid);
551         return NVME_INVALID_CQID | NVME_DNR;
552     }
553
554     cq = n->cq[qid];
555     if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
556         trace_nvme_err_invalid_del_cq_notempty(qid);
557         return NVME_INVALID_QUEUE_DEL;
558     }
559     nvme_irq_deassert(n, cq);
560     trace_nvme_del_cq(qid);
561     nvme_free_cq(cq, n);
562     return NVME_SUCCESS;
563 }
564
565 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
566     uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
567 {
568     cq->ctrl = n;
569     cq->cqid = cqid;
570     cq->size = size;
571     cq->dma_addr = dma_addr;
572     cq->phase = 1;
573     cq->irq_enabled = irq_enabled;
574     cq->vector = vector;
575     cq->head = cq->tail = 0;
576     QTAILQ_INIT(&cq->req_list);
577     QTAILQ_INIT(&cq->sq_list);
578     msix_vector_use(&n->parent_obj, cq->vector);
579     n->cq[cqid] = cq;
580     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
581 }
582
583 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
584 {
585     NvmeCQueue *cq;
586     NvmeCreateCq *c = (NvmeCreateCq *)cmd;
587     uint16_t cqid = le16_to_cpu(c->cqid);
588     uint16_t vector = le16_to_cpu(c->irq_vector);
589     uint16_t qsize = le16_to_cpu(c->qsize);
590     uint16_t qflags = le16_to_cpu(c->cq_flags);
591     uint64_t prp1 = le64_to_cpu(c->prp1);
592
593     trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
594                          NVME_CQ_FLAGS_IEN(qflags) != 0);
595
596     if (unlikely(!cqid || cqid >= n->num_queues || n->cq[cqid] != NULL)) {
597         trace_nvme_err_invalid_create_cq_cqid(cqid);
598         return NVME_INVALID_CQID | NVME_DNR;
599     }
600     if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
601         trace_nvme_err_invalid_create_cq_size(qsize);
602         return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
603     }
604     if (unlikely(!prp1)) {
605         trace_nvme_err_invalid_create_cq_addr(prp1);
606         return NVME_INVALID_FIELD | NVME_DNR;
607     }
608     if (unlikely(vector >= n->num_queues)) {
609         trace_nvme_err_invalid_create_cq_vector(vector);
610         return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
611     }
612     if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
613         trace_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
614         return NVME_INVALID_FIELD | NVME_DNR;
615     }
616
617     cq = g_malloc0(sizeof(*cq));
618     nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
619         NVME_CQ_FLAGS_IEN(qflags));
620     return NVME_SUCCESS;
621 }
622
623 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
624 {
625     uint64_t prp1 = le64_to_cpu(c->prp1);
626     uint64_t prp2 = le64_to_cpu(c->prp2);
627
628     trace_nvme_identify_ctrl();
629
630     return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
631         prp1, prp2);
632 }
633
634 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
635 {
636     NvmeNamespace *ns;
637     uint32_t nsid = le32_to_cpu(c->nsid);
638     uint64_t prp1 = le64_to_cpu(c->prp1);
639     uint64_t prp2 = le64_to_cpu(c->prp2);
640
641     trace_nvme_identify_ns(nsid);
642
643     if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
644         trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
645         return NVME_INVALID_NSID | NVME_DNR;
646     }
647
648     ns = &n->namespaces[nsid - 1];
649
650     return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
651         prp1, prp2);
652 }
653
654 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
655 {
656     static const int data_len = 4 * KiB;
657     uint32_t min_nsid = le32_to_cpu(c->nsid);
658     uint64_t prp1 = le64_to_cpu(c->prp1);
659     uint64_t prp2 = le64_to_cpu(c->prp2);
660     uint32_t *list;
661     uint16_t ret;
662     int i, j = 0;
663
664     trace_nvme_identify_nslist(min_nsid);
665
666     list = g_malloc0(data_len);
667     for (i = 0; i < n->num_namespaces; i++) {
668         if (i < min_nsid) {
669             continue;
670         }
671         list[j++] = cpu_to_le32(i + 1);
672         if (j == data_len / sizeof(uint32_t)) {
673             break;
674         }
675     }
676     ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2);
677     g_free(list);
678     return ret;
679 }
680
681
682 static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
683 {
684     NvmeIdentify *c = (NvmeIdentify *)cmd;
685
686     switch (le32_to_cpu(c->cns)) {
687     case 0x00:
688         return nvme_identify_ns(n, c);
689     case 0x01:
690         return nvme_identify_ctrl(n, c);
691     case 0x02:
692         return nvme_identify_nslist(n, c);
693     default:
694         trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
695         return NVME_INVALID_FIELD | NVME_DNR;
696     }
697 }
698
699 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
700 {
701     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
702     uint32_t result;
703
704     switch (dw10) {
705     case NVME_VOLATILE_WRITE_CACHE:
706         result = blk_enable_write_cache(n->conf.blk);
707         trace_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
708         break;
709     case NVME_NUMBER_OF_QUEUES:
710         result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
711         trace_nvme_getfeat_numq(result);
712         break;
713     default:
714         trace_nvme_err_invalid_getfeat(dw10);
715         return NVME_INVALID_FIELD | NVME_DNR;
716     }
717
718     req->cqe.result = result;
719     return NVME_SUCCESS;
720 }
721
722 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
723 {
724     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
725     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
726
727     switch (dw10) {
728     case NVME_VOLATILE_WRITE_CACHE:
729         blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
730         break;
731     case NVME_NUMBER_OF_QUEUES:
732         trace_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
733                                 ((dw11 >> 16) & 0xFFFF) + 1,
734                                 n->num_queues - 1, n->num_queues - 1);
735         req->cqe.result =
736             cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
737         break;
738     default:
739         trace_nvme_err_invalid_setfeat(dw10);
740         return NVME_INVALID_FIELD | NVME_DNR;
741     }
742     return NVME_SUCCESS;
743 }
744
745 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
746 {
747     switch (cmd->opcode) {
748     case NVME_ADM_CMD_DELETE_SQ:
749         return nvme_del_sq(n, cmd);
750     case NVME_ADM_CMD_CREATE_SQ:
751         return nvme_create_sq(n, cmd);
752     case NVME_ADM_CMD_DELETE_CQ:
753         return nvme_del_cq(n, cmd);
754     case NVME_ADM_CMD_CREATE_CQ:
755         return nvme_create_cq(n, cmd);
756     case NVME_ADM_CMD_IDENTIFY:
757         return nvme_identify(n, cmd);
758     case NVME_ADM_CMD_SET_FEATURES:
759         return nvme_set_feature(n, cmd, req);
760     case NVME_ADM_CMD_GET_FEATURES:
761         return nvme_get_feature(n, cmd, req);
762     default:
763         trace_nvme_err_invalid_admin_opc(cmd->opcode);
764         return NVME_INVALID_OPCODE | NVME_DNR;
765     }
766 }
767
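/*
 * Submission queue timer callback: fetch command entries from the guest SQ,
 * dispatch them to the admin or I/O handlers, and immediately complete any
 * command that did not return NVME_NO_COMPLETE.
 */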
768 static void nvme_process_sq(void *opaque)
769 {
770     NvmeSQueue *sq = opaque;
771     NvmeCtrl *n = sq->ctrl;
772     NvmeCQueue *cq = n->cq[sq->cqid];
773
774     uint16_t status;
775     hwaddr addr;
776     NvmeCmd cmd;
777     NvmeRequest *req;
778
779     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
780         addr = sq->dma_addr + sq->head * n->sqe_size;
781         nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
782         nvme_inc_sq_head(sq);
783
784         req = QTAILQ_FIRST(&sq->req_list);
785         QTAILQ_REMOVE(&sq->req_list, req, entry);
786         QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
787         memset(&req->cqe, 0, sizeof(req->cqe));
788         req->cqe.cid = cmd.cid;
789
790         status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
791             nvme_admin_cmd(n, &cmd, req);
792         if (status != NVME_NO_COMPLETE) {
793             req->status = status;
794             nvme_enqueue_req_completion(cq, req);
795         }
796     }
797 }
798
799 static void nvme_clear_ctrl(NvmeCtrl *n)
800 {
801     int i;
802
803     blk_drain(n->conf.blk);
804
805     for (i = 0; i < n->num_queues; i++) {
806         if (n->sq[i] != NULL) {
807             nvme_free_sq(n->sq[i], n);
808         }
809     }
810     for (i = 0; i < n->num_queues; i++) {
811         if (n->cq[i] != NULL) {
812             nvme_free_cq(n->cq[i], n);
813         }
814     }
815
816     blk_flush(n->conf.blk);
817     n->bar.cc = 0;
818 }
819
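/*
 * Enable the controller: sanity-check CC, AQA, ASQ and ACQ against the
 * advertised capabilities, then derive the page size and queue entry sizes
 * and bring up the admin submission and completion queues.
 */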
820 static int nvme_start_ctrl(NvmeCtrl *n)
821 {
822     uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
823     uint32_t page_size = 1 << page_bits;
824
825     if (unlikely(n->cq[0])) {
826         trace_nvme_err_startfail_cq();
827         return -1;
828     }
829     if (unlikely(n->sq[0])) {
830         trace_nvme_err_startfail_sq();
831         return -1;
832     }
833     if (unlikely(!n->bar.asq)) {
834         trace_nvme_err_startfail_nbarasq();
835         return -1;
836     }
837     if (unlikely(!n->bar.acq)) {
838         trace_nvme_err_startfail_nbaracq();
839         return -1;
840     }
841     if (unlikely(n->bar.asq & (page_size - 1))) {
842         trace_nvme_err_startfail_asq_misaligned(n->bar.asq);
843         return -1;
844     }
845     if (unlikely(n->bar.acq & (page_size - 1))) {
846         trace_nvme_err_startfail_acq_misaligned(n->bar.acq);
847         return -1;
848     }
849     if (unlikely(NVME_CC_MPS(n->bar.cc) <
850                  NVME_CAP_MPSMIN(n->bar.cap))) {
851         trace_nvme_err_startfail_page_too_small(
852                     NVME_CC_MPS(n->bar.cc),
853                     NVME_CAP_MPSMIN(n->bar.cap));
854         return -1;
855     }
856     if (unlikely(NVME_CC_MPS(n->bar.cc) >
857                  NVME_CAP_MPSMAX(n->bar.cap))) {
858         trace_nvme_err_startfail_page_too_large(
859                     NVME_CC_MPS(n->bar.cc),
860                     NVME_CAP_MPSMAX(n->bar.cap));
861         return -1;
862     }
863     if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
864                  NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
865         trace_nvme_err_startfail_cqent_too_small(
866                     NVME_CC_IOCQES(n->bar.cc),
867                     NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
868         return -1;
869     }
870     if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
871                  NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
872         trace_nvme_err_startfail_cqent_too_large(
873                     NVME_CC_IOCQES(n->bar.cc),
874                     NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
875         return -1;
876     }
877     if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
878                  NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
879         trace_nvme_err_startfail_sqent_too_small(
880                     NVME_CC_IOSQES(n->bar.cc),
881                     NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
882         return -1;
883     }
884     if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
885                  NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
886         trace_nvme_err_startfail_sqent_too_large(
887                     NVME_CC_IOSQES(n->bar.cc),
888                     NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
889         return -1;
890     }
891     if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
892         trace_nvme_err_startfail_asqent_sz_zero();
893         return -1;
894     }
895     if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
896         trace_nvme_err_startfail_acqent_sz_zero();
897         return -1;
898     }
899
900     n->page_bits = page_bits;
901     n->page_size = page_size;
902     n->max_prp_ents = n->page_size / sizeof(uint64_t);
903     n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
904     n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
905     nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
906         NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
907     nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
908         NVME_AQA_ASQS(n->bar.aqa) + 1);
909
910     return 0;
911 }
912
913 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
914     unsigned size)
915 {
916     if (unlikely(offset & (sizeof(uint32_t) - 1))) {
917         NVME_GUEST_ERR(nvme_ub_mmiowr_misaligned32,
918                        "MMIO write not 32-bit aligned,"
919                        " offset=0x%"PRIx64"", offset);
920         /* should be ignored, fall through for now */
921     }
922
923     if (unlikely(size < sizeof(uint32_t))) {
924         NVME_GUEST_ERR(nvme_ub_mmiowr_toosmall,
925                        "MMIO write smaller than 32-bits,"
926                        " offset=0x%"PRIx64", size=%u",
927                        offset, size);
928         /* should be ignored, fall through for now */
929     }
930
931     switch (offset) {
932     case 0xc:   /* INTMS */
933         if (unlikely(msix_enabled(&(n->parent_obj)))) {
934             NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
935                            "undefined access to interrupt mask set"
936                            " when MSI-X is enabled");
937             /* should be ignored, fall through for now */
938         }
939         n->bar.intms |= data & 0xffffffff;
940         n->bar.intmc = n->bar.intms;
941         trace_nvme_mmio_intm_set(data & 0xffffffff,
942                                  n->bar.intmc);
943         nvme_irq_check(n);
944         break;
945     case 0x10:  /* INTMC */
946         if (unlikely(msix_enabled(&(n->parent_obj)))) {
947             NVME_GUEST_ERR(nvme_ub_mmiowr_intmask_with_msix,
948                            "undefined access to interrupt mask clr"
949                            " when MSI-X is enabled");
950             /* should be ignored, fall through for now */
951         }
952         n->bar.intms &= ~(data & 0xffffffff);
953         n->bar.intmc = n->bar.intms;
954         trace_nvme_mmio_intm_clr(data & 0xffffffff,
955                                  n->bar.intmc);
956         nvme_irq_check(n);
957         break;
958     case 0x14:  /* CC */
959         trace_nvme_mmio_cfg(data & 0xffffffff);
960         /* Windows first sends data, then sends enable bit */
961         if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
962             !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
963         {
964             n->bar.cc = data;
965         }
966
967         if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
968             n->bar.cc = data;
969             if (unlikely(nvme_start_ctrl(n))) {
970                 trace_nvme_err_startfail();
971                 n->bar.csts = NVME_CSTS_FAILED;
972             } else {
973                 trace_nvme_mmio_start_success();
974                 n->bar.csts = NVME_CSTS_READY;
975             }
976         } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
977             trace_nvme_mmio_stopped();
978             nvme_clear_ctrl(n);
979             n->bar.csts &= ~NVME_CSTS_READY;
980         }
981         if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
982             trace_nvme_mmio_shutdown_set();
983             nvme_clear_ctrl(n);
984             n->bar.cc = data;
985             n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
986         } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
987             trace_nvme_mmio_shutdown_cleared();
988             n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
989             n->bar.cc = data;
990         }
991         break;
992     case 0x1C:  /* CSTS */
993         if (data & (1 << 4)) {
994             NVME_GUEST_ERR(nvme_ub_mmiowr_ssreset_w1c_unsupported,
995                            "attempted to W1C CSTS.NSSRO"
996                            " but CAP.NSSRS is zero (not supported)");
997         } else if (data != 0) {
998             NVME_GUEST_ERR(nvme_ub_mmiowr_ro_csts,
999                            "attempted to set a read only bit"
1000                            " of controller status");
1001         }
1002         break;
1003     case 0x20:  /* NSSR */
1004         if (data == 0x4E564D65) {
1005             trace_nvme_ub_mmiowr_ssreset_unsupported();
1006         } else {
1007             /* The spec says that writes of other values have no effect */
1008             return;
1009         }
1010         break;
1011     case 0x24:  /* AQA */
1012         n->bar.aqa = data & 0xffffffff;
1013         trace_nvme_mmio_aqattr(data & 0xffffffff);
1014         break;
1015     case 0x28:  /* ASQ */
1016         n->bar.asq = data;
1017         trace_nvme_mmio_asqaddr(data);
1018         break;
1019     case 0x2c:  /* ASQ hi */
1020         n->bar.asq |= data << 32;
1021         trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
1022         break;
1023     case 0x30:  /* ACQ */
1024         trace_nvme_mmio_acqaddr(data);
1025         n->bar.acq = data;
1026         break;
1027     case 0x34:  /* ACQ hi */
1028         n->bar.acq |= data << 32;
1029         trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
1030         break;
1031     case 0x38:  /* CMBLOC */
1032         NVME_GUEST_ERR(nvme_ub_mmiowr_cmbloc_reserved,
1033                        "invalid write to reserved CMBLOC"
1034                        " when CMBSZ is zero, ignored");
1035         return;
1036     case 0x3C:  /* CMBSZ */
1037         NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
1038                        "invalid write to read only CMBSZ, ignored");
1039         return;
1040     default:
1041         NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
1042                        "invalid MMIO write,"
1043                        " offset=0x%"PRIx64", data=%"PRIx64"",
1044                        offset, data);
1045         break;
1046     }
1047 }
1048
1049 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
1050 {
1051     NvmeCtrl *n = (NvmeCtrl *)opaque;
1052     uint8_t *ptr = (uint8_t *)&n->bar;
1053     uint64_t val = 0;
1054
1055     if (unlikely(addr & (sizeof(uint32_t) - 1))) {
1056         NVME_GUEST_ERR(nvme_ub_mmiord_misaligned32,
1057                        "MMIO read not 32-bit aligned,"
1058                        " offset=0x%"PRIx64"", addr);
1059         /* should RAZ, fall through for now */
1060     } else if (unlikely(size < sizeof(uint32_t))) {
1061         NVME_GUEST_ERR(nvme_ub_mmiord_toosmall,
1062                        "MMIO read smaller than 32-bits,"
1063                        " offset=0x%"PRIx64"", addr);
1064         /* should RAZ, fall through for now */
1065     }
1066
1067     if (addr < sizeof(n->bar)) {
1068         memcpy(&val, ptr + addr, size);
1069     } else {
1070         NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
1071                        "MMIO read beyond last register,"
1072                        " offset=0x%"PRIx64", returning 0", addr);
1073     }
1074
1075     return val;
1076 }
1077
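/*
 * Handle a doorbell write.  Doorbells start at offset 0x1000 with a stride of
 * 4 bytes: even dword offsets are submission queue tail doorbells, odd dword
 * offsets are completion queue head doorbells for the corresponding queue id.
 */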
1078 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
1079 {
1080     uint32_t qid;
1081
1082     if (unlikely(addr & ((1 << 2) - 1))) {
1083         NVME_GUEST_ERR(nvme_ub_db_wr_misaligned,
1084                        "doorbell write not 32-bit aligned,"
1085                        " offset=0x%"PRIx64", ignoring", addr);
1086         return;
1087     }
1088
1089     if (((addr - 0x1000) >> 2) & 1) {
1090         /* Completion queue doorbell write */
1091
1092         uint16_t new_head = val & 0xffff;
1093         int start_sqs;
1094         NvmeCQueue *cq;
1095
1096         qid = (addr - (0x1000 + (1 << 2))) >> 3;
1097         if (unlikely(nvme_check_cqid(n, qid))) {
1098             NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
1099                            "completion queue doorbell write"
1100                            " for nonexistent queue,"
1101                            " sqid=%"PRIu32", ignoring", qid);
1102             return;
1103         }
1104
1105         cq = n->cq[qid];
1106         if (unlikely(new_head >= cq->size)) {
1107             NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cqhead,
1108                            "completion queue doorbell write value"
1109                            " beyond queue size, sqid=%"PRIu32","
1110                            " new_head=%"PRIu16", ignoring",
1111                            qid, new_head);
1112             return;
1113         }
1114
1115         start_sqs = nvme_cq_full(cq) ? 1 : 0;
1116         cq->head = new_head;
1117         if (start_sqs) {
1118             NvmeSQueue *sq;
1119             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
1120                 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1121             }
1122             timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1123         }
1124
1125         if (cq->tail == cq->head) {
1126             nvme_irq_deassert(n, cq);
1127         }
1128     } else {
1129         /* Submission queue doorbell write */
1130
1131         uint16_t new_tail = val & 0xffff;
1132         NvmeSQueue *sq;
1133
1134         qid = (addr - 0x1000) >> 3;
1135         if (unlikely(nvme_check_sqid(n, qid))) {
1136             NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
1137                            "submission queue doorbell write"
1138                            " for nonexistent queue,"
1139                            " sqid=%"PRIu32", ignoring", qid);
1140             return;
1141         }
1142
1143         sq = n->sq[qid];
1144         if (unlikely(new_tail >= sq->size)) {
1145             NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sqtail,
1146                            "submission queue doorbell write value"
1147                            " beyond queue size, sqid=%"PRIu32","
1148                            " new_tail=%"PRIu16", ignoring",
1149                            qid, new_tail);
1150             return;
1151         }
1152
1153         sq->tail = new_tail;
1154         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1155     }
1156 }
1157
1158 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
1159     unsigned size)
1160 {
1161     NvmeCtrl *n = (NvmeCtrl *)opaque;
1162     if (addr < sizeof(n->bar)) {
1163         nvme_write_bar(n, addr, data, size);
1164     } else if (addr >= 0x1000) {
1165         nvme_process_db(n, addr, data);
1166     }
1167 }
1168
1169 static const MemoryRegionOps nvme_mmio_ops = {
1170     .read = nvme_mmio_read,
1171     .write = nvme_mmio_write,
1172     .endianness = DEVICE_LITTLE_ENDIAN,
1173     .impl = {
1174         .min_access_size = 2,
1175         .max_access_size = 8,
1176     },
1177 };
1178
1179 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
1180     unsigned size)
1181 {
1182     NvmeCtrl *n = (NvmeCtrl *)opaque;
1183     stn_le_p(&n->cmbuf[addr], size, data);
1184 }
1185
1186 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
1187 {
1188     NvmeCtrl *n = (NvmeCtrl *)opaque;
1189     return ldn_le_p(&n->cmbuf[addr], size);
1190 }
1191
1192 static const MemoryRegionOps nvme_cmb_ops = {
1193     .read = nvme_cmb_read,
1194     .write = nvme_cmb_write,
1195     .endianness = DEVICE_LITTLE_ENDIAN,
1196     .impl = {
1197         .min_access_size = 1,
1198         .max_access_size = 8,
1199     },
1200 };
1201
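/*
 * Realize the PCI device: validate the drive, serial and num_queues
 * properties, set up the BAR0 register/doorbell region, MSI-X, the optional
 * CMB in BAR2, and populate the identify controller and namespace data.
 */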
1202 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
1203 {
1204     NvmeCtrl *n = NVME(pci_dev);
1205     NvmeIdCtrl *id = &n->id_ctrl;
1206
1207     int i;
1208     int64_t bs_size;
1209     uint8_t *pci_conf;
1210
1211     if (!n->num_queues) {
1212         error_setg(errp, "num_queues can't be zero");
1213         return;
1214     }
1215
1216     if (!n->conf.blk) {
1217         error_setg(errp, "drive property not set");
1218         return;
1219     }
1220
1221     bs_size = blk_getlength(n->conf.blk);
1222     if (bs_size < 0) {
1223         error_setg(errp, "could not get backing file size");
1224         return;
1225     }
1226
1227     if (!n->serial) {
1228         error_setg(errp, "serial property not set");
1229         return;
1230     }
1231     blkconf_blocksizes(&n->conf);
1232     if (!blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
1233                                        false, errp)) {
1234         return;
1235     }
1236
1237     pci_conf = pci_dev->config;
1238     pci_conf[PCI_INTERRUPT_PIN] = 1;
1239     pci_config_set_prog_interface(pci_dev->config, 0x2);
1240     pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
1241     pcie_endpoint_cap_init(pci_dev, 0x80);
1242
1243     n->num_namespaces = 1;
1244     n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
1245     n->ns_size = bs_size / (uint64_t)n->num_namespaces;
1246
1247     n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
1248     n->sq = g_new0(NvmeSQueue *, n->num_queues);
1249     n->cq = g_new0(NvmeCQueue *, n->num_queues);
1250
1251     memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
1252                           "nvme", n->reg_size);
1253     pci_register_bar(pci_dev, 0,
1254         PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
1255         &n->iomem);
1256     msix_init_exclusive_bar(pci_dev, n->num_queues, 4, NULL);
1257
1258     id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
1259     id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
1260     strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
1261     strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
1262     strpadcpy((char *)id->sn, sizeof(id->sn), n->serial, ' ');
1263     id->rab = 6;
1264     id->ieee[0] = 0x00;
1265     id->ieee[1] = 0x02;
1266     id->ieee[2] = 0xb3;
1267     id->oacs = cpu_to_le16(0);
1268     id->frmw = 7 << 1;
1269     id->lpa = 1 << 0;
1270     id->sqes = (0x6 << 4) | 0x6;
1271     id->cqes = (0x4 << 4) | 0x4;
1272     id->nn = cpu_to_le32(n->num_namespaces);
1273     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS);
1274     id->psd[0].mp = cpu_to_le16(0x9c4);
1275     id->psd[0].enlat = cpu_to_le32(0x10);
1276     id->psd[0].exlat = cpu_to_le32(0x4);
1277     if (blk_enable_write_cache(n->conf.blk)) {
1278         id->vwc = 1;
1279     }
1280
1281     n->bar.cap = 0;
1282     NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
1283     NVME_CAP_SET_CQR(n->bar.cap, 1);
1284     NVME_CAP_SET_AMS(n->bar.cap, 1);
1285     NVME_CAP_SET_TO(n->bar.cap, 0xf);
1286     NVME_CAP_SET_CSS(n->bar.cap, 1);
1287     NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
1288
1289     n->bar.vs = 0x00010200;
1290     n->bar.intmc = n->bar.intms = 0;
1291
1292     if (n->cmb_size_mb) {
1293
1294         NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
1295         NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
1296
1297         NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
1298         NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
1299         NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
1300         NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
1301         NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
1302         NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
1303         NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->cmb_size_mb);
1304
1305         n->cmbloc = n->bar.cmbloc;
1306         n->cmbsz = n->bar.cmbsz;
1307
1308         n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
1309         memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
1310                               "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
1311         pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
1312             PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
1313             PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
1314
1315     }
1316
1317     for (i = 0; i < n->num_namespaces; i++) {
1318         NvmeNamespace *ns = &n->namespaces[i];
1319         NvmeIdNs *id_ns = &ns->id_ns;
1320         id_ns->nsfeat = 0;
1321         id_ns->nlbaf = 0;
1322         id_ns->flbas = 0;
1323         id_ns->mc = 0;
1324         id_ns->dpc = 0;
1325         id_ns->dps = 0;
1326         id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
1327         id_ns->ncap  = id_ns->nuse = id_ns->nsze =
1328             cpu_to_le64(n->ns_size >>
1329                 id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)].ds);
1330     }
1331 }
1332
1333 static void nvme_exit(PCIDevice *pci_dev)
1334 {
1335     NvmeCtrl *n = NVME(pci_dev);
1336
1337     nvme_clear_ctrl(n);
1338     g_free(n->namespaces);
1339     g_free(n->cq);
1340     g_free(n->sq);
1341
1342     if (n->cmb_size_mb) {
1343         g_free(n->cmbuf);
1344     }
1345     msix_uninit_exclusive_bar(pci_dev);
1346 }
1347
1348 static Property nvme_props[] = {
1349     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
1350     DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
1351     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, cmb_size_mb, 0),
1352     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, num_queues, 64),
1353     DEFINE_PROP_END_OF_LIST(),
1354 };
1355
1356 static const VMStateDescription nvme_vmstate = {
1357     .name = "nvme",
1358     .unmigratable = 1,
1359 };
1360
1361 static void nvme_class_init(ObjectClass *oc, void *data)
1362 {
1363     DeviceClass *dc = DEVICE_CLASS(oc);
1364     PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
1365
1366     pc->realize = nvme_realize;
1367     pc->exit = nvme_exit;
1368     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
1369     pc->vendor_id = PCI_VENDOR_ID_INTEL;
1370     pc->device_id = 0x5845;
1371     pc->revision = 2;
1372
1373     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
1374     dc->desc = "Non-Volatile Memory Express";
1375     dc->props = nvme_props;
1376     dc->vmsd = &nvme_vmstate;
1377 }
1378
1379 static void nvme_instance_init(Object *obj)
1380 {
1381     NvmeCtrl *s = NVME(obj);
1382
1383     device_add_bootindex_property(obj, &s->conf.bootindex,
1384                                   "bootindex", "/namespace@1,0",
1385                                   DEVICE(obj), &error_abort);
1386 }
1387
1388 static const TypeInfo nvme_info = {
1389     .name          = TYPE_NVME,
1390     .parent        = TYPE_PCI_DEVICE,
1391     .instance_size = sizeof(NvmeCtrl),
1392     .class_init    = nvme_class_init,
1393     .instance_init = nvme_instance_init,
1394     .interfaces = (InterfaceInfo[]) {
1395         { INTERFACE_PCIE_DEVICE },
1396         { }
1397     },
1398 };
1399
1400 static void nvme_register_types(void)
1401 {
1402     type_register_static(&nvme_info);
1403 }
1404
1405 type_init(nvme_register_types)