linux.git: drivers/nvme/target/pci-epf.c (Linux 6.14-rc3)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe PCI Endpoint Function target driver.
4  *
5  * Copyright (c) 2024, Western Digital Corporation or its affiliates.
6  * Copyright (c) 2024, Rick Wertenbroek <[email protected]>
7  *                     REDS Institute, HEIG-VD, HES-SO, Switzerland
8  */
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11 #include <linux/delay.h>
12 #include <linux/dmaengine.h>
13 #include <linux/io.h>
14 #include <linux/mempool.h>
15 #include <linux/module.h>
16 #include <linux/mutex.h>
17 #include <linux/nvme.h>
18 #include <linux/pci_ids.h>
19 #include <linux/pci-epc.h>
20 #include <linux/pci-epf.h>
21 #include <linux/pci_regs.h>
22 #include <linux/slab.h>
23
24 #include "nvmet.h"
25
26 static LIST_HEAD(nvmet_pci_epf_ports);
27 static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex);
28
29 /*
30  * Default and maximum allowed data transfer size. For the default,
31  * allow up to 128 page-sized segments. For the maximum allowed,
32  * use 4 times the default (which is completely arbitrary).
33  */
34 #define NVMET_PCI_EPF_MAX_SEGS          128
35 #define NVMET_PCI_EPF_MDTS_KB           \
36         (NVMET_PCI_EPF_MAX_SEGS << (PAGE_SHIFT - 10))
37 #define NVMET_PCI_EPF_MAX_MDTS_KB       (NVMET_PCI_EPF_MDTS_KB * 4)
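
/*
 * Worked example for the macros above, assuming 4 KiB pages (PAGE_SHIFT == 12):
 * NVMET_PCI_EPF_MDTS_KB     = 128 << (12 - 10) = 512 KB
 * NVMET_PCI_EPF_MAX_MDTS_KB = 512 * 4          = 2048 KB
 */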
38
39 /*
40  * IRQ vector coalescing threshold: by default, post 8 CQEs before raising an
41  * interrupt vector to the host. This default of 8 is completely arbitrary and
42  * can be changed by the host with an nvme_set_features command.
43  */
44 #define NVMET_PCI_EPF_IV_THRESHOLD      8
45
46 /*
47  * BAR CC register and SQ polling intervals.
48  */
49 #define NVMET_PCI_EPF_CC_POLL_INTERVAL  msecs_to_jiffies(5)
50 #define NVMET_PCI_EPF_SQ_POLL_INTERVAL  msecs_to_jiffies(5)
51 #define NVMET_PCI_EPF_SQ_POLL_IDLE      msecs_to_jiffies(5000)
52
53 /*
54  * SQ arbitration burst default: fetch at most 8 commands at a time from an SQ.
55  */
56 #define NVMET_PCI_EPF_SQ_AB             8
57
58 /*
59  * Handling of CQs is normally immediate, unless we fail to map a CQ or the CQ
60  * is full, in which case we retry the CQ processing after this interval.
61  */
62 #define NVMET_PCI_EPF_CQ_RETRY_INTERVAL msecs_to_jiffies(1)
63
64 enum nvmet_pci_epf_queue_flags {
65         NVMET_PCI_EPF_Q_IS_SQ = 0,      /* The queue is a submission queue */
66         NVMET_PCI_EPF_Q_LIVE,           /* The queue is live */
67         NVMET_PCI_EPF_Q_IRQ_ENABLED,    /* IRQ is enabled for this queue */
68 };
69
70 /*
71  * IRQ vector descriptor.
72  */
73 struct nvmet_pci_epf_irq_vector {
74         unsigned int    vector;
75         unsigned int    ref;
76         bool            cd;
77         int             nr_irqs;
78 };
79
80 struct nvmet_pci_epf_queue {
81         union {
82                 struct nvmet_sq         nvme_sq;
83                 struct nvmet_cq         nvme_cq;
84         };
85         struct nvmet_pci_epf_ctrl       *ctrl;
86         unsigned long                   flags;
87
88         u64                             pci_addr;
89         size_t                          pci_size;
90         struct pci_epc_map              pci_map;
91
92         u16                             qid;
93         u16                             depth;
94         u16                             vector;
95         u16                             head;
96         u16                             tail;
97         u16                             phase;
98         u32                             db;
99
100         size_t                          qes;
101
102         struct nvmet_pci_epf_irq_vector *iv;
103         struct workqueue_struct         *iod_wq;
104         struct delayed_work             work;
105         spinlock_t                      lock;
106         struct list_head                list;
107 };
108
109 /*
110  * PCI Root Complex (RC) address data segment for mapping an admin or
111  * I/O command buffer @buf of @length bytes to the PCI address @pci_addr.
112  */
113 struct nvmet_pci_epf_segment {
114         void                            *buf;
115         u64                             pci_addr;
116         u32                             length;
117 };
118
119 /*
120  * Command descriptors.
121  */
122 struct nvmet_pci_epf_iod {
123         struct list_head                link;
124
125         struct nvmet_req                req;
126         struct nvme_command             cmd;
127         struct nvme_completion          cqe;
128         unsigned int                    status;
129
130         struct nvmet_pci_epf_ctrl       *ctrl;
131
132         struct nvmet_pci_epf_queue      *sq;
133         struct nvmet_pci_epf_queue      *cq;
134
135         /* Data transfer size and direction for the command. */
136         size_t                          data_len;
137         enum dma_data_direction         dma_dir;
138
139         /*
140          * PCI Root Complex (RC) address data segments: if nr_data_segs is 1, we
141          * use only @data_seg. Otherwise, the array of segments @data_segs is
142          * allocated to manage multiple PCI address data segments. @data_sgl and
143          * @data_sgt are used to setup the command request for execution by the
144          * target core.
145          */
146         unsigned int                    nr_data_segs;
147         struct nvmet_pci_epf_segment    data_seg;
148         struct nvmet_pci_epf_segment    *data_segs;
149         struct scatterlist              data_sgl;
150         struct sg_table                 data_sgt;
151
152         struct work_struct              work;
153         struct completion               done;
154 };
155
156 /*
157  * PCI target controller private data.
158  */
159 struct nvmet_pci_epf_ctrl {
160         struct nvmet_pci_epf            *nvme_epf;
161         struct nvmet_port               *port;
162         struct nvmet_ctrl               *tctrl;
163         struct device                   *dev;
164
165         unsigned int                    nr_queues;
166         struct nvmet_pci_epf_queue      *sq;
167         struct nvmet_pci_epf_queue      *cq;
168         unsigned int                    sq_ab;
169
170         mempool_t                       iod_pool;
171         void                            *bar;
172         u64                             cap;
173         u32                             cc;
174         u32                             csts;
175
176         size_t                          io_sqes;
177         size_t                          io_cqes;
178
179         size_t                          mps_shift;
180         size_t                          mps;
181         size_t                          mps_mask;
182
183         unsigned int                    mdts;
184
185         struct delayed_work             poll_cc;
186         struct delayed_work             poll_sqs;
187
188         struct mutex                    irq_lock;
189         struct nvmet_pci_epf_irq_vector *irq_vectors;
190         unsigned int                    irq_vector_threshold;
191
192         bool                            link_up;
193         bool                            enabled;
194 };
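
/*
 * Memory page size illustration for the fields above, assuming the host
 * enables the controller with CC.MPS == 0 (4 KiB pages): mps_shift == 12,
 * mps == 4096 and mps_mask == 4095. These are the values the PRP parsing
 * helpers below use to split transfers into page-sized chunks.
 */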
195
196 /*
197  * PCI EPF driver private data.
198  */
199 struct nvmet_pci_epf {
200         struct pci_epf                  *epf;
201
202         const struct pci_epc_features   *epc_features;
203
204         void                            *reg_bar;
205         size_t                          msix_table_offset;
206
207         unsigned int                    irq_type;
208         unsigned int                    nr_vectors;
209
210         struct nvmet_pci_epf_ctrl       ctrl;
211
212         bool                            dma_enabled;
213         struct dma_chan                 *dma_tx_chan;
214         struct mutex                    dma_tx_lock;
215         struct dma_chan                 *dma_rx_chan;
216         struct mutex                    dma_rx_lock;
217
218         struct mutex                    mmio_lock;
219
220         /* PCI endpoint function configfs attributes. */
221         struct config_group             group;
222         __le16                          portid;
223         char                            subsysnqn[NVMF_NQN_SIZE];
224         unsigned int                    mdts_kb;
225 };
226
227 static inline u32 nvmet_pci_epf_bar_read32(struct nvmet_pci_epf_ctrl *ctrl,
228                                            u32 off)
229 {
230         __le32 *bar_reg = ctrl->bar + off;
231
232         return le32_to_cpu(READ_ONCE(*bar_reg));
233 }
234
235 static inline void nvmet_pci_epf_bar_write32(struct nvmet_pci_epf_ctrl *ctrl,
236                                              u32 off, u32 val)
237 {
238         __le32 *bar_reg = ctrl->bar + off;
239
240         WRITE_ONCE(*bar_reg, cpu_to_le32(val));
241 }
242
243 static inline u64 nvmet_pci_epf_bar_read64(struct nvmet_pci_epf_ctrl *ctrl,
244                                            u32 off)
245 {
246         return (u64)nvmet_pci_epf_bar_read32(ctrl, off) |
247                 ((u64)nvmet_pci_epf_bar_read32(ctrl, off + 4) << 32);
248 }
249
250 static inline void nvmet_pci_epf_bar_write64(struct nvmet_pci_epf_ctrl *ctrl,
251                                              u32 off, u64 val)
252 {
253         nvmet_pci_epf_bar_write32(ctrl, off, val & 0xFFFFFFFF);
254         nvmet_pci_epf_bar_write32(ctrl, off + 4, (val >> 32) & 0xFFFFFFFF);
255 }
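
/*
 * Illustrative use of the 64-bit accessors above: the NVMe CAP register at
 * BAR offset 0x0 (NVME_REG_CAP) spans two dwords, so it can for example be
 * published to the host with
 *
 *	nvmet_pci_epf_bar_write64(ctrl, NVME_REG_CAP, ctrl->cap);
 *
 * and read back with nvmet_pci_epf_bar_read64(ctrl, NVME_REG_CAP), the low
 * dword coming from offset 0x0 and the high dword from offset 0x4.
 */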
256
257 static inline int nvmet_pci_epf_mem_map(struct nvmet_pci_epf *nvme_epf,
258                 u64 pci_addr, size_t size, struct pci_epc_map *map)
259 {
260         struct pci_epf *epf = nvme_epf->epf;
261
262         return pci_epc_mem_map(epf->epc, epf->func_no, epf->vfunc_no,
263                                pci_addr, size, map);
264 }
265
266 static inline void nvmet_pci_epf_mem_unmap(struct nvmet_pci_epf *nvme_epf,
267                                            struct pci_epc_map *map)
268 {
269         struct pci_epf *epf = nvme_epf->epf;
270
271         pci_epc_mem_unmap(epf->epc, epf->func_no, epf->vfunc_no, map);
272 }
273
274 struct nvmet_pci_epf_dma_filter {
275         struct device *dev;
276         u32 dma_mask;
277 };
278
279 static bool nvmet_pci_epf_dma_filter(struct dma_chan *chan, void *arg)
280 {
281         struct nvmet_pci_epf_dma_filter *filter = arg;
282         struct dma_slave_caps caps;
283
284         memset(&caps, 0, sizeof(caps));
285         dma_get_slave_caps(chan, &caps);
286
287         return chan->device->dev == filter->dev &&
288                 (filter->dma_mask & caps.directions);
289 }
290
291 static void nvmet_pci_epf_init_dma(struct nvmet_pci_epf *nvme_epf)
292 {
293         struct pci_epf *epf = nvme_epf->epf;
294         struct device *dev = &epf->dev;
295         struct nvmet_pci_epf_dma_filter filter;
296         struct dma_chan *chan;
297         dma_cap_mask_t mask;
298
299         mutex_init(&nvme_epf->dma_rx_lock);
300         mutex_init(&nvme_epf->dma_tx_lock);
301
302         dma_cap_zero(mask);
303         dma_cap_set(DMA_SLAVE, mask);
304
305         filter.dev = epf->epc->dev.parent;
306         filter.dma_mask = BIT(DMA_DEV_TO_MEM);
307
308         chan = dma_request_channel(mask, nvmet_pci_epf_dma_filter, &filter);
309         if (!chan)
310                 goto out_dma_no_rx;
311
312         nvme_epf->dma_rx_chan = chan;
313
314         filter.dma_mask = BIT(DMA_MEM_TO_DEV);
315         chan = dma_request_channel(mask, nvmet_pci_epf_dma_filter, &filter);
316         if (!chan)
317                 goto out_dma_no_tx;
318
319         nvme_epf->dma_tx_chan = chan;
320
321         nvme_epf->dma_enabled = true;
322
323         dev_dbg(dev, "Using DMA RX channel %s, maximum segment size %u B\n",
324                 dma_chan_name(chan),
325                 dma_get_max_seg_size(dmaengine_get_dma_device(chan)));
326
327         dev_dbg(dev, "Using DMA TX channel %s, maximum segment size %u B\n",
328                 dma_chan_name(chan),
329                 dma_get_max_seg_size(dmaengine_get_dma_device(chan)));
330
331         return;
332
333 out_dma_no_tx:
334         dma_release_channel(nvme_epf->dma_rx_chan);
335         nvme_epf->dma_rx_chan = NULL;
336
337 out_dma_no_rx:
338         mutex_destroy(&nvme_epf->dma_rx_lock);
339         mutex_destroy(&nvme_epf->dma_tx_lock);
340         nvme_epf->dma_enabled = false;
341
342         dev_info(&epf->dev, "DMA not supported, falling back to MMIO\n");
343 }
344
345 static void nvmet_pci_epf_deinit_dma(struct nvmet_pci_epf *nvme_epf)
346 {
347         if (!nvme_epf->dma_enabled)
348                 return;
349
350         dma_release_channel(nvme_epf->dma_tx_chan);
351         nvme_epf->dma_tx_chan = NULL;
352         dma_release_channel(nvme_epf->dma_rx_chan);
353         nvme_epf->dma_rx_chan = NULL;
354         mutex_destroy(&nvme_epf->dma_rx_lock);
355         mutex_destroy(&nvme_epf->dma_tx_lock);
356         nvme_epf->dma_enabled = false;
357 }
358
359 static int nvmet_pci_epf_dma_transfer(struct nvmet_pci_epf *nvme_epf,
360                 struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
361 {
362         struct pci_epf *epf = nvme_epf->epf;
363         struct dma_async_tx_descriptor *desc;
364         struct dma_slave_config sconf = {};
365         struct device *dev = &epf->dev;
366         struct device *dma_dev;
367         struct dma_chan *chan;
368         dma_cookie_t cookie;
369         dma_addr_t dma_addr;
370         struct mutex *lock;
371         int ret;
372
373         switch (dir) {
374         case DMA_FROM_DEVICE:
375                 lock = &nvme_epf->dma_rx_lock;
376                 chan = nvme_epf->dma_rx_chan;
377                 sconf.direction = DMA_DEV_TO_MEM;
378                 sconf.src_addr = seg->pci_addr;
379                 break;
380         case DMA_TO_DEVICE:
381                 lock = &nvme_epf->dma_tx_lock;
382                 chan = nvme_epf->dma_tx_chan;
383                 sconf.direction = DMA_MEM_TO_DEV;
384                 sconf.dst_addr = seg->pci_addr;
385                 break;
386         default:
387                 return -EINVAL;
388         }
389
390         mutex_lock(lock);
391
392         dma_dev = dmaengine_get_dma_device(chan);
393         dma_addr = dma_map_single(dma_dev, seg->buf, seg->length, dir);
394         ret = dma_mapping_error(dma_dev, dma_addr);
395         if (ret)
396                 goto unlock;
397
398         ret = dmaengine_slave_config(chan, &sconf);
399         if (ret) {
400                 dev_err(dev, "Failed to configure DMA channel\n");
401                 goto unmap;
402         }
403
404         desc = dmaengine_prep_slave_single(chan, dma_addr, seg->length,
405                                            sconf.direction, DMA_CTRL_ACK);
406         if (!desc) {
407                 dev_err(dev, "Failed to prepare DMA\n");
408                 ret = -EIO;
409                 goto unmap;
410         }
411
412         cookie = dmaengine_submit(desc);
413         ret = dma_submit_error(cookie);
414         if (ret) {
415                 dev_err(dev, "Failed to do DMA submit (err=%d)\n", ret);
416                 goto unmap;
417         }
418
419         if (dma_sync_wait(chan, cookie) != DMA_COMPLETE) {
420                 dev_err(dev, "DMA transfer failed\n");
421                 ret = -EIO;
422         }
423
424         dmaengine_terminate_sync(chan);
425
426 unmap:
427         dma_unmap_single(dma_dev, dma_addr, seg->length, dir);
428
429 unlock:
430         mutex_unlock(lock);
431
432         return ret;
433 }
434
435 static int nvmet_pci_epf_mmio_transfer(struct nvmet_pci_epf *nvme_epf,
436                 struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
437 {
438         u64 pci_addr = seg->pci_addr;
439         u32 length = seg->length;
440         void *buf = seg->buf;
441         struct pci_epc_map map;
442         int ret = -EINVAL;
443
444         /*
445          * Note: MMIO transfers do not need serialization but this is a
446          * simple way to avoid using too many mapping windows.
447          */
448         mutex_lock(&nvme_epf->mmio_lock);
449
450         while (length) {
451                 ret = nvmet_pci_epf_mem_map(nvme_epf, pci_addr, length, &map);
452                 if (ret)
453                         break;
454
455                 switch (dir) {
456                 case DMA_FROM_DEVICE:
457                         memcpy_fromio(buf, map.virt_addr, map.pci_size);
458                         break;
459                 case DMA_TO_DEVICE:
460                         memcpy_toio(map.virt_addr, buf, map.pci_size);
461                         break;
462                 default:
463                         ret = -EINVAL;
464                         goto unlock;
465                 }
466
467                 pci_addr += map.pci_size;
468                 buf += map.pci_size;
469                 length -= map.pci_size;
470
471                 nvmet_pci_epf_mem_unmap(nvme_epf, &map);
472         }
473
474 unlock:
475         mutex_unlock(&nvme_epf->mmio_lock);
476
477         return ret;
478 }
479
480 static inline int nvmet_pci_epf_transfer_seg(struct nvmet_pci_epf *nvme_epf,
481                 struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
482 {
483         if (nvme_epf->dma_enabled)
484                 return nvmet_pci_epf_dma_transfer(nvme_epf, seg, dir);
485
486         return nvmet_pci_epf_mmio_transfer(nvme_epf, seg, dir);
487 }
488
489 static inline int nvmet_pci_epf_transfer(struct nvmet_pci_epf_ctrl *ctrl,
490                                          void *buf, u64 pci_addr, u32 length,
491                                          enum dma_data_direction dir)
492 {
493         struct nvmet_pci_epf_segment seg = {
494                 .buf = buf,
495                 .pci_addr = pci_addr,
496                 .length = length,
497         };
498
499         return nvmet_pci_epf_transfer_seg(ctrl->nvme_epf, &seg, dir);
500 }
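
/*
 * Illustrative sketch of how the wrapper above is typically used: fetching a
 * single submission queue entry from the host-resident SQ would look like
 *
 *	ret = nvmet_pci_epf_transfer(ctrl, &iod->cmd,
 *				     sq->pci_addr + sq->head * sq->qes,
 *				     sq->qes, DMA_FROM_DEVICE);
 *
 * that is, one local buffer, one host PCI address and one length, with the
 * DMA vs MMIO choice hidden behind nvmet_pci_epf_transfer_seg().
 */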
501
502 static int nvmet_pci_epf_alloc_irq_vectors(struct nvmet_pci_epf_ctrl *ctrl)
503 {
504         ctrl->irq_vectors = kcalloc(ctrl->nr_queues,
505                                     sizeof(struct nvmet_pci_epf_irq_vector),
506                                     GFP_KERNEL);
507         if (!ctrl->irq_vectors)
508                 return -ENOMEM;
509
510         mutex_init(&ctrl->irq_lock);
511
512         return 0;
513 }
514
515 static void nvmet_pci_epf_free_irq_vectors(struct nvmet_pci_epf_ctrl *ctrl)
516 {
517         if (ctrl->irq_vectors) {
518                 mutex_destroy(&ctrl->irq_lock);
519                 kfree(ctrl->irq_vectors);
520                 ctrl->irq_vectors = NULL;
521         }
522 }
523
524 static struct nvmet_pci_epf_irq_vector *
525 nvmet_pci_epf_find_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, u16 vector)
526 {
527         struct nvmet_pci_epf_irq_vector *iv;
528         int i;
529
530         lockdep_assert_held(&ctrl->irq_lock);
531
532         for (i = 0; i < ctrl->nr_queues; i++) {
533                 iv = &ctrl->irq_vectors[i];
534                 if (iv->ref && iv->vector == vector)
535                         return iv;
536         }
537
538         return NULL;
539 }
540
541 static struct nvmet_pci_epf_irq_vector *
542 nvmet_pci_epf_add_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, u16 vector)
543 {
544         struct nvmet_pci_epf_irq_vector *iv;
545         int i;
546
547         mutex_lock(&ctrl->irq_lock);
548
549         iv = nvmet_pci_epf_find_irq_vector(ctrl, vector);
550         if (iv) {
551                 iv->ref++;
552                 goto unlock;
553         }
554
555         for (i = 0; i < ctrl->nr_queues; i++) {
556                 iv = &ctrl->irq_vectors[i];
557                 if (!iv->ref)
558                         break;
559         }
560
561         if (WARN_ON_ONCE(!iv))
562                 goto unlock;
563
564         iv->ref = 1;
565         iv->vector = vector;
566         iv->nr_irqs = 0;
567
568 unlock:
569         mutex_unlock(&ctrl->irq_lock);
570
571         return iv;
572 }
573
574 static void nvmet_pci_epf_remove_irq_vector(struct nvmet_pci_epf_ctrl *ctrl,
575                                             u16 vector)
576 {
577         struct nvmet_pci_epf_irq_vector *iv;
578
579         mutex_lock(&ctrl->irq_lock);
580
581         iv = nvmet_pci_epf_find_irq_vector(ctrl, vector);
582         if (iv) {
583                 iv->ref--;
584                 if (!iv->ref) {
585                         iv->vector = 0;
586                         iv->nr_irqs = 0;
587                 }
588         }
589
590         mutex_unlock(&ctrl->irq_lock);
591 }
592
593 static bool nvmet_pci_epf_should_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
594                 struct nvmet_pci_epf_queue *cq, bool force)
595 {
596         struct nvmet_pci_epf_irq_vector *iv = cq->iv;
597         bool ret;
598
599         if (!test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
600                 return false;
601
602         /* IRQ coalescing for the admin queue is not allowed. */
603         if (!cq->qid)
604                 return true;
605
606         if (iv->cd)
607                 return true;
608
609         if (force) {
610                 ret = iv->nr_irqs > 0;
611         } else {
612                 iv->nr_irqs++;
613                 ret = iv->nr_irqs >= ctrl->irq_vector_threshold;
614         }
615         if (ret)
616                 iv->nr_irqs = 0;
617
618         return ret;
619 }
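
/*
 * Coalescing example for the function above, with the default threshold of 8
 * and coalescing not disabled for the vector (iv->cd == false): the first 7
 * CQEs posted on the vector only increment iv->nr_irqs, the 8th raises the
 * interrupt and resets the counter. A forced call raises an interrupt only
 * if at least one CQE is still pending on the vector.
 */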
620
621 static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
622                 struct nvmet_pci_epf_queue *cq, bool force)
623 {
624         struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf;
625         struct pci_epf *epf = nvme_epf->epf;
626         int ret = 0;
627
628         if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
629                 return;
630
631         mutex_lock(&ctrl->irq_lock);
632
633         if (!nvmet_pci_epf_should_raise_irq(ctrl, cq, force))
634                 goto unlock;
635
636         switch (nvme_epf->irq_type) {
637         case PCI_IRQ_MSIX:
638         case PCI_IRQ_MSI:
639                 ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no,
640                                         nvme_epf->irq_type, cq->vector + 1);
641                 if (!ret)
642                         break;
643                 /*
644                  * If we got an error, it is likely because the host is using
645                  * legacy IRQs (e.g. BIOS, grub).
646                  */
647                 fallthrough;
648         case PCI_IRQ_INTX:
649                 ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no,
650                                         PCI_IRQ_INTX, 0);
651                 break;
652         default:
653                 WARN_ON_ONCE(1);
654                 ret = -EINVAL;
655                 break;
656         }
657
658         if (ret)
659                 dev_err(ctrl->dev, "Failed to raise IRQ (err=%d)\n", ret);
660
661 unlock:
662         mutex_unlock(&ctrl->irq_lock);
663 }
664
665 static inline const char *nvmet_pci_epf_iod_name(struct nvmet_pci_epf_iod *iod)
666 {
667         return nvme_opcode_str(iod->sq->qid, iod->cmd.common.opcode);
668 }
669
670 static void nvmet_pci_epf_exec_iod_work(struct work_struct *work);
671
672 static struct nvmet_pci_epf_iod *
673 nvmet_pci_epf_alloc_iod(struct nvmet_pci_epf_queue *sq)
674 {
675         struct nvmet_pci_epf_ctrl *ctrl = sq->ctrl;
676         struct nvmet_pci_epf_iod *iod;
677
678         iod = mempool_alloc(&ctrl->iod_pool, GFP_KERNEL);
679         if (unlikely(!iod))
680                 return NULL;
681
682         memset(iod, 0, sizeof(*iod));
683         iod->req.cmd = &iod->cmd;
684         iod->req.cqe = &iod->cqe;
685         iod->req.port = ctrl->port;
686         iod->ctrl = ctrl;
687         iod->sq = sq;
688         iod->cq = &ctrl->cq[sq->qid];
689         INIT_LIST_HEAD(&iod->link);
690         iod->dma_dir = DMA_NONE;
691         INIT_WORK(&iod->work, nvmet_pci_epf_exec_iod_work);
692         init_completion(&iod->done);
693
694         return iod;
695 }
696
697 /*
698  * Allocate or grow a command table of PCI segments.
699  */
700 static int nvmet_pci_epf_alloc_iod_data_segs(struct nvmet_pci_epf_iod *iod,
701                                              int nsegs)
702 {
703         struct nvmet_pci_epf_segment *segs;
704         int nr_segs = iod->nr_data_segs + nsegs;
705
706         segs = krealloc(iod->data_segs,
707                         nr_segs * sizeof(struct nvmet_pci_epf_segment),
708                         GFP_KERNEL | __GFP_ZERO);
709         if (!segs)
710                 return -ENOMEM;
711
712         iod->nr_data_segs = nr_segs;
713         iod->data_segs = segs;
714
715         return 0;
716 }
717
718 static void nvmet_pci_epf_free_iod(struct nvmet_pci_epf_iod *iod)
719 {
720         int i;
721
722         if (iod->data_segs) {
723                 for (i = 0; i < iod->nr_data_segs; i++)
724                         kfree(iod->data_segs[i].buf);
725                 if (iod->data_segs != &iod->data_seg)
726                         kfree(iod->data_segs);
727         }
728         if (iod->data_sgt.nents > 1)
729                 sg_free_table(&iod->data_sgt);
730         mempool_free(iod, &iod->ctrl->iod_pool);
731 }
732
733 static int nvmet_pci_epf_transfer_iod_data(struct nvmet_pci_epf_iod *iod)
734 {
735         struct nvmet_pci_epf *nvme_epf = iod->ctrl->nvme_epf;
736         struct nvmet_pci_epf_segment *seg = &iod->data_segs[0];
737         int i, ret;
738
739         /* Split the data transfer according to the PCI segments. */
740         for (i = 0; i < iod->nr_data_segs; i++, seg++) {
741                 ret = nvmet_pci_epf_transfer_seg(nvme_epf, seg, iod->dma_dir);
742                 if (ret) {
743                         iod->status = NVME_SC_DATA_XFER_ERROR | NVME_STATUS_DNR;
744                         return ret;
745                 }
746         }
747
748         return 0;
749 }
750
751 static inline u32 nvmet_pci_epf_prp_ofst(struct nvmet_pci_epf_ctrl *ctrl,
752                                          u64 prp)
753 {
754         return prp & ctrl->mps_mask;
755 }
756
757 static inline size_t nvmet_pci_epf_prp_size(struct nvmet_pci_epf_ctrl *ctrl,
758                                             u64 prp)
759 {
760         return ctrl->mps - nvmet_pci_epf_prp_ofst(ctrl, prp);
761 }
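
/*
 * Example for the two helpers above with a 4 KiB memory page size
 * (mps == 4096, mps_mask == 0xfff): for prp == 0x12345200,
 * nvmet_pci_epf_prp_ofst() returns 0x200 and nvmet_pci_epf_prp_size()
 * returns 4096 - 0x200 = 3584, i.e. the number of bytes that fit between
 * the PRP address and the end of its memory page.
 */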
762
763 /*
764  * Transfer a PRP list from the host and return the number of prps.
765  */
766 static int nvmet_pci_epf_get_prp_list(struct nvmet_pci_epf_ctrl *ctrl, u64 prp,
767                                       size_t xfer_len, __le64 *prps)
768 {
769         size_t nr_prps = (xfer_len + ctrl->mps_mask) >> ctrl->mps_shift;
770         u32 length;
771         int ret;
772
773         /*
774          * Compute the number of PRPs required for the number of bytes to
775          * transfer (xfer_len). If this number overflows the memory page size
776          * with the PRP list pointer specified, only return the entries that fit
777          * in the memory page; the last PRP there will then be a PRP list pointer
778          * to the remaining PRPs.
779          */
780         length = min(nvmet_pci_epf_prp_size(ctrl, prp), nr_prps << 3);
781         ret = nvmet_pci_epf_transfer(ctrl, prps, prp, length, DMA_FROM_DEVICE);
782         if (ret)
783                 return ret;
784
785         return length >> 3;
786 }
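
/*
 * Example for the helper above, assuming a 4 KiB page size: a 1 MiB transfer
 * needs 256 PRPs, that is, 2 KB of PRP entries. If the PRP list pointer @prp
 * is page aligned, 4096 / 8 = 512 entries fit in its page, so all 256
 * entries are fetched in one go. If @prp points at offset 0xF00 instead,
 * only (4096 - 0xF00) / 8 = 32 entries are fetched, and the caller treats
 * the last one as a pointer to the next PRP list page when more data remains.
 */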
787
788 static int nvmet_pci_epf_iod_parse_prp_list(struct nvmet_pci_epf_ctrl *ctrl,
789                                             struct nvmet_pci_epf_iod *iod)
790 {
791         struct nvme_command *cmd = &iod->cmd;
792         struct nvmet_pci_epf_segment *seg;
793         size_t size = 0, ofst, prp_size, xfer_len;
794         size_t transfer_len = iod->data_len;
795         int nr_segs, nr_prps = 0;
796         u64 pci_addr, prp;
797         int i = 0, ret;
798         __le64 *prps;
799
800         prps = kzalloc(ctrl->mps, GFP_KERNEL);
801         if (!prps)
802                 goto err_internal;
803
804         /*
805          * Allocate PCI segments for the command: this considers the worst case
806          * scenario where all prps are discontiguous, so get as many segments
807          * as we can have prps. In practice, most of the time, we will have
808          * far fewer PCI segments than prps.
809          */
810         prp = le64_to_cpu(cmd->common.dptr.prp1);
811         if (!prp)
812                 goto err_invalid_field;
813
814         ofst = nvmet_pci_epf_prp_ofst(ctrl, prp);
815         nr_segs = (transfer_len + ofst + ctrl->mps - 1) >> ctrl->mps_shift;
816
817         ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_segs);
818         if (ret)
819                 goto err_internal;
820
821         /* Set the first segment using prp1. */
822         seg = &iod->data_segs[0];
823         seg->pci_addr = prp;
824         seg->length = nvmet_pci_epf_prp_size(ctrl, prp);
825
826         size = seg->length;
827         pci_addr = prp + size;
828         nr_segs = 1;
829
830         /*
831          * Now build the PCI address segments using the PRP lists, starting
832          * from prp2.
833          */
834         prp = le64_to_cpu(cmd->common.dptr.prp2);
835         if (!prp)
836                 goto err_invalid_field;
837
838         while (size < transfer_len) {
839                 xfer_len = transfer_len - size;
840
841                 if (!nr_prps) {
842                         nr_prps = nvmet_pci_epf_get_prp_list(ctrl, prp,
843                                                              xfer_len, prps);
844                         if (nr_prps < 0)
845                                 goto err_internal;
846
847                         i = 0;
848                         ofst = 0;
849                 }
850
851                 /* Current entry */
852                 prp = le64_to_cpu(prps[i]);
853                 if (!prp)
854                         goto err_invalid_field;
855
856                 /* Did we reach the last PRP entry of the list? */
857                 if (xfer_len > ctrl->mps && i == nr_prps - 1) {
858                         /* We need more PRPs: PRP is a list pointer. */
859                         nr_prps = 0;
860                         continue;
861                 }
862
863                 /* Only the first PRP is allowed to have an offset. */
864                 if (nvmet_pci_epf_prp_ofst(ctrl, prp))
865                         goto err_invalid_offset;
866
867                 if (prp != pci_addr) {
868                         /* Discontiguous prp: new segment. */
869                         nr_segs++;
870                         if (WARN_ON_ONCE(nr_segs > iod->nr_data_segs))
871                                 goto err_internal;
872
873                         seg++;
874                         seg->pci_addr = prp;
875                         seg->length = 0;
876                         pci_addr = prp;
877                 }
878
879                 prp_size = min_t(size_t, ctrl->mps, xfer_len);
880                 seg->length += prp_size;
881                 pci_addr += prp_size;
882                 size += prp_size;
883
884                 i++;
885         }
886
887         iod->nr_data_segs = nr_segs;
888         ret = 0;
889
890         if (size != transfer_len) {
891                 dev_err(ctrl->dev,
892                         "PRPs transfer length mismatch: got %zu B, need %zu B\n",
893                         size, transfer_len);
894                 goto err_internal;
895         }
896
897         kfree(prps);
898
899         return 0;
900
901 err_invalid_offset:
902         dev_err(ctrl->dev, "PRPs list invalid offset\n");
903         iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
904         goto err;
905
906 err_invalid_field:
907         dev_err(ctrl->dev, "PRPs list invalid field\n");
908         iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
909         goto err;
910
911 err_internal:
912         dev_err(ctrl->dev, "PRPs list internal error\n");
913         iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
914
915 err:
916         kfree(prps);
917         return -EINVAL;
918 }
919
920 static int nvmet_pci_epf_iod_parse_prp_simple(struct nvmet_pci_epf_ctrl *ctrl,
921                                               struct nvmet_pci_epf_iod *iod)
922 {
923         struct nvme_command *cmd = &iod->cmd;
924         size_t transfer_len = iod->data_len;
925         int ret, nr_segs = 1;
926         u64 prp1, prp2 = 0;
927         size_t prp1_size;
928
929         prp1 = le64_to_cpu(cmd->common.dptr.prp1);
930         prp1_size = nvmet_pci_epf_prp_size(ctrl, prp1);
931
932         /* For commands crossing a page boundary, we should have prp2. */
933         if (transfer_len > prp1_size) {
934                 prp2 = le64_to_cpu(cmd->common.dptr.prp2);
935                 if (!prp2) {
936                         iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
937                         return -EINVAL;
938                 }
939                 if (nvmet_pci_epf_prp_ofst(ctrl, prp2)) {
940                         iod->status =
941                                 NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
942                         return -EINVAL;
943                 }
944                 if (prp2 != prp1 + prp1_size)
945                         nr_segs = 2;
946         }
947
948         if (nr_segs == 1) {
949                 iod->nr_data_segs = 1;
950                 iod->data_segs = &iod->data_seg;
951                 iod->data_segs[0].pci_addr = prp1;
952                 iod->data_segs[0].length = transfer_len;
953                 return 0;
954         }
955
956         ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_segs);
957         if (ret) {
958                 iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
959                 return ret;
960         }
961
962         iod->data_segs[0].pci_addr = prp1;
963         iod->data_segs[0].length = prp1_size;
964         iod->data_segs[1].pci_addr = prp2;
965         iod->data_segs[1].length = transfer_len - prp1_size;
966
967         return 0;
968 }
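
/*
 * Example for the "simple" PRP case handled above, with 4 KiB pages and an
 * 8 KiB transfer starting at a page-aligned prp1: prp1 covers the first
 * 4 KiB and prp2 must cover the second 4 KiB. If prp2 == prp1 + 4096, the
 * host buffer is contiguous and a single 8 KiB PCI segment is used;
 * otherwise two 4 KiB segments are built, one per PRP.
 */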
969
970 static int nvmet_pci_epf_iod_parse_prps(struct nvmet_pci_epf_iod *iod)
971 {
972         struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl;
973         u64 prp1 = le64_to_cpu(iod->cmd.common.dptr.prp1);
974         size_t ofst;
975
976         /* Get the PCI address segments for the command using its PRPs. */
977         ofst = nvmet_pci_epf_prp_ofst(ctrl, prp1);
978         if (ofst & 0x3) {
979                 iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
980                 return -EINVAL;
981         }
982
983         if (iod->data_len + ofst <= ctrl->mps * 2)
984                 return nvmet_pci_epf_iod_parse_prp_simple(ctrl, iod);
985
986         return nvmet_pci_epf_iod_parse_prp_list(ctrl, iod);
987 }
988
989 /*
990  * Transfer an SGL segment from the host and return the number of data
991  * descriptors and the next segment descriptor, if any.
992  */
993 static struct nvme_sgl_desc *
994 nvmet_pci_epf_get_sgl_segment(struct nvmet_pci_epf_ctrl *ctrl,
995                               struct nvme_sgl_desc *desc, unsigned int *nr_sgls)
996 {
997         struct nvme_sgl_desc *sgls;
998         u32 length = le32_to_cpu(desc->length);
999         int nr_descs, ret;
1000         void *buf;
1001
1002         buf = kmalloc(length, GFP_KERNEL);
1003         if (!buf)
1004                 return NULL;
1005
1006         ret = nvmet_pci_epf_transfer(ctrl, buf, le64_to_cpu(desc->addr), length,
1007                                      DMA_FROM_DEVICE);
1008         if (ret) {
1009                 kfree(buf);
1010                 return NULL;
1011         }
1012
1013         sgls = buf;
1014         nr_descs = length / sizeof(struct nvme_sgl_desc);
1015         if (sgls[nr_descs - 1].type == (NVME_SGL_FMT_SEG_DESC << 4) ||
1016             sgls[nr_descs - 1].type == (NVME_SGL_FMT_LAST_SEG_DESC << 4)) {
1017                 /*
1018                  * We have another SGL segment following this one: do not count
1019                  * it as a regular data SGL descriptor and return it to the
1020                  * caller.
1021                  */
1022                 *desc = sgls[nr_descs - 1];
1023                 nr_descs--;
1024         } else {
1025                 /* We do not have another SGL segment after this one. */
1026                 desc->length = 0;
1027         }
1028
1029         *nr_sgls = nr_descs;
1030
1031         return sgls;
1032 }
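
/*
 * Layout handled by the helper above: an SGL segment fetched from the host
 * is an array of 16-byte nvme_sgl_desc entries, e.g.
 *
 *	[ data desc ][ data desc ] ... [ (last) segment desc ]
 *
 * Only a trailing descriptor of type SEG_DESC or LAST_SEG_DESC chains to
 * the next segment; it is copied back into @desc instead of being counted
 * as a data descriptor. If the trailing entry is a plain data descriptor,
 * the chain ends and @desc->length is cleared.
 */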
1033
1034 static int nvmet_pci_epf_iod_parse_sgl_segments(struct nvmet_pci_epf_ctrl *ctrl,
1035                                                 struct nvmet_pci_epf_iod *iod)
1036 {
1037         struct nvme_command *cmd = &iod->cmd;
1038         struct nvme_sgl_desc seg = cmd->common.dptr.sgl;
1039         struct nvme_sgl_desc *sgls = NULL;
1040         int n = 0, i, nr_sgls;
1041         int ret;
1042
1043         /*
1044          * We do not support inline data nor keyed SGLs, so we should be seeing
1045          * only segment descriptors.
1046          */
1047         if (seg.type != (NVME_SGL_FMT_SEG_DESC << 4) &&
1048             seg.type != (NVME_SGL_FMT_LAST_SEG_DESC << 4)) {
1049                 iod->status = NVME_SC_SGL_INVALID_TYPE | NVME_STATUS_DNR;
1050                 return -EIO;
1051         }
1052
1053         while (seg.length) {
1054                 sgls = nvmet_pci_epf_get_sgl_segment(ctrl, &seg, &nr_sgls);
1055                 if (!sgls) {
1056                         iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1057                         return -EIO;
1058                 }
1059
1060                 /* Grow the PCI segment table as needed. */
1061                 ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_sgls);
1062                 if (ret) {
1063                         iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1064                         goto out;
1065                 }
1066
1067                 /*
1068                  * Parse the SGL descriptors to build the PCI segment table,
1069                  * checking the descriptor type as we go.
1070                  */
1071                 for (i = 0; i < nr_sgls; i++) {
1072                         if (sgls[i].type != (NVME_SGL_FMT_DATA_DESC << 4)) {
1073                                 iod->status = NVME_SC_SGL_INVALID_TYPE |
1074                                         NVME_STATUS_DNR;
1075                                 goto out;
1076                         }
1077                         iod->data_segs[n].pci_addr = le64_to_cpu(sgls[i].addr);
1078                         iod->data_segs[n].length = le32_to_cpu(sgls[i].length);
1079                         n++;
1080                 }
1081
1082                 kfree(sgls);
1083         }
1084
1085  out:
1086         if (iod->status != NVME_SC_SUCCESS) {
1087                 kfree(sgls);
1088                 return -EIO;
1089         }
1090
1091         return 0;
1092 }
1093
1094 static int nvmet_pci_epf_iod_parse_sgls(struct nvmet_pci_epf_iod *iod)
1095 {
1096         struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl;
1097         struct nvme_sgl_desc *sgl = &iod->cmd.common.dptr.sgl;
1098
1099         if (sgl->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
1100                 /* Single data descriptor case. */
1101                 iod->nr_data_segs = 1;
1102                 iod->data_segs = &iod->data_seg;
1103                 iod->data_seg.pci_addr = le64_to_cpu(sgl->addr);
1104                 iod->data_seg.length = le32_to_cpu(sgl->length);
1105                 return 0;
1106         }
1107
1108         return nvmet_pci_epf_iod_parse_sgl_segments(ctrl, iod);
1109 }
1110
1111 static int nvmet_pci_epf_alloc_iod_data_buf(struct nvmet_pci_epf_iod *iod)
1112 {
1113         struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl;
1114         struct nvmet_req *req = &iod->req;
1115         struct nvmet_pci_epf_segment *seg;
1116         struct scatterlist *sg;
1117         int ret, i;
1118
1119         if (iod->data_len > ctrl->mdts) {
1120                 iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1121                 return -EINVAL;
1122         }
1123
1124         /*
1125          * Get the PCI address segments for the command data buffer using either
1126          * its SGLs or PRPs.
1127          */
1128         if (iod->cmd.common.flags & NVME_CMD_SGL_ALL)
1129                 ret = nvmet_pci_epf_iod_parse_sgls(iod);
1130         else
1131                 ret = nvmet_pci_epf_iod_parse_prps(iod);
1132         if (ret)
1133                 return ret;
1134
1135         /* Get a command buffer using SGLs matching the PCI segments. */
1136         if (iod->nr_data_segs == 1) {
1137                 sg_init_table(&iod->data_sgl, 1);
1138                 iod->data_sgt.sgl = &iod->data_sgl;
1139                 iod->data_sgt.nents = 1;
1140                 iod->data_sgt.orig_nents = 1;
1141         } else {
1142                 ret = sg_alloc_table(&iod->data_sgt, iod->nr_data_segs,
1143                                      GFP_KERNEL);
1144                 if (ret)
1145                         goto err_nomem;
1146         }
1147
1148         for_each_sgtable_sg(&iod->data_sgt, sg, i) {
1149                 seg = &iod->data_segs[i];
1150                 seg->buf = kmalloc(seg->length, GFP_KERNEL);
1151                 if (!seg->buf)
1152                         goto err_nomem;
1153                 sg_set_buf(sg, seg->buf, seg->length);
1154         }
1155
1156         req->transfer_len = iod->data_len;
1157         req->sg = iod->data_sgt.sgl;
1158         req->sg_cnt = iod->data_sgt.nents;
1159
1160         return 0;
1161
1162 err_nomem:
1163         iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1164         return -ENOMEM;
1165 }
1166
1167 static void nvmet_pci_epf_complete_iod(struct nvmet_pci_epf_iod *iod)
1168 {
1169         struct nvmet_pci_epf_queue *cq = iod->cq;
1170         unsigned long flags;
1171
1172         /* Print an error message for failed commands, except AENs. */
1173         iod->status = le16_to_cpu(iod->cqe.status) >> 1;
1174         if (iod->status && iod->cmd.common.opcode != nvme_admin_async_event)
1175                 dev_err(iod->ctrl->dev,
1176                         "CQ[%d]: Command %s (0x%x) status 0x%0x\n",
1177                         iod->sq->qid, nvmet_pci_epf_iod_name(iod),
1178                         iod->cmd.common.opcode, iod->status);
1179
1180         /*
1181          * Add the command to the list of completed commands and schedule the
1182          * CQ work.
1183          */
1184         spin_lock_irqsave(&cq->lock, flags);
1185         list_add_tail(&iod->link, &cq->list);
1186         queue_delayed_work(system_highpri_wq, &cq->work, 0);
1187         spin_unlock_irqrestore(&cq->lock, flags);
1188 }
1189
1190 static void nvmet_pci_epf_drain_queue(struct nvmet_pci_epf_queue *queue)
1191 {
1192         struct nvmet_pci_epf_iod *iod;
1193         unsigned long flags;
1194
1195         spin_lock_irqsave(&queue->lock, flags);
1196         while (!list_empty(&queue->list)) {
1197                 iod = list_first_entry(&queue->list, struct nvmet_pci_epf_iod,
1198                                        link);
1199                 list_del_init(&iod->link);
1200                 nvmet_pci_epf_free_iod(iod);
1201         }
1202         spin_unlock_irqrestore(&queue->lock, flags);
1203 }
1204
1205 static int nvmet_pci_epf_add_port(struct nvmet_port *port)
1206 {
1207         mutex_lock(&nvmet_pci_epf_ports_mutex);
1208         list_add_tail(&port->entry, &nvmet_pci_epf_ports);
1209         mutex_unlock(&nvmet_pci_epf_ports_mutex);
1210         return 0;
1211 }
1212
1213 static void nvmet_pci_epf_remove_port(struct nvmet_port *port)
1214 {
1215         mutex_lock(&nvmet_pci_epf_ports_mutex);
1216         list_del_init(&port->entry);
1217         mutex_unlock(&nvmet_pci_epf_ports_mutex);
1218 }
1219
1220 static struct nvmet_port *
1221 nvmet_pci_epf_find_port(struct nvmet_pci_epf_ctrl *ctrl, __le16 portid)
1222 {
1223         struct nvmet_port *p, *port = NULL;
1224
1225         mutex_lock(&nvmet_pci_epf_ports_mutex);
1226         list_for_each_entry(p, &nvmet_pci_epf_ports, entry) {
1227                 if (p->disc_addr.portid == portid) {
1228                         port = p;
1229                         break;
1230                 }
1231         }
1232         mutex_unlock(&nvmet_pci_epf_ports_mutex);
1233
1234         return port;
1235 }
1236
1237 static void nvmet_pci_epf_queue_response(struct nvmet_req *req)
1238 {
1239         struct nvmet_pci_epf_iod *iod =
1240                 container_of(req, struct nvmet_pci_epf_iod, req);
1241
1242         iod->status = le16_to_cpu(req->cqe->status) >> 1;
1243
1244         /* If we have no data to transfer to the host, directly complete the command. */
1245         if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE) {
1246                 nvmet_pci_epf_complete_iod(iod);
1247                 return;
1248         }
1249
1250         complete(&iod->done);
1251 }
1252
1253 static u8 nvmet_pci_epf_get_mdts(const struct nvmet_ctrl *tctrl)
1254 {
1255         struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1256         int page_shift = NVME_CAP_MPSMIN(tctrl->cap) + 12;
1257
1258         return ilog2(ctrl->mdts) - page_shift;
1259 }
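
/*
 * Example for the MDTS calculation above, assuming 4 KiB pages: the default
 * ctrl->mdts is then 512 KiB and, with CAP.MPSMIN == 0, page_shift is 12 and
 * ilog2(512 KiB) is 19, so the reported MDTS value is 19 - 12 = 7,
 * i.e. 2^7 * 4 KiB = 512 KiB.
 */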
1260
1261 static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
1262                 u16 cqid, u16 flags, u16 qsize, u64 pci_addr, u16 vector)
1263 {
1264         struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1265         struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
1266         u16 status;
1267
1268         if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
1269                 return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1270
1271         if (!(flags & NVME_QUEUE_PHYS_CONTIG))
1272                 return NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR;
1273
1274         if (flags & NVME_CQ_IRQ_ENABLED)
1275                 set_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
1276
1277         cq->pci_addr = pci_addr;
1278         cq->qid = cqid;
1279         cq->depth = qsize + 1;
1280         cq->vector = vector;
1281         cq->head = 0;
1282         cq->tail = 0;
1283         cq->phase = 1;
1284         cq->db = NVME_REG_DBS + (((cqid * 2) + 1) * sizeof(u32));
1285         nvmet_pci_epf_bar_write32(ctrl, cq->db, 0);
1286
1287         if (!cqid)
1288                 cq->qes = sizeof(struct nvme_completion);
1289         else
1290                 cq->qes = ctrl->io_cqes;
1291         cq->pci_size = cq->qes * cq->depth;
1292
1293         cq->iv = nvmet_pci_epf_add_irq_vector(ctrl, vector);
1294         if (!cq->iv) {
1295                 status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1296                 goto err;
1297         }
1298
1299         status = nvmet_cq_create(tctrl, &cq->nvme_cq, cqid, cq->depth);
1300         if (status != NVME_SC_SUCCESS)
1301                 goto err;
1302
1303         dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n",
1304                 cqid, qsize, cq->qes, cq->vector);
1305
1306         return NVME_SC_SUCCESS;
1307
1308 err:
1309         clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
1310         clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags);
1311         return status;
1312 }
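
/*
 * Doorbell offset example for the queue pair with qid == 1, using the 4-byte
 * doorbell stride assumed by the computations here (CAP.DSTRD == 0): the SQ
 * tail doorbell sits at NVME_REG_DBS + 1 * 2 * 4 = 0x1008 and the CQ head
 * doorbell computed above sits at NVME_REG_DBS + (1 * 2 + 1) * 4 = 0x100c,
 * NVME_REG_DBS being 0x1000.
 */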
1313
1314 static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid)
1315 {
1316         struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1317         struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
1318
1319         if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
1320                 return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1321
1322         cancel_delayed_work_sync(&cq->work);
1323         nvmet_pci_epf_drain_queue(cq);
1324         nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
1325
1326         return NVME_SC_SUCCESS;
1327 }
1328
1329 static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
1330                 u16 sqid, u16 flags, u16 qsize, u64 pci_addr)
1331 {
1332         struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1333         struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
1334         u16 status;
1335
1336         if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
1337                 return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1338
1339         if (!(flags & NVME_QUEUE_PHYS_CONTIG))
1340                 return NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR;
1341
1342         sq->pci_addr = pci_addr;
1343         sq->qid = sqid;
1344         sq->depth = qsize + 1;
1345         sq->head = 0;
1346         sq->tail = 0;
1347         sq->phase = 0;
1348         sq->db = NVME_REG_DBS + (sqid * 2 * sizeof(u32));
1349         nvmet_pci_epf_bar_write32(ctrl, sq->db, 0);
1350         if (!sqid)
1351                 sq->qes = 1UL << NVME_ADM_SQES;
1352         else
1353                 sq->qes = ctrl->io_sqes;
1354         sq->pci_size = sq->qes * sq->depth;
1355
1356         status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth);
1357         if (status != NVME_SC_SUCCESS)
1358                 goto out_clear_bit;
1359
1360         sq->iod_wq = alloc_workqueue("sq%d_wq", WQ_UNBOUND,
1361                                 min_t(int, sq->depth, WQ_MAX_ACTIVE), sqid);
1362         if (!sq->iod_wq) {
1363                 dev_err(ctrl->dev, "Failed to create SQ %d work queue\n", sqid);
1364                 status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1365                 goto out_destroy_sq;
1366         }
1367
1368         dev_dbg(ctrl->dev, "SQ[%u]: %u entries of %zu B\n",
1369                 sqid, qsize, sq->qes);
1370
1371         return NVME_SC_SUCCESS;
1372
1373 out_destroy_sq:
1374         nvmet_sq_destroy(&sq->nvme_sq);
1375 out_clear_bit:
1376         clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags);
1377         return status;
1378 }
1379
1380 static u16 nvmet_pci_epf_delete_sq(struct nvmet_ctrl *tctrl, u16 sqid)
1381 {
1382         struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1383         struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
1384
1385         if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
1386                 return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1387
1388         flush_workqueue(sq->iod_wq);
1389         destroy_workqueue(sq->iod_wq);
1390         sq->iod_wq = NULL;
1391
1392         nvmet_pci_epf_drain_queue(sq);
1393
1394         if (sq->nvme_sq.ctrl)
1395                 nvmet_sq_destroy(&sq->nvme_sq);
1396
1397         return NVME_SC_SUCCESS;
1398 }
1399
1400 static u16 nvmet_pci_epf_get_feat(const struct nvmet_ctrl *tctrl,
1401                                   u8 feat, void *data)
1402 {
1403         struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1404         struct nvmet_feat_arbitration *arb;
1405         struct nvmet_feat_irq_coalesce *irqc;
1406         struct nvmet_feat_irq_config *irqcfg;
1407         struct nvmet_pci_epf_irq_vector *iv;
1408         u16 status;
1409
1410         switch (feat) {
1411         case NVME_FEAT_ARBITRATION:
1412                 arb = data;
1413                 if (!ctrl->sq_ab)
1414                         arb->ab = 0x7;
1415                 else
1416                         arb->ab = ilog2(ctrl->sq_ab);
1417                 return NVME_SC_SUCCESS;
1418
1419         case NVME_FEAT_IRQ_COALESCE:
1420                 irqc = data;
1421                 irqc->thr = ctrl->irq_vector_threshold;
1422                 irqc->time = 0;
1423                 return NVME_SC_SUCCESS;
1424
1425         case NVME_FEAT_IRQ_CONFIG:
1426                 irqcfg = data;
1427                 mutex_lock(&ctrl->irq_lock);
1428                 iv = nvmet_pci_epf_find_irq_vector(ctrl, irqcfg->iv);
1429                 if (iv) {
1430                         irqcfg->cd = iv->cd;
1431                         status = NVME_SC_SUCCESS;
1432                 } else {
1433                         status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1434                 }
1435                 mutex_unlock(&ctrl->irq_lock);
1436                 return status;
1437
1438         default:
1439                 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1440         }
1441 }
1442
1443 static u16 nvmet_pci_epf_set_feat(const struct nvmet_ctrl *tctrl,
1444                                   u8 feat, void *data)
1445 {
1446         struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1447         struct nvmet_feat_arbitration *arb;
1448         struct nvmet_feat_irq_coalesce *irqc;
1449         struct nvmet_feat_irq_config *irqcfg;
1450         struct nvmet_pci_epf_irq_vector *iv;
1451         u16 status;
1452
1453         switch (feat) {
1454         case NVME_FEAT_ARBITRATION:
1455                 arb = data;
1456                 if (arb->ab == 0x7)
1457                         ctrl->sq_ab = 0;
1458                 else
1459                         ctrl->sq_ab = 1 << arb->ab;
1460                 return NVME_SC_SUCCESS;
1461
1462         case NVME_FEAT_IRQ_COALESCE:
1463                 /*
1464                  * Since we do not implement precise IRQ coalescing timing,
1465                  * ignore the time field.
1466                  */
1467                 irqc = data;
1468                 ctrl->irq_vector_threshold = irqc->thr + 1;
1469                 return NVME_SC_SUCCESS;
1470
1471         case NVME_FEAT_IRQ_CONFIG:
1472                 irqcfg = data;
1473                 mutex_lock(&ctrl->irq_lock);
1474                 iv = nvmet_pci_epf_find_irq_vector(ctrl, irqcfg->iv);
1475                 if (iv) {
1476                         iv->cd = irqcfg->cd;
1477                         status = NVME_SC_SUCCESS;
1478                 } else {
1479                         status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1480                 }
1481                 mutex_unlock(&ctrl->irq_lock);
1482                 return status;
1483
1484         default:
1485                 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1486         }
1487 }
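
/*
 * Encoding examples for the features handled above: a Set Features
 * (Arbitration) command with AB == 3 sets ctrl->sq_ab to 1 << 3 = 8 commands
 * per SQ fetch round, while AB == 0x7 means "unlimited" and clears sq_ab.
 * A Set Features (IRQ Coalescing) command with THR == 7 sets
 * ctrl->irq_vector_threshold to 8 CQEs, which matches the driver default.
 */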
1488
1489 static const struct nvmet_fabrics_ops nvmet_pci_epf_fabrics_ops = {
1490         .owner          = THIS_MODULE,
1491         .type           = NVMF_TRTYPE_PCI,
1492         .add_port       = nvmet_pci_epf_add_port,
1493         .remove_port    = nvmet_pci_epf_remove_port,
1494         .queue_response = nvmet_pci_epf_queue_response,
1495         .get_mdts       = nvmet_pci_epf_get_mdts,
1496         .create_cq      = nvmet_pci_epf_create_cq,
1497         .delete_cq      = nvmet_pci_epf_delete_cq,
1498         .create_sq      = nvmet_pci_epf_create_sq,
1499         .delete_sq      = nvmet_pci_epf_delete_sq,
1500         .get_feature    = nvmet_pci_epf_get_feat,
1501         .set_feature    = nvmet_pci_epf_set_feat,
1502 };
1503
1504 static void nvmet_pci_epf_cq_work(struct work_struct *work);
1505
1506 static void nvmet_pci_epf_init_queue(struct nvmet_pci_epf_ctrl *ctrl,
1507                                      unsigned int qid, bool sq)
1508 {
1509         struct nvmet_pci_epf_queue *queue;
1510
1511         if (sq) {
1512                 queue = &ctrl->sq[qid];
1513                 set_bit(NVMET_PCI_EPF_Q_IS_SQ, &queue->flags);
1514         } else {
1515                 queue = &ctrl->cq[qid];
1516                 INIT_DELAYED_WORK(&queue->work, nvmet_pci_epf_cq_work);
1517         }
1518         queue->ctrl = ctrl;
1519         queue->qid = qid;
1520         spin_lock_init(&queue->lock);
1521         INIT_LIST_HEAD(&queue->list);
1522 }
1523
1524 static int nvmet_pci_epf_alloc_queues(struct nvmet_pci_epf_ctrl *ctrl)
1525 {
1526         unsigned int qid;
1527
1528         ctrl->sq = kcalloc(ctrl->nr_queues,
1529                            sizeof(struct nvmet_pci_epf_queue), GFP_KERNEL);
1530         if (!ctrl->sq)
1531                 return -ENOMEM;
1532
1533         ctrl->cq = kcalloc(ctrl->nr_queues,
1534                            sizeof(struct nvmet_pci_epf_queue), GFP_KERNEL);
1535         if (!ctrl->cq) {
1536                 kfree(ctrl->sq);
1537                 ctrl->sq = NULL;
1538                 return -ENOMEM;
1539         }
1540
1541         for (qid = 0; qid < ctrl->nr_queues; qid++) {
1542                 nvmet_pci_epf_init_queue(ctrl, qid, true);
1543                 nvmet_pci_epf_init_queue(ctrl, qid, false);
1544         }
1545
1546         return 0;
1547 }
1548
1549 static void nvmet_pci_epf_free_queues(struct nvmet_pci_epf_ctrl *ctrl)
1550 {
1551         kfree(ctrl->sq);
1552         ctrl->sq = NULL;
1553         kfree(ctrl->cq);
1554         ctrl->cq = NULL;
1555 }
1556
1557 static int nvmet_pci_epf_map_queue(struct nvmet_pci_epf_ctrl *ctrl,
1558                                    struct nvmet_pci_epf_queue *queue)
1559 {
1560         struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf;
1561         int ret;
1562
1563         ret = nvmet_pci_epf_mem_map(nvme_epf, queue->pci_addr,
1564                                       queue->pci_size, &queue->pci_map);
1565         if (ret) {
1566                 dev_err(ctrl->dev, "Failed to map queue %u (err=%d)\n",
1567                         queue->qid, ret);
1568                 return ret;
1569         }
1570
1571         if (queue->pci_map.pci_size < queue->pci_size) {
1572                 dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n",
1573                         queue->qid);
1574                 nvmet_pci_epf_mem_unmap(nvme_epf, &queue->pci_map);
1575                 return -ENOMEM;
1576         }
1577
1578         return 0;
1579 }
1580
1581 static inline void nvmet_pci_epf_unmap_queue(struct nvmet_pci_epf_ctrl *ctrl,
1582                                              struct nvmet_pci_epf_queue *queue)
1583 {
1584         nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &queue->pci_map);
1585 }
1586
1587 static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
1588 {
1589         struct nvmet_pci_epf_iod *iod =
1590                 container_of(work, struct nvmet_pci_epf_iod, work);
1591         struct nvmet_req *req = &iod->req;
1592         int ret;
1593
1594         if (!iod->ctrl->link_up) {
1595                 nvmet_pci_epf_free_iod(iod);
1596                 return;
1597         }
1598
1599         if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &iod->sq->flags)) {
1600                 iod->status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1601                 goto complete;
1602         }
1603
1604         if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq,
1605                             &nvmet_pci_epf_fabrics_ops))
1606                 goto complete;
1607
1608         iod->data_len = nvmet_req_transfer_len(req);
1609         if (iod->data_len) {
1610                 /*
1611                  * Get the data DMA transfer direction. Here "device" means the
1612                  * PCI root-complex host.
1613                  */
1614                 if (nvme_is_write(&iod->cmd))
1615                         iod->dma_dir = DMA_FROM_DEVICE;
1616                 else
1617                         iod->dma_dir = DMA_TO_DEVICE;
1618
1619                 /*
1620                  * Set up the command data buffer and get the command data from
1621                  * the host if needed.
1622                  */
1623                 ret = nvmet_pci_epf_alloc_iod_data_buf(iod);
1624                 if (!ret && iod->dma_dir == DMA_FROM_DEVICE)
1625                         ret = nvmet_pci_epf_transfer_iod_data(iod);
1626                 if (ret) {
1627                         nvmet_req_uninit(req);
1628                         goto complete;
1629                 }
1630         }
1631
1632         req->execute(req);
1633
1634         /*
1635          * If we do not have data to transfer after the command execution
1636          * finishes, nvmet_pci_epf_queue_response() will complete the command
1637          * directly. No need to wait for the completion in this case.
1638          */
1639         if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE)
1640                 return;
1641
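             /*
              * For a read command, wait for the backend execution to finish
              * before transferring the data to the host and posting the
              * completion entry.
              */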
1642         wait_for_completion(&iod->done);
1643
1644         if (iod->status == NVME_SC_SUCCESS) {
1645                 WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE);
1646                 nvmet_pci_epf_transfer_iod_data(iod);
1647         }
1648
1649 complete:
1650         nvmet_pci_epf_complete_iod(iod);
1651 }
1652
1653 static int nvmet_pci_epf_process_sq(struct nvmet_pci_epf_ctrl *ctrl,
1654                                     struct nvmet_pci_epf_queue *sq)
1655 {
1656         struct nvmet_pci_epf_iod *iod;
1657         int ret, n = 0;
1658
1659         sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db);
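             /*
              * Fetch commands from the SQ until it is empty or we reach the
              * arbitration burst limit (ctrl->sq_ab, 0 meaning no limit).
              */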
1660         while (sq->head != sq->tail && (!ctrl->sq_ab || n < ctrl->sq_ab)) {
1661                 iod = nvmet_pci_epf_alloc_iod(sq);
1662                 if (!iod)
1663                         break;
1664
1665                 /* Get the NVMe command submitted by the host. */
1666                 ret = nvmet_pci_epf_transfer(ctrl, &iod->cmd,
1667                                              sq->pci_addr + sq->head * sq->qes,
1668                                              sq->qes, DMA_FROM_DEVICE);
1669                 if (ret) {
1670                         /* Not much we can do... */
1671                         nvmet_pci_epf_free_iod(iod);
1672                         break;
1673                 }
1674
1675                 dev_dbg(ctrl->dev, "SQ[%u]: head %u, tail %u, command %s\n",
1676                         sq->qid, sq->head, sq->tail,
1677                         nvmet_pci_epf_iod_name(iod));
1678
1679                 sq->head++;
1680                 if (sq->head == sq->depth)
1681                         sq->head = 0;
1682                 n++;
1683
1684                 queue_work_on(WORK_CPU_UNBOUND, sq->iod_wq, &iod->work);
1685
1686                 sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db);
1687         }
1688
1689         return n;
1690 }
1691
1692 static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work)
1693 {
1694         struct nvmet_pci_epf_ctrl *ctrl =
1695                 container_of(work, struct nvmet_pci_epf_ctrl, poll_sqs.work);
1696         struct nvmet_pci_epf_queue *sq;
1697         unsigned long last = 0;
1698         int i, nr_sqs;
1699
1700         while (ctrl->link_up && ctrl->enabled) {
1701                 nr_sqs = 0;
1702                 /* Do round-robin arbitration. */
1703                 for (i = 0; i < ctrl->nr_queues; i++) {
1704                         sq = &ctrl->sq[i];
1705                         if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
1706                                 continue;
1707                         if (nvmet_pci_epf_process_sq(ctrl, sq))
1708                                 nr_sqs++;
1709                 }
1710
1711                 if (nr_sqs) {
1712                         last = jiffies;
1713                         continue;
1714                 }
1715
1716                 /*
1717                  * If we have not received any command on any queue for more
1718                  * than NVMET_PCI_EPF_SQ_POLL_IDLE, assume we are idle and
1719                  * reschedule. This avoids "burning" a CPU when the controller
1720                  * is idle for a long time.
1721                  */
1722                 if (time_is_before_jiffies(last + NVMET_PCI_EPF_SQ_POLL_IDLE))
1723                         break;
1724
1725                 cpu_relax();
1726         }
1727
1728         schedule_delayed_work(&ctrl->poll_sqs, NVMET_PCI_EPF_SQ_POLL_INTERVAL);
1729 }
1730
1731 static void nvmet_pci_epf_cq_work(struct work_struct *work)
1732 {
1733         struct nvmet_pci_epf_queue *cq =
1734                 container_of(work, struct nvmet_pci_epf_queue, work.work);
1735         struct nvmet_pci_epf_ctrl *ctrl = cq->ctrl;
1736         struct nvme_completion *cqe;
1737         struct nvmet_pci_epf_iod *iod;
1738         unsigned long flags;
1739         int ret, n = 0;
1740
1741         ret = nvmet_pci_epf_map_queue(ctrl, cq);
1742         if (ret)
1743                 goto again;
1744
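             /* Post as many pending completion entries as the CQ can accept. */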
1745         while (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) && ctrl->link_up) {
1746
1747                 /* Check that the CQ is not full. */
1748                 cq->head = nvmet_pci_epf_bar_read32(ctrl, cq->db);
1749                 if (cq->head == cq->tail + 1) {
1750                         ret = -EAGAIN;
1751                         break;
1752                 }
1753
1754                 spin_lock_irqsave(&cq->lock, flags);
1755                 iod = list_first_entry_or_null(&cq->list,
1756                                                struct nvmet_pci_epf_iod, link);
1757                 if (iod)
1758                         list_del_init(&iod->link);
1759                 spin_unlock_irqrestore(&cq->lock, flags);
1760
1761                 if (!iod)
1762                         break;
1763
1764                 /* Post the IOD completion entry. */
1765                 cqe = &iod->cqe;
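                     /*
                      * The low bit of the CQE status word carries the phase tag;
                      * the completion status code is shifted into bits 15:1.
                      */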
1766                 cqe->status = cpu_to_le16((iod->status << 1) | cq->phase);
1767
1768                 dev_dbg(ctrl->dev,
1769                         "CQ[%u]: %s status 0x%x, result 0x%llx, head %u, tail %u, phase %u\n",
1770                         cq->qid, nvmet_pci_epf_iod_name(iod), iod->status,
1771                         le64_to_cpu(cqe->result.u64), cq->head, cq->tail,
1772                         cq->phase);
1773
1774                 memcpy_toio(cq->pci_map.virt_addr + cq->tail * cq->qes,
1775                             cqe, cq->qes);
1776
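                     /* Advance the CQ tail, toggling the phase bit on wrap. */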
1777                 cq->tail++;
1778                 if (cq->tail >= cq->depth) {
1779                         cq->tail = 0;
1780                         cq->phase ^= 1;
1781                 }
1782
1783                 nvmet_pci_epf_free_iod(iod);
1784
1785                 /* Signal the host. */
1786                 nvmet_pci_epf_raise_irq(ctrl, cq, false);
1787                 n++;
1788         }
1789
1790         nvmet_pci_epf_unmap_queue(ctrl, cq);
1791
1792         /*
1793          * We do not support a precise IRQ coalescing time (100 us units, as per
1794          * the NVMe specification). So if we have posted completion entries without
1795          * reaching the interrupt coalescing threshold, raise an interrupt.
1796          */
1797         if (n)
1798                 nvmet_pci_epf_raise_irq(ctrl, cq, true);
1799
1800 again:
1801         if (ret < 0)
1802                 queue_delayed_work(system_highpri_wq, &cq->work,
1803                                    NVMET_PCI_EPF_CQ_RETRY_INTERVAL);
1804 }
1805
1806 static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
1807 {
1808         u64 pci_addr, asq, acq;
1809         u32 aqa;
1810         u16 status, qsize;
1811
1812         if (ctrl->enabled)
1813                 return 0;
1814
1815         dev_info(ctrl->dev, "Enabling controller\n");
1816
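             /* CC.MPS encodes the host memory page size as 2^(12 + MPS) bytes. */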
1817         ctrl->mps_shift = nvmet_cc_mps(ctrl->cc) + 12;
1818         ctrl->mps = 1UL << ctrl->mps_shift;
1819         ctrl->mps_mask = ctrl->mps - 1;
1820
1821         ctrl->io_sqes = 1UL << nvmet_cc_iosqes(ctrl->cc);
1822         if (ctrl->io_sqes < sizeof(struct nvme_command)) {
1823                 dev_err(ctrl->dev, "Unsupported I/O SQES %zu (need %zu)\n",
1824                         ctrl->io_sqes, sizeof(struct nvme_command));
1825                 return -EINVAL;
1826         }
1827
1828         ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc);
1829         if (ctrl->io_cqes < sizeof(struct nvme_completion)) {
1830                 dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n",
1831                         ctrl->io_cqes, sizeof(struct nvme_completion));
1832                 return -EINVAL;
1833         }
1834
1835         /* Create the admin queue. */
1836         aqa = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_AQA);
1837         asq = nvmet_pci_epf_bar_read64(ctrl, NVME_REG_ASQ);
1838         acq = nvmet_pci_epf_bar_read64(ctrl, NVME_REG_ACQ);
1839
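             /*
              * AQA bits 27:16 (ACQS) give the admin CQ size and the ACQ register
              * its PCI base address.
              */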
1840         qsize = (aqa & 0x0fff0000) >> 16;
1841         pci_addr = acq & GENMASK_ULL(63, 12);
1842         status = nvmet_pci_epf_create_cq(ctrl->tctrl, 0,
1843                                 NVME_CQ_IRQ_ENABLED | NVME_QUEUE_PHYS_CONTIG,
1844                                 qsize, pci_addr, 0);
1845         if (status != NVME_SC_SUCCESS) {
1846                 dev_err(ctrl->dev, "Failed to create admin completion queue\n");
1847                 return -EINVAL;
1848         }
1849
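             /*
              * AQA bits 11:0 (ASQS) give the admin SQ size and the ASQ register
              * its PCI base address.
              */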
1850         qsize = aqa & 0x00000fff;
1851         pci_addr = asq & GENMASK_ULL(63, 12);
1852         status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG,
1853                                          qsize, pci_addr);
1854         if (status != NVME_SC_SUCCESS) {
1855                 dev_err(ctrl->dev, "Failed to create admin submission queue\n");
1856                 nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
1857                 return -EINVAL;
1858         }
1859
1860         ctrl->sq_ab = NVMET_PCI_EPF_SQ_AB;
1861         ctrl->irq_vector_threshold = NVMET_PCI_EPF_IV_THRESHOLD;
1862         ctrl->enabled = true;
1863
1864         /* Start polling the controller SQs. */
1865         schedule_delayed_work(&ctrl->poll_sqs, 0);
1866
1867         return 0;
1868 }
1869
1870 static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
1871 {
1872         int qid;
1873
1874         if (!ctrl->enabled)
1875                 return;
1876
1877         dev_info(ctrl->dev, "Disabling controller\n");
1878
1879         ctrl->enabled = false;
1880         cancel_delayed_work_sync(&ctrl->poll_sqs);
1881
1882         /* Delete all I/O queues first. */
1883         for (qid = 1; qid < ctrl->nr_queues; qid++)
1884                 nvmet_pci_epf_delete_sq(ctrl->tctrl, qid);
1885
1886         for (qid = 1; qid < ctrl->nr_queues; qid++)
1887                 nvmet_pci_epf_delete_cq(ctrl->tctrl, qid);
1888
1889         /* Delete the admin queue last. */
1890         nvmet_pci_epf_delete_sq(ctrl->tctrl, 0);
1891         nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
1892 }
1893
1894 static void nvmet_pci_epf_poll_cc_work(struct work_struct *work)
1895 {
1896         struct nvmet_pci_epf_ctrl *ctrl =
1897                 container_of(work, struct nvmet_pci_epf_ctrl, poll_cc.work);
1898         u32 old_cc, new_cc;
1899         int ret;
1900
1901         if (!ctrl->tctrl)
1902                 return;
1903
1904         old_cc = ctrl->cc;
1905         new_cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC);
1906         ctrl->cc = new_cc;
1907
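             /*
              * Handle the controller enable, disable and shutdown transitions
              * requested by the host through the CC register.
              */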
1908         if (nvmet_cc_en(new_cc) && !nvmet_cc_en(old_cc)) {
1909                 ret = nvmet_pci_epf_enable_ctrl(ctrl);
1910                 if (ret)
1911                         return;
1912                 ctrl->csts |= NVME_CSTS_RDY;
1913         }
1914
1915         if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) {
1916                 nvmet_pci_epf_disable_ctrl(ctrl);
1917                 ctrl->csts &= ~NVME_CSTS_RDY;
1918         }
1919
1920         if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc)) {
1921                 nvmet_pci_epf_disable_ctrl(ctrl);
1922                 ctrl->csts |= NVME_CSTS_SHST_CMPLT;
1923         }
1924
1925         if (!nvmet_cc_shn(new_cc) && nvmet_cc_shn(old_cc))
1926                 ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
1927
1928         nvmet_update_cc(ctrl->tctrl, ctrl->cc);
1929         nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);
1930
1931         schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
1932 }
1933
1934 static void nvmet_pci_epf_init_bar(struct nvmet_pci_epf_ctrl *ctrl)
1935 {
1936         struct nvmet_ctrl *tctrl = ctrl->tctrl;
1937
1938         ctrl->bar = ctrl->nvme_epf->reg_bar;
1939
1940         /* Copy the target controller capabilities as a base. */
1941         ctrl->cap = tctrl->cap;
1942
1943         /* Contiguous Queues Required (CQR). */
1944         ctrl->cap |= 0x1ULL << 16;
1945
1946         /* Set the doorbell stride to 4 B (DSTRD = 0). */
1947         ctrl->cap &= ~GENMASK_ULL(35, 32);
1948
1949         /* Clear NVM Subsystem Reset Supported (NSSRS). */
1950         ctrl->cap &= ~(0x1ULL << 36);
1951
1952         /* Clear Boot Partition Support (BPS). */
1953         ctrl->cap &= ~(0x1ULL << 45);
1954
1955         /* Clear Persistent Memory Region Supported (PMRS). */
1956         ctrl->cap &= ~(0x1ULL << 56);
1957
1958         /* Clear Controller Memory Buffer Supported (CMBS). */
1959         ctrl->cap &= ~(0x1ULL << 57);
1960
1961         /* Controller configuration. */
1962         ctrl->cc = tctrl->cc & (~NVME_CC_ENABLE);
1963
1964         /* Controller status. */
1965         ctrl->csts = ctrl->tctrl->csts;
1966
1967         nvmet_pci_epf_bar_write64(ctrl, NVME_REG_CAP, ctrl->cap);
1968         nvmet_pci_epf_bar_write32(ctrl, NVME_REG_VS, tctrl->subsys->ver);
1969         nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);
1970         nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CC, ctrl->cc);
1971 }
1972
1973 static int nvmet_pci_epf_create_ctrl(struct nvmet_pci_epf *nvme_epf,
1974                                      unsigned int max_nr_queues)
1975 {
1976         struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
1977         struct nvmet_alloc_ctrl_args args = {};
1978         char hostnqn[NVMF_NQN_SIZE];
1979         uuid_t id;
1980         int ret;
1981
1982         memset(ctrl, 0, sizeof(*ctrl));
1983         ctrl->dev = &nvme_epf->epf->dev;
1984         mutex_init(&ctrl->irq_lock);
1985         ctrl->nvme_epf = nvme_epf;
1986         ctrl->mdts = nvme_epf->mdts_kb * SZ_1K;
1987         INIT_DELAYED_WORK(&ctrl->poll_cc, nvmet_pci_epf_poll_cc_work);
1988         INIT_DELAYED_WORK(&ctrl->poll_sqs, nvmet_pci_epf_poll_sqs_work);
1989
1990         ret = mempool_init_kmalloc_pool(&ctrl->iod_pool,
1991                                         max_nr_queues * NVMET_MAX_QUEUE_SIZE,
1992                                         sizeof(struct nvmet_pci_epf_iod));
1993         if (ret) {
1994                 dev_err(ctrl->dev, "Failed to initialize IOD mempool\n");
1995                 return ret;
1996         }
1997
1998         ctrl->port = nvmet_pci_epf_find_port(ctrl, nvme_epf->portid);
1999         if (!ctrl->port) {
2000                 dev_err(ctrl->dev, "Port not found\n");
2001                 ret = -EINVAL;
2002                 goto out_mempool_exit;
2003         }
2004
2005         /* Create the target controller. */
2006         uuid_gen(&id);
2007         snprintf(hostnqn, NVMF_NQN_SIZE,
2008                  "nqn.2014-08.org.nvmexpress:uuid:%pUb", &id);
2009         args.port = ctrl->port;
2010         args.subsysnqn = nvme_epf->subsysnqn;
2011         memset(&id, 0, sizeof(uuid_t));
2012         args.hostid = &id;
2013         args.hostnqn = hostnqn;
2014         args.ops = &nvmet_pci_epf_fabrics_ops;
2015
2016         ctrl->tctrl = nvmet_alloc_ctrl(&args);
2017         if (!ctrl->tctrl) {
2018                 dev_err(ctrl->dev, "Failed to create target controller\n");
2019                 ret = -ENOMEM;
2020                 goto out_mempool_exit;
2021         }
2022         ctrl->tctrl->drvdata = ctrl;
2023
2024         /* We do not support protection information for now. */
2025         if (ctrl->tctrl->pi_support) {
2026                 dev_err(ctrl->dev,
2027                         "Protection information (PI) is not supported\n");
2028                 ret = -ENOTSUPP;
2029                 goto out_put_ctrl;
2030         }
2031
2032         /* Allocate our queues, up to the maximum number. */
2033         ctrl->nr_queues = min(ctrl->tctrl->subsys->max_qid + 1, max_nr_queues);
2034         ret = nvmet_pci_epf_alloc_queues(ctrl);
2035         if (ret)
2036                 goto out_put_ctrl;
2037
2038         /*
2039          * Allocate the IRQ vector descriptors. We cannot have more vectors
2040          * than the maximum number of queues.
2041          */
2042         ret = nvmet_pci_epf_alloc_irq_vectors(ctrl);
2043         if (ret)
2044                 goto out_free_queues;
2045
2046         dev_info(ctrl->dev,
2047                  "New PCI ctrl \"%s\", %u I/O queues, mdts %u B\n",
2048                  ctrl->tctrl->subsys->subsysnqn, ctrl->nr_queues - 1,
2049                  ctrl->mdts);
2050
2051         /* Initialize BAR 0 using the target controller CAP. */
2052         nvmet_pci_epf_init_bar(ctrl);
2053
2054         return 0;
2055
2056 out_free_queues:
2057         nvmet_pci_epf_free_queues(ctrl);
2058 out_put_ctrl:
2059         nvmet_ctrl_put(ctrl->tctrl);
2060         ctrl->tctrl = NULL;
2061 out_mempool_exit:
2062         mempool_exit(&ctrl->iod_pool);
2063         return ret;
2064 }
2065
2066 static void nvmet_pci_epf_start_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
2067 {
2068         schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
2069 }
2070
2071 static void nvmet_pci_epf_stop_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
2072 {
2073         cancel_delayed_work_sync(&ctrl->poll_cc);
2074
2075         nvmet_pci_epf_disable_ctrl(ctrl);
2076 }
2077
2078 static void nvmet_pci_epf_destroy_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
2079 {
2080         if (!ctrl->tctrl)
2081                 return;
2082
2083         dev_info(ctrl->dev, "Destroying PCI ctrl \"%s\"\n",
2084                  ctrl->tctrl->subsys->subsysnqn);
2085
2086         nvmet_pci_epf_stop_ctrl(ctrl);
2087
2088         nvmet_pci_epf_free_queues(ctrl);
2089         nvmet_pci_epf_free_irq_vectors(ctrl);
2090
2091         nvmet_ctrl_put(ctrl->tctrl);
2092         ctrl->tctrl = NULL;
2093
2094         mempool_exit(&ctrl->iod_pool);
2095 }
2096
2097 static int nvmet_pci_epf_configure_bar(struct nvmet_pci_epf *nvme_epf)
2098 {
2099         struct pci_epf *epf = nvme_epf->epf;
2100         const struct pci_epc_features *epc_features = nvme_epf->epc_features;
2101         size_t reg_size, reg_bar_size;
2102         size_t msix_table_size = 0;
2103
2104         /*
2105          * The first free BAR will be our register BAR and, per the NVMe
2106          * specification, it must be BAR 0.
2107          */
2108         if (pci_epc_get_first_free_bar(epc_features) != BAR_0) {
2109                 dev_err(&epf->dev, "BAR 0 is not free\n");
2110                 return -ENODEV;
2111         }
2112
2113         if (epc_features->bar[BAR_0].only_64bit)
2114                 epf->bar[BAR_0].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;
2115
2116         /*
2117          * Calculate the size of the register bar: NVMe registers first with
2118          * enough space for the doorbells, followed by the MSI-X table
2119          * if supported.
2120          */
2121         reg_size = NVME_REG_DBS + (NVMET_NR_QUEUES * 2 * sizeof(u32));
2122         reg_size = ALIGN(reg_size, 8);
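             /*
              * As an illustration, assuming NVMET_NR_QUEUES is 128 and
              * NVME_REG_DBS is 0x1000, this is 4 KB of registers plus 1 KB of
              * doorbells (one 4 B SQ/CQ doorbell pair per queue).
              */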
2123
2124         if (epc_features->msix_capable) {
2125                 size_t pba_size;
2126
2127                 msix_table_size = PCI_MSIX_ENTRY_SIZE * epf->msix_interrupts;
2128                 nvme_epf->msix_table_offset = reg_size;
2129                 pba_size = ALIGN(DIV_ROUND_UP(epf->msix_interrupts, 8), 8);
2130
2131                 reg_size += msix_table_size + pba_size;
2132         }
2133
2134         if (epc_features->bar[BAR_0].type == BAR_FIXED) {
2135                 if (reg_size > epc_features->bar[BAR_0].fixed_size) {
2136                         dev_err(&epf->dev,
2137                                 "BAR 0 size %llu B too small, need %zu B\n",
2138                                 epc_features->bar[BAR_0].fixed_size,
2139                                 reg_size);
2140                         return -ENOMEM;
2141                 }
2142                 reg_bar_size = epc_features->bar[BAR_0].fixed_size;
2143         } else {
2144                 reg_bar_size = ALIGN(reg_size, max(epc_features->align, 4096));
2145         }
2146
2147         nvme_epf->reg_bar = pci_epf_alloc_space(epf, reg_bar_size, BAR_0,
2148                                                 epc_features, PRIMARY_INTERFACE);
2149         if (!nvme_epf->reg_bar) {
2150                 dev_err(&epf->dev, "Failed to allocate BAR 0\n");
2151                 return -ENOMEM;
2152         }
2153         memset(nvme_epf->reg_bar, 0, reg_bar_size);
2154
2155         return 0;
2156 }
2157
2158 static void nvmet_pci_epf_free_bar(struct nvmet_pci_epf *nvme_epf)
2159 {
2160         struct pci_epf *epf = nvme_epf->epf;
2161
2162         if (!nvme_epf->reg_bar)
2163                 return;
2164
2165         pci_epf_free_space(epf, nvme_epf->reg_bar, BAR_0, PRIMARY_INTERFACE);
2166         nvme_epf->reg_bar = NULL;
2167 }
2168
2169 static void nvmet_pci_epf_clear_bar(struct nvmet_pci_epf *nvme_epf)
2170 {
2171         struct pci_epf *epf = nvme_epf->epf;
2172
2173         pci_epc_clear_bar(epf->epc, epf->func_no, epf->vfunc_no,
2174                           &epf->bar[BAR_0]);
2175 }
2176
2177 static int nvmet_pci_epf_init_irq(struct nvmet_pci_epf *nvme_epf)
2178 {
2179         const struct pci_epc_features *epc_features = nvme_epf->epc_features;
2180         struct pci_epf *epf = nvme_epf->epf;
2181         int ret;
2182
2183         /* Enable MSI-X if supported, otherwise use MSI. */
2184         if (epc_features->msix_capable && epf->msix_interrupts) {
2185                 ret = pci_epc_set_msix(epf->epc, epf->func_no, epf->vfunc_no,
2186                                        epf->msix_interrupts, BAR_0,
2187                                        nvme_epf->msix_table_offset);
2188                 if (ret) {
2189                         dev_err(&epf->dev, "Failed to configure MSI-X\n");
2190                         return ret;
2191                 }
2192
2193                 nvme_epf->nr_vectors = epf->msix_interrupts;
2194                 nvme_epf->irq_type = PCI_IRQ_MSIX;
2195
2196                 return 0;
2197         }
2198
2199         if (epc_features->msi_capable && epf->msi_interrupts) {
2200                 ret = pci_epc_set_msi(epf->epc, epf->func_no, epf->vfunc_no,
2201                                       epf->msi_interrupts);
2202                 if (ret) {
2203                         dev_err(&epf->dev, "Failed to configure MSI\n");
2204                         return ret;
2205                 }
2206
2207                 nvme_epf->nr_vectors = epf->msi_interrupts;
2208                 nvme_epf->irq_type = PCI_IRQ_MSI;
2209
2210                 return 0;
2211         }
2212
2213         /* MSI and MSI-X are not supported: fall back to INTx. */
2214         nvme_epf->nr_vectors = 1;
2215         nvme_epf->irq_type = PCI_IRQ_INTX;
2216
2217         return 0;
2218 }
2219
2220 static int nvmet_pci_epf_epc_init(struct pci_epf *epf)
2221 {
2222         struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
2223         const struct pci_epc_features *epc_features = nvme_epf->epc_features;
2224         struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
2225         unsigned int max_nr_queues = NVMET_NR_QUEUES;
2226         int ret;
2227
2228         /* For now, do not support virtual functions. */
2229         if (epf->vfunc_no > 0) {
2230                 dev_err(&epf->dev, "Virtual functions are not supported\n");
2231                 return -EINVAL;
2232         }
2233
2234         /*
2235          * Cap the maximum number of queues the controller can support to
2236          * the number of IRQ vectors we can use.
2237          */
2238         if (epc_features->msix_capable && epf->msix_interrupts) {
2239                 dev_info(&epf->dev,
2240                          "PCI endpoint controller supports MSI-X, %u vectors\n",
2241                          epf->msix_interrupts);
2242                 max_nr_queues = min(max_nr_queues, epf->msix_interrupts);
2243         } else if (epc_features->msi_capable && epf->msi_interrupts) {
2244                 dev_info(&epf->dev,
2245                          "PCI endpoint controller supports MSI, %u vectors\n",
2246                          epf->msi_interrupts);
2247                 max_nr_queues = min(max_nr_queues, epf->msi_interrupts);
2248         }
2249
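             /* At least the admin queue and one I/O queue are needed. */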
2250         if (max_nr_queues < 2) {
2251                 dev_err(&epf->dev, "Invalid maximum number of queues %u\n",
2252                         max_nr_queues);
2253                 return -EINVAL;
2254         }
2255
2256         /* Create the target controller. */
2257         ret = nvmet_pci_epf_create_ctrl(nvme_epf, max_nr_queues);
2258         if (ret) {
2259                 dev_err(&epf->dev,
2260                         "Failed to create NVMe PCI target controller (err=%d)\n",
2261                         ret);
2262                 return ret;
2263         }
2264
2265         /* Set device ID, class, etc. */
2266         epf->header->vendorid = ctrl->tctrl->subsys->vendor_id;
2267         epf->header->subsys_vendor_id = ctrl->tctrl->subsys->subsys_vendor_id;
2268         ret = pci_epc_write_header(epf->epc, epf->func_no, epf->vfunc_no,
2269                                    epf->header);
2270         if (ret) {
2271                 dev_err(&epf->dev,
2272                         "Failed to write configuration header (err=%d)\n", ret);
2273                 goto out_destroy_ctrl;
2274         }
2275
2276         ret = pci_epc_set_bar(epf->epc, epf->func_no, epf->vfunc_no,
2277                               &epf->bar[BAR_0]);
2278         if (ret) {
2279                 dev_err(&epf->dev, "Failed to set BAR 0 (err=%d)\n", ret);
2280                 goto out_destroy_ctrl;
2281         }
2282
2283         /*
2284          * Enable interrupts and, if the endpoint controller does not provide a
2285          * link-up notifier, start polling the controller BAR right away.
2286          */
2287         ret = nvmet_pci_epf_init_irq(nvme_epf);
2288         if (ret)
2289                 goto out_clear_bar;
2290
2291         if (!epc_features->linkup_notifier) {
2292                 ctrl->link_up = true;
2293                 nvmet_pci_epf_start_ctrl(&nvme_epf->ctrl);
2294         }
2295
2296         return 0;
2297
2298 out_clear_bar:
2299         nvmet_pci_epf_clear_bar(nvme_epf);
2300 out_destroy_ctrl:
2301         nvmet_pci_epf_destroy_ctrl(&nvme_epf->ctrl);
2302         return ret;
2303 }
2304
2305 static void nvmet_pci_epf_epc_deinit(struct pci_epf *epf)
2306 {
2307         struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
2308         struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
2309
2310         ctrl->link_up = false;
2311         nvmet_pci_epf_destroy_ctrl(ctrl);
2312
2313         nvmet_pci_epf_deinit_dma(nvme_epf);
2314         nvmet_pci_epf_clear_bar(nvme_epf);
2315 }
2316
2317 static int nvmet_pci_epf_link_up(struct pci_epf *epf)
2318 {
2319         struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
2320         struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
2321
2322         ctrl->link_up = true;
2323         nvmet_pci_epf_start_ctrl(ctrl);
2324
2325         return 0;
2326 }
2327
2328 static int nvmet_pci_epf_link_down(struct pci_epf *epf)
2329 {
2330         struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
2331         struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
2332
2333         ctrl->link_up = false;
2334         nvmet_pci_epf_stop_ctrl(ctrl);
2335
2336         return 0;
2337 }
2338
2339 static const struct pci_epc_event_ops nvmet_pci_epf_event_ops = {
2340         .epc_init = nvmet_pci_epf_epc_init,
2341         .epc_deinit = nvmet_pci_epf_epc_deinit,
2342         .link_up = nvmet_pci_epf_link_up,
2343         .link_down = nvmet_pci_epf_link_down,
2344 };
2345
2346 static int nvmet_pci_epf_bind(struct pci_epf *epf)
2347 {
2348         struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
2349         const struct pci_epc_features *epc_features;
2350         struct pci_epc *epc = epf->epc;
2351         int ret;
2352
2353         if (WARN_ON_ONCE(!epc))
2354                 return -EINVAL;
2355
2356         epc_features = pci_epc_get_features(epc, epf->func_no, epf->vfunc_no);
2357         if (!epc_features) {
2358                 dev_err(&epf->dev, "epc_features not implemented\n");
2359                 return -EOPNOTSUPP;
2360         }
2361         nvme_epf->epc_features = epc_features;
2362
2363         ret = nvmet_pci_epf_configure_bar(nvme_epf);
2364         if (ret)
2365                 return ret;
2366
2367         nvmet_pci_epf_init_dma(nvme_epf);
2368
2369         return 0;
2370 }
2371
2372 static void nvmet_pci_epf_unbind(struct pci_epf *epf)
2373 {
2374         struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
2375         struct pci_epc *epc = epf->epc;
2376
2377         nvmet_pci_epf_destroy_ctrl(&nvme_epf->ctrl);
2378
2379         if (epc->init_complete) {
2380                 nvmet_pci_epf_deinit_dma(nvme_epf);
2381                 nvmet_pci_epf_clear_bar(nvme_epf);
2382         }
2383
2384         nvmet_pci_epf_free_bar(nvme_epf);
2385 }
2386
2387 static struct pci_epf_header nvme_epf_pci_header = {
2388         .vendorid       = PCI_ANY_ID,
2389         .deviceid       = PCI_ANY_ID,
2390         .progif_code    = 0x02, /* NVM Express */
2391         .baseclass_code = PCI_BASE_CLASS_STORAGE,
2392         .subclass_code  = 0x08, /* Non-Volatile Memory controller */
2393         .interrupt_pin  = PCI_INTERRUPT_INTA,
2394 };
2395
2396 static int nvmet_pci_epf_probe(struct pci_epf *epf,
2397                                const struct pci_epf_device_id *id)
2398 {
2399         struct nvmet_pci_epf *nvme_epf;
2400         int ret;
2401
2402         nvme_epf = devm_kzalloc(&epf->dev, sizeof(*nvme_epf), GFP_KERNEL);
2403         if (!nvme_epf)
2404                 return -ENOMEM;
2405
2406         ret = devm_mutex_init(&epf->dev, &nvme_epf->mmio_lock);
2407         if (ret)
2408                 return ret;
2409
2410         nvme_epf->epf = epf;
2411         nvme_epf->mdts_kb = NVMET_PCI_EPF_MDTS_KB;
2412
2413         epf->event_ops = &nvmet_pci_epf_event_ops;
2414         epf->header = &nvme_epf_pci_header;
2415         epf_set_drvdata(epf, nvme_epf);
2416
2417         return 0;
2418 }
2419
2420 #define to_nvme_epf(epf_group)  \
2421         container_of(epf_group, struct nvmet_pci_epf, group)
2422
2423 static ssize_t nvmet_pci_epf_portid_show(struct config_item *item, char *page)
2424 {
2425         struct config_group *group = to_config_group(item);
2426         struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
2427
2428         return sysfs_emit(page, "%u\n", le16_to_cpu(nvme_epf->portid));
2429 }
2430
2431 static ssize_t nvmet_pci_epf_portid_store(struct config_item *item,
2432                                           const char *page, size_t len)
2433 {
2434         struct config_group *group = to_config_group(item);
2435         struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
2436         u16 portid;
2437
2438         /* Do not allow setting this when the function is already started. */
2439         if (nvme_epf->ctrl.tctrl)
2440                 return -EBUSY;
2441
2442         if (!len)
2443                 return -EINVAL;
2444
2445         if (kstrtou16(page, 0, &portid))
2446                 return -EINVAL;
2447
2448         nvme_epf->portid = cpu_to_le16(portid);
2449
2450         return len;
2451 }
2452
2453 CONFIGFS_ATTR(nvmet_pci_epf_, portid);
2454
2455 static ssize_t nvmet_pci_epf_subsysnqn_show(struct config_item *item,
2456                                             char *page)
2457 {
2458         struct config_group *group = to_config_group(item);
2459         struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
2460
2461         return sysfs_emit(page, "%s\n", nvme_epf->subsysnqn);
2462 }
2463
2464 static ssize_t nvmet_pci_epf_subsysnqn_store(struct config_item *item,
2465                                              const char *page, size_t len)
2466 {
2467         struct config_group *group = to_config_group(item);
2468         struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
2469
2470         /* Do not allow setting this when the function is already started. */
2471         if (nvme_epf->ctrl.tctrl)
2472                 return -EBUSY;
2473
2474         if (!len)
2475                 return -EINVAL;
2476
2477         strscpy(nvme_epf->subsysnqn, page, len);
2478
2479         return len;
2480 }
2481
2482 CONFIGFS_ATTR(nvmet_pci_epf_, subsysnqn);
2483
2484 static ssize_t nvmet_pci_epf_mdts_kb_show(struct config_item *item, char *page)
2485 {
2486         struct config_group *group = to_config_group(item);
2487         struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
2488
2489         return sysfs_emit(page, "%u\n", nvme_epf->mdts_kb);
2490 }
2491
2492 static ssize_t nvmet_pci_epf_mdts_kb_store(struct config_item *item,
2493                                            const char *page, size_t len)
2494 {
2495         struct config_group *group = to_config_group(item);
2496         struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
2497         unsigned long mdts_kb;
2498         int ret;
2499
2500         if (nvme_epf->ctrl.tctrl)
2501                 return -EBUSY;
2502
2503         ret = kstrtoul(page, 0, &mdts_kb);
2504         if (ret)
2505                 return ret;
2506         if (!mdts_kb)
2507                 mdts_kb = NVMET_PCI_EPF_MDTS_KB;
2508         else if (mdts_kb > NVMET_PCI_EPF_MAX_MDTS_KB)
2509                 mdts_kb = NVMET_PCI_EPF_MAX_MDTS_KB;
2510
2511         if (!is_power_of_2(mdts_kb))
2512                 return -EINVAL;
2513
2514         nvme_epf->mdts_kb = mdts_kb;
2515
2516         return len;
2517 }
2518
2519 CONFIGFS_ATTR(nvmet_pci_epf_, mdts_kb);
2520
2521 static struct configfs_attribute *nvmet_pci_epf_attrs[] = {
2522         &nvmet_pci_epf_attr_portid,
2523         &nvmet_pci_epf_attr_subsysnqn,
2524         &nvmet_pci_epf_attr_mdts_kb,
2525         NULL,
2526 };
2527
2528 static const struct config_item_type nvmet_pci_epf_group_type = {
2529         .ct_attrs       = nvmet_pci_epf_attrs,
2530         .ct_owner       = THIS_MODULE,
2531 };
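/*
 * The attributes above appear in an "nvme" subdirectory of the endpoint
 * function directory in configfs. A minimal configuration sketch, assuming the
 * standard pci_ep configfs layout and an already configured nvmet PCI port and
 * subsystem (instance and subsystem names below are only examples):
 *
 *   cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf
 *   mkdir nvmet_pci_epf.0
 *   echo 1 > nvmet_pci_epf.0/nvme/portid
 *   echo nqn.2014-08.org.nvmexpress:testsubsys > nvmet_pci_epf.0/nvme/subsysnqn
 */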
2532
2533 static struct config_group *nvmet_pci_epf_add_cfs(struct pci_epf *epf,
2534                                                   struct config_group *group)
2535 {
2536         struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
2537
2538         config_group_init_type_name(&nvme_epf->group, "nvme",
2539                                     &nvmet_pci_epf_group_type);
2540
2541         return &nvme_epf->group;
2542 }
2543
2544 static const struct pci_epf_device_id nvmet_pci_epf_ids[] = {
2545         { .name = "nvmet_pci_epf" },
2546         {},
2547 };
2548
2549 static struct pci_epf_ops nvmet_pci_epf_ops = {
2550         .bind   = nvmet_pci_epf_bind,
2551         .unbind = nvmet_pci_epf_unbind,
2552         .add_cfs = nvmet_pci_epf_add_cfs,
2553 };
2554
2555 static struct pci_epf_driver nvmet_pci_epf_driver = {
2556         .driver.name    = "nvmet_pci_epf",
2557         .probe          = nvmet_pci_epf_probe,
2558         .id_table       = nvmet_pci_epf_ids,
2559         .ops            = &nvmet_pci_epf_ops,
2560         .owner          = THIS_MODULE,
2561 };
2562
2563 static int __init nvmet_pci_epf_init_module(void)
2564 {
2565         int ret;
2566
2567         ret = pci_epf_register_driver(&nvmet_pci_epf_driver);
2568         if (ret)
2569                 return ret;
2570
2571         ret = nvmet_register_transport(&nvmet_pci_epf_fabrics_ops);
2572         if (ret) {
2573                 pci_epf_unregister_driver(&nvmet_pci_epf_driver);
2574                 return ret;
2575         }
2576
2577         return 0;
2578 }
2579
2580 static void __exit nvmet_pci_epf_cleanup_module(void)
2581 {
2582         nvmet_unregister_transport(&nvmet_pci_epf_fabrics_ops);
2583         pci_epf_unregister_driver(&nvmet_pci_epf_driver);
2584 }
2585
2586 module_init(nvmet_pci_epf_init_module);
2587 module_exit(nvmet_pci_epf_cleanup_module);
2588
2589 MODULE_DESCRIPTION("NVMe PCI Endpoint Function target driver");
2590 MODULE_AUTHOR("Damien Le Moal <[email protected]>");
2591 MODULE_LICENSE("GPL");