[linux.git] drivers/vfio/pci/mlx5/main.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21
22 #include "cmd.h"
23
24 /* Device specification max LOAD size */
25 #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
26
27 #define MAX_CHUNK_SIZE SZ_8M
28
29 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
30 {
31         struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
32
33         return container_of(core_device, struct mlx5vf_pci_core_device,
34                             core_device);
35 }
36
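/*
 * Translate an offset within the migration data into the backing page.
 * Accesses are expected to be sequential, so the last scatterlist entry
 * and offset are cached to avoid rescanning the table from the start.
 */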
37 struct page *
38 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
39                           unsigned long offset)
40 {
41         unsigned long cur_offset = 0;
42         struct scatterlist *sg;
43         unsigned int i;
44
45         /* All accesses are sequential */
46         if (offset < buf->last_offset || !buf->last_offset_sg) {
47                 buf->last_offset = 0;
48                 buf->last_offset_sg = buf->table.sgt.sgl;
49                 buf->sg_last_entry = 0;
50         }
51
52         cur_offset = buf->last_offset;
53
54         for_each_sg(buf->last_offset_sg, sg,
55                         buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
56                 if (offset < sg->length + cur_offset) {
57                         buf->last_offset_sg = sg;
58                         buf->sg_last_entry += i;
59                         buf->last_offset = cur_offset;
60                         return nth_page(sg_page(sg),
61                                         (offset - cur_offset) / PAGE_SIZE);
62                 }
63                 cur_offset += sg->length;
64         }
65         return NULL;
66 }
67
68 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
69 {
70         mutex_lock(&migf->lock);
71         migf->state = MLX5_MIGF_STATE_ERROR;
72         migf->filp->f_pos = 0;
73         mutex_unlock(&migf->lock);
74 }
75
76 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
77 {
78         struct mlx5_vf_migration_file *migf = filp->private_data;
79
80         mlx5vf_disable_fd(migf);
81         mutex_destroy(&migf->lock);
82         kfree(migf);
83         return 0;
84 }
85
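/*
 * Return the queued data buffer that covers @pos, or NULL. As the FD is
 * stream based, the data must live in the first buffer on the list; any
 * other layout is a protocol error and moves the file to the error state.
 * *end_of_data is set when no buffers are queued at all.
 */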
86 static struct mlx5_vhca_data_buffer *
87 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
88                               bool *end_of_data)
89 {
90         struct mlx5_vhca_data_buffer *buf;
91         bool found = false;
92
93         *end_of_data = false;
94         spin_lock_irq(&migf->list_lock);
95         if (list_empty(&migf->buf_list)) {
96                 *end_of_data = true;
97                 goto end;
98         }
99
100         buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
101                                buf_elm);
102         if (pos >= buf->start_pos &&
103             pos < buf->start_pos + buf->length) {
104                 found = true;
105                 goto end;
106         }
107
108         /*
109          * Since this is a stream-based FD, the data is always expected to be
110          * in the first chunk.
111          */
112         migf->state = MLX5_MIGF_STATE_ERROR;
113
114 end:
115         spin_unlock_irq(&migf->list_lock);
116         return found ? buf : NULL;
117 }
118
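/*
 * Called once a buffer has been fully consumed by read(). Stop-copy chunk
 * buffers are parked back in their per-chunk slot and, if the firmware has
 * already reported the size of the next chunk, a save work item is queued
 * for it. All other buffers are simply recycled onto the available list.
 */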
119 static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
120 {
121         struct mlx5_vf_migration_file *migf = vhca_buf->migf;
122
123         if (vhca_buf->stop_copy_chunk_num) {
124                 bool is_header = vhca_buf->dma_dir == DMA_NONE;
125                 u8 chunk_num = vhca_buf->stop_copy_chunk_num;
126                 size_t next_required_umem_size = 0;
127
128                 if (is_header)
129                         migf->buf_header[chunk_num - 1] = vhca_buf;
130                 else
131                         migf->buf[chunk_num - 1] = vhca_buf;
132
133                 spin_lock_irq(&migf->list_lock);
134                 list_del_init(&vhca_buf->buf_elm);
135                 if (!is_header) {
136                         next_required_umem_size =
137                                 migf->next_required_umem_size;
138                         migf->next_required_umem_size = 0;
139                         migf->num_ready_chunks--;
140                 }
141                 spin_unlock_irq(&migf->list_lock);
142                 if (next_required_umem_size)
143                         mlx5vf_mig_file_set_save_work(migf, chunk_num,
144                                                       next_required_umem_size);
145                 return;
146         }
147
148         spin_lock_irq(&migf->list_lock);
149         list_del_init(&vhca_buf->buf_elm);
150         list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
151         spin_unlock_irq(&migf->list_lock);
152 }
153
154 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
155                                char __user **buf, size_t *len, loff_t *pos)
156 {
157         unsigned long offset;
158         ssize_t done = 0;
159         size_t copy_len;
160
161         copy_len = min_t(size_t,
162                          vhca_buf->start_pos + vhca_buf->length - *pos, *len);
163         while (copy_len) {
164                 size_t page_offset;
165                 struct page *page;
166                 size_t page_len;
167                 u8 *from_buff;
168                 int ret;
169
170                 offset = *pos - vhca_buf->start_pos;
171                 page_offset = offset % PAGE_SIZE;
172                 offset -= page_offset;
173                 page = mlx5vf_get_migration_page(vhca_buf, offset);
174                 if (!page)
175                         return -EINVAL;
176                 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
177                 from_buff = kmap_local_page(page);
178                 ret = copy_to_user(*buf, from_buff + page_offset, page_len);
179                 kunmap_local(from_buff);
180                 if (ret)
181                         return -EFAULT;
182                 *pos += page_len;
183                 *len -= page_len;
184                 *buf += page_len;
185                 done += page_len;
186                 copy_len -= page_len;
187         }
188
189         if (*pos >= vhca_buf->start_pos + vhca_buf->length)
190                 mlx5vf_buf_read_done(vhca_buf);
191
192         return done;
193 }
194
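/*
 * read() handler of the save FD. Unless O_NONBLOCK was requested, wait for
 * data (or for a terminal state) and then copy as many queued buffers to
 * user space as @len allows. Running out of data while in PRE_COPY returns
 * -ENOMSG to mark a temporary end of stream.
 */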
195 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
196                                loff_t *pos)
197 {
198         struct mlx5_vf_migration_file *migf = filp->private_data;
199         struct mlx5_vhca_data_buffer *vhca_buf;
200         bool first_loop_call = true;
201         bool end_of_data;
202         ssize_t done = 0;
203
204         if (pos)
205                 return -ESPIPE;
206         pos = &filp->f_pos;
207
208         if (!(filp->f_flags & O_NONBLOCK)) {
209                 if (wait_event_interruptible(migf->poll_wait,
210                                 !list_empty(&migf->buf_list) ||
211                                 migf->state == MLX5_MIGF_STATE_ERROR ||
212                                 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
213                                 migf->state == MLX5_MIGF_STATE_PRE_COPY ||
214                                 migf->state == MLX5_MIGF_STATE_COMPLETE))
215                         return -ERESTARTSYS;
216         }
217
218         mutex_lock(&migf->lock);
219         if (migf->state == MLX5_MIGF_STATE_ERROR) {
220                 done = -ENODEV;
221                 goto out_unlock;
222         }
223
224         while (len) {
225                 ssize_t count;
226
227                 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
228                                                          &end_of_data);
229                 if (first_loop_call) {
230                         first_loop_call = false;
231                         /* Temporary end of file as part of PRE_COPY */
232                         if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
233                                 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
234                                 done = -ENOMSG;
235                                 goto out_unlock;
236                         }
237
238                         if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
239                                 if (filp->f_flags & O_NONBLOCK) {
240                                         done = -EAGAIN;
241                                         goto out_unlock;
242                                 }
243                         }
244                 }
245
246                 if (end_of_data)
247                         goto out_unlock;
248
249                 if (!vhca_buf) {
250                         done = -EINVAL;
251                         goto out_unlock;
252                 }
253
254                 count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
255                 if (count < 0) {
256                         done = count;
257                         goto out_unlock;
258                 }
259                 done += count;
260         }
261
262 out_unlock:
263         mutex_unlock(&migf->lock);
264         return done;
265 }
266
267 static __poll_t mlx5vf_save_poll(struct file *filp,
268                                  struct poll_table_struct *wait)
269 {
270         struct mlx5_vf_migration_file *migf = filp->private_data;
271         __poll_t pollflags = 0;
272
273         poll_wait(filp, &migf->poll_wait, wait);
274
275         mutex_lock(&migf->lock);
276         if (migf->state == MLX5_MIGF_STATE_ERROR)
277                 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
278         else if (!list_empty(&migf->buf_list) ||
279                  migf->state == MLX5_MIGF_STATE_COMPLETE)
280                 pollflags = EPOLLIN | EPOLLRDNORM;
281         mutex_unlock(&migf->lock);
282
283         return pollflags;
284 }
285
286 /*
287  * The FD stays exposed and the user may keep using it after an error.
288  * Mark migf as being in error and wake up the user.
289  */
290 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
291 {
292         migf->state = MLX5_MIGF_STATE_ERROR;
293         wake_up_interruptible(&migf->poll_wait);
294 }
295
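/*
 * Queue saving of the next chunk. The reference taken here on the
 * migration file is dropped by the work handler.
 */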
296 void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
297                                    u8 chunk_num, size_t next_required_umem_size)
298 {
299         migf->save_data[chunk_num - 1].next_required_umem_size =
300                         next_required_umem_size;
301         migf->save_data[chunk_num - 1].migf = migf;
302         get_file(migf->filp);
303         queue_work(migf->mvdev->cb_wq,
304                    &migf->save_data[chunk_num - 1].work);
305 }
306
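/*
 * Take the pre-allocated stop-copy buffer for @index out of the migration
 * file, replacing it with a larger allocation if it cannot hold
 * @required_length.
 */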
307 static struct mlx5_vhca_data_buffer *
308 mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
309                                   u8 index, size_t required_length)
310 {
311         struct mlx5_vhca_data_buffer *buf = migf->buf[index];
312         u8 chunk_num;
313
314         WARN_ON(!buf);
315         chunk_num = buf->stop_copy_chunk_num;
316         buf->migf->buf[index] = NULL;
317         /* Check whether the pre-allocated buffer can fit the required length */
318         if (buf->allocated_length >= required_length)
319                 return buf;
320
321         mlx5vf_put_data_buffer(buf);
322         buf = mlx5vf_get_data_buffer(buf->migf, required_length,
323                                      DMA_FROM_DEVICE);
324         if (IS_ERR(buf))
325                 return buf;
326
327         buf->stop_copy_chunk_num = chunk_num;
328         return buf;
329 }
330
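/*
 * Work handler that saves a single stop-copy chunk in the background. Any
 * failure marks the migration file as being in error.
 */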
331 static void mlx5vf_mig_file_save_work(struct work_struct *_work)
332 {
333         struct mlx5vf_save_work_data *save_data = container_of(_work,
334                 struct mlx5vf_save_work_data, work);
335         struct mlx5_vf_migration_file *migf = save_data->migf;
336         struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
337         struct mlx5_vhca_data_buffer *buf;
338
339         mutex_lock(&mvdev->state_mutex);
340         if (migf->state == MLX5_MIGF_STATE_ERROR)
341                 goto end;
342
343         buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
344                                 save_data->chunk_num - 1,
345                                 save_data->next_required_umem_size);
346         if (IS_ERR(buf))
347                 goto err;
348
349         if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
350                 goto err_save;
351
352         goto end;
353
354 err_save:
355         mlx5vf_put_data_buffer(buf);
356 err:
357         mlx5vf_mark_err(migf);
358 end:
359         mlx5vf_state_mutex_unlock(mvdev);
360         fput(migf->filp);
361 }
362
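/*
 * Queue an optional header record advertising the expected stop-copy size,
 * letting the destination size its receive buffer before the image arrives.
 */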
363 static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
364                                        bool track)
365 {
366         size_t size = sizeof(struct mlx5_vf_migration_header) +
367                 sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
368         struct mlx5_vf_migration_tag_stop_copy_data data = {};
369         struct mlx5_vhca_data_buffer *header_buf = NULL;
370         struct mlx5_vf_migration_header header = {};
371         unsigned long flags;
372         struct page *page;
373         u8 *to_buff;
374         int ret;
375
376         header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
377         if (IS_ERR(header_buf))
378                 return PTR_ERR(header_buf);
379
380         header.record_size = cpu_to_le64(sizeof(data));
381         header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
382         header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
383         page = mlx5vf_get_migration_page(header_buf, 0);
384         if (!page) {
385                 ret = -EINVAL;
386                 goto err;
387         }
388         to_buff = kmap_local_page(page);
389         memcpy(to_buff, &header, sizeof(header));
390         header_buf->length = sizeof(header);
391         data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
392         memcpy(to_buff + sizeof(header), &data, sizeof(data));
393         header_buf->length += sizeof(data);
394         kunmap_local(to_buff);
395         header_buf->start_pos = header_buf->migf->max_pos;
396         migf->max_pos += header_buf->length;
397         spin_lock_irqsave(&migf->list_lock, flags);
398         list_add_tail(&header_buf->buf_elm, &migf->buf_list);
399         spin_unlock_irqrestore(&migf->list_lock, flags);
400         if (track)
401                 migf->pre_copy_initial_bytes = size;
402         return 0;
403 err:
404         mlx5vf_put_data_buffer(header_buf);
405         return ret;
406 }
407
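/*
 * Pre-allocate the data and header buffers used during stop-copy (one pair
 * per chunk when chunk mode is supported), sized with some headroom, and
 * queue the stop-copy size record.
 */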
408 static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
409                                  struct mlx5_vf_migration_file *migf,
410                                  size_t state_size, u64 full_size,
411                                  bool track)
412 {
413         struct mlx5_vhca_data_buffer *buf;
414         size_t inc_state_size;
415         int num_chunks;
416         int ret;
417         int i;
418
419         if (mvdev->chunk_mode) {
420                 size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
421
422                 /* from the firmware's perspective, at least a 'state_size' buffer must be provided */
423                 inc_state_size = max(state_size, chunk_size);
424         } else {
425                 if (track) {
426                         /* be ready for a stop_copy size that might grow by 10 percent */
427                         if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
428                                 inc_state_size = state_size;
429                 } else {
430                         inc_state_size = state_size;
431                 }
432         }
433
434         /* don't exceed the device specification's max SAVE size */
435         inc_state_size = min_t(size_t, inc_state_size,
436                 (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
437
438         num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
439         for (i = 0; i < num_chunks; i++) {
440                 buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
441                 if (IS_ERR(buf)) {
442                         ret = PTR_ERR(buf);
443                         goto err;
444                 }
445
446                 migf->buf[i] = buf;
447                 buf = mlx5vf_get_data_buffer(migf,
448                                 sizeof(struct mlx5_vf_migration_header), DMA_NONE);
449                 if (IS_ERR(buf)) {
450                         ret = PTR_ERR(buf);
451                         goto err;
452                 }
453                 migf->buf_header[i] = buf;
454                 if (mvdev->chunk_mode) {
455                         migf->buf[i]->stop_copy_chunk_num = i + 1;
456                         migf->buf_header[i]->stop_copy_chunk_num = i + 1;
457                         INIT_WORK(&migf->save_data[i].work,
458                                   mlx5vf_mig_file_save_work);
459                         migf->save_data[i].chunk_num = i + 1;
460                 }
461         }
462
463         ret = mlx5vf_add_stop_copy_header(migf, track);
464         if (ret)
465                 goto err;
466         return 0;
467
468 err:
469         for (i = 0; i < num_chunks; i++) {
470                 if (migf->buf[i]) {
471                         mlx5vf_put_data_buffer(migf->buf[i]);
472                         migf->buf[i] = NULL;
473                 }
474                 if (migf->buf_header[i]) {
475                         mlx5vf_put_data_buffer(migf->buf_header[i]);
476                         migf->buf_header[i] = NULL;
477                 }
478         }
479
480         return ret;
481 }
482
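/*
 * VFIO_MIG_GET_PRECOPY_INFO handler. Report how many initial and dirty
 * bytes remain to be read and, once the queued data has been fully consumed
 * while the device still has dirty state, trigger an incremental SAVE so
 * the next read() has data to return.
 */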
483 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
484                                  unsigned long arg)
485 {
486         struct mlx5_vf_migration_file *migf = filp->private_data;
487         struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
488         struct mlx5_vhca_data_buffer *buf;
489         struct vfio_precopy_info info = {};
490         loff_t *pos = &filp->f_pos;
491         unsigned long minsz;
492         size_t inc_length = 0;
493         bool end_of_data = false;
494         int ret;
495
496         if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
497                 return -ENOTTY;
498
499         minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
500
501         if (copy_from_user(&info, (void __user *)arg, minsz))
502                 return -EFAULT;
503
504         if (info.argsz < minsz)
505                 return -EINVAL;
506
507         mutex_lock(&mvdev->state_mutex);
508         if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
509             mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
510                 ret = -EINVAL;
511                 goto err_state_unlock;
512         }
513
514         /*
515          * We can't issue a SAVE command when the device is suspended, so in
516          * VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for extra
517          * bytes that can't be read.
518          */
519         if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
520                 /*
521                  * Once the query returns it's guaranteed that there is no
522                  * active SAVE command.
523          * As such, the code below is safe with the proper locks.
524                  */
525                 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
526                                                             NULL, MLX5VF_QUERY_INC);
527                 if (ret)
528                         goto err_state_unlock;
529         }
530
531         mutex_lock(&migf->lock);
532         if (migf->state == MLX5_MIGF_STATE_ERROR) {
533                 ret = -ENODEV;
534                 goto err_migf_unlock;
535         }
536
537         if (migf->pre_copy_initial_bytes > *pos) {
538                 info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
539         } else {
540                 info.dirty_bytes = migf->max_pos - *pos;
541                 if (!info.dirty_bytes)
542                         end_of_data = true;
543                 info.dirty_bytes += inc_length;
544         }
545
546         if (!end_of_data || !inc_length) {
547                 mutex_unlock(&migf->lock);
548                 goto done;
549         }
550
551         mutex_unlock(&migf->lock);
552         /*
553          * We finished transferring the current state and the device has new
554          * dirty state; save it so that it is ready to be read.
555          */
556         buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
557         if (IS_ERR(buf)) {
558                 ret = PTR_ERR(buf);
559                 mlx5vf_mark_err(migf);
560                 goto err_state_unlock;
561         }
562
563         ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
564         if (ret) {
565                 mlx5vf_mark_err(migf);
566                 mlx5vf_put_data_buffer(buf);
567                 goto err_state_unlock;
568         }
569
570 done:
571         mlx5vf_state_mutex_unlock(mvdev);
572         if (copy_to_user((void __user *)arg, &info, minsz))
573                 return -EFAULT;
574         return 0;
575
576 err_migf_unlock:
577         mutex_unlock(&migf->lock);
578 err_state_unlock:
579         mlx5vf_state_mutex_unlock(mvdev);
580         return ret;
581 }
582
583 static const struct file_operations mlx5vf_save_fops = {
584         .owner = THIS_MODULE,
585         .read = mlx5vf_save_read,
586         .poll = mlx5vf_save_poll,
587         .unlocked_ioctl = mlx5vf_precopy_ioctl,
588         .compat_ioctl = compat_ptr_ioctl,
589         .release = mlx5vf_release_file,
590         .llseek = no_llseek,
591 };
592
593 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
594 {
595         struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
596         struct mlx5_vhca_data_buffer *buf;
597         size_t length;
598         int ret;
599
600         if (migf->state == MLX5_MIGF_STATE_ERROR)
601                 return -ENODEV;
602
603         ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
604                                 MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
605         if (ret)
606                 goto err;
607
608         buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
609         if (IS_ERR(buf)) {
610                 ret = PTR_ERR(buf);
611                 goto err;
612         }
613
614         ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
615         if (ret)
616                 goto err_save;
617
618         return 0;
619
620 err_save:
621         mlx5vf_put_data_buffer(buf);
622 err:
623         mlx5vf_mark_err(migf);
624         return ret;
625 }
626
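/*
 * Create the saving migration file, prepare its buffers and issue the
 * initial SAVE command. @track selects the pre-copy flow, where further
 * incremental saves will follow.
 */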
627 static struct mlx5_vf_migration_file *
628 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
629 {
630         struct mlx5_vf_migration_file *migf;
631         struct mlx5_vhca_data_buffer *buf;
632         size_t length;
633         u64 full_size;
634         int ret;
635
636         migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
637         if (!migf)
638                 return ERR_PTR(-ENOMEM);
639
640         migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
641                                         O_RDONLY);
642         if (IS_ERR(migf->filp)) {
643                 ret = PTR_ERR(migf->filp);
644                 goto end;
645         }
646
647         migf->mvdev = mvdev;
648         ret = mlx5vf_cmd_alloc_pd(migf);
649         if (ret)
650                 goto out_free;
651
652         stream_open(migf->filp->f_inode, migf->filp);
653         mutex_init(&migf->lock);
654         init_waitqueue_head(&migf->poll_wait);
655         init_completion(&migf->save_comp);
656         /*
657          * save_comp is being used as a binary semaphore built from
658          * a completion. A normal mutex cannot be used because the lock is
659          * passed between kernel threads and lockdep can't model this.
660          */
661         complete(&migf->save_comp);
662         mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
663         INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
664         INIT_LIST_HEAD(&migf->buf_list);
665         INIT_LIST_HEAD(&migf->avail_list);
666         spin_lock_init(&migf->list_lock);
667         ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
668         if (ret)
669                 goto out_pd;
670
671         ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
672         if (ret)
673                 goto out_pd;
674
675         if (track) {
676                 /* leave the allocated buffer ready for the stop-copy phase */
677                 buf = mlx5vf_alloc_data_buffer(migf,
678                         migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
679                 if (IS_ERR(buf)) {
680                         ret = PTR_ERR(buf);
681                         goto out_pd;
682                 }
683         } else {
684                 buf = migf->buf[0];
685                 migf->buf[0] = NULL;
686         }
687
688         ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
689         if (ret)
690                 goto out_save;
691         return migf;
692 out_save:
693         mlx5vf_free_data_buffer(buf);
694 out_pd:
695         mlx5fv_cmd_clean_migf_resources(migf);
696 out_free:
697         fput(migf->filp);
698 end:
699         kfree(migf);
700         return ERR_PTR(ret);
701 }
702
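/*
 * Copy user data into the migration buffer one page (or less) at a time,
 * advancing the stream position and the caller's counters.
 */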
703 static int
704 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
705                               const char __user **buf, size_t *len,
706                               loff_t *pos, ssize_t *done)
707 {
708         unsigned long offset;
709         size_t page_offset;
710         struct page *page;
711         size_t page_len;
712         u8 *to_buff;
713         int ret;
714
715         offset = *pos - vhca_buf->start_pos;
716         page_offset = offset % PAGE_SIZE;
717
718         page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
719         if (!page)
720                 return -EINVAL;
721         page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
722         to_buff = kmap_local_page(page);
723         ret = copy_from_user(to_buff + page_offset, *buf, page_len);
724         kunmap_local(to_buff);
725         if (ret)
726                 return -EFAULT;
727
728         *pos += page_len;
729         *done += page_len;
730         *buf += page_len;
731         *len -= page_len;
732         vhca_buf->length += page_len;
733         return 0;
734 }
735
736 static ssize_t
737 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
738                          struct mlx5_vhca_data_buffer *vhca_buf,
739                          size_t image_size, const char __user **buf,
740                          size_t *len, loff_t *pos, ssize_t *done,
741                          bool *has_work)
742 {
743         size_t copy_len, to_copy;
744         int ret;
745
746         to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
747         copy_len = to_copy;
748         while (to_copy) {
749                 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
750                                                     done);
751                 if (ret)
752                         return ret;
753         }
754
755         *len -= copy_len;
756         if (vhca_buf->length == image_size) {
757                 migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
758                 migf->max_pos += image_size;
759                 *has_work = true;
760         }
761
762         return 0;
763 }
764
765 static int
766 mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
767                                struct mlx5_vhca_data_buffer *vhca_buf,
768                                const char __user **buf, size_t *len,
769                                loff_t *pos, ssize_t *done)
770 {
771         size_t copy_len, to_copy;
772         size_t required_data;
773         u8 *to_buff;
774         int ret;
775
776         required_data = migf->record_size - vhca_buf->length;
777         to_copy = min_t(size_t, *len, required_data);
778         copy_len = to_copy;
779         while (to_copy) {
780                 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
781                                                     done);
782                 if (ret)
783                         return ret;
784         }
785
786         *len -= copy_len;
787         if (vhca_buf->length == migf->record_size) {
788                 switch (migf->record_tag) {
789                 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
790                 {
791                         struct page *page;
792
793                         page = mlx5vf_get_migration_page(vhca_buf, 0);
794                         if (!page)
795                                 return -EINVAL;
796                         to_buff = kmap_local_page(page);
797                         migf->stop_copy_prep_size = min_t(u64,
798                                 le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
799                         kunmap_local(to_buff);
800                         break;
801                 }
802                 default:
803                         /* Optional tag */
804                         break;
805                 }
806
807                 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
808                 migf->max_pos += migf->record_size;
809                 vhca_buf->length = 0;
810         }
811
812         return 0;
813 }
814
815 static int
816 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
817                           struct mlx5_vhca_data_buffer *vhca_buf,
818                           const char __user **buf,
819                           size_t *len, loff_t *pos,
820                           ssize_t *done, bool *has_work)
821 {
822         struct page *page;
823         size_t copy_len;
824         u8 *to_buff;
825         int ret;
826
827         copy_len = min_t(size_t, *len,
828                 sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
829         page = mlx5vf_get_migration_page(vhca_buf, 0);
830         if (!page)
831                 return -EINVAL;
832         to_buff = kmap_local_page(page);
833         ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
834         if (ret) {
835                 ret = -EFAULT;
836                 goto end;
837         }
838
839         *buf += copy_len;
840         *pos += copy_len;
841         *done += copy_len;
842         *len -= copy_len;
843         vhca_buf->length += copy_len;
844         if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
845                 u64 record_size;
846                 u32 flags;
847
848                 record_size = le64_to_cpup((__le64 *)to_buff);
849                 if (record_size > MAX_LOAD_SIZE) {
850                         ret = -ENOMEM;
851                         goto end;
852                 }
853
854                 migf->record_size = record_size;
855                 flags = le32_to_cpup((__le32 *)(to_buff +
856                             offsetof(struct mlx5_vf_migration_header, flags)));
857                 migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
858                             offsetof(struct mlx5_vf_migration_header, tag)));
859                 switch (migf->record_tag) {
860                 case MLX5_MIGF_HEADER_TAG_FW_DATA:
861                         migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
862                         break;
863                 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
864                         migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
865                         break;
866                 default:
867                         if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
868                                 ret = -EOPNOTSUPP;
869                                 goto end;
870                         }
871                         /* We may read and skip this optional record data */
872                         migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
873                 }
874
875                 migf->max_pos += vhca_buf->length;
876                 vhca_buf->length = 0;
877                 *has_work = true;
878         }
879 end:
880         kunmap_local(to_buff);
881         return ret;
882 }
883
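/*
 * write() handler of the resume FD. Drives a small state machine that
 * parses record headers, optional header data and device images out of the
 * user stream, loading every complete image into the device.
 */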
884 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
885                                    size_t len, loff_t *pos)
886 {
887         struct mlx5_vf_migration_file *migf = filp->private_data;
888         struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
889         struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
890         loff_t requested_length;
891         bool has_work = false;
892         ssize_t done = 0;
893         int ret = 0;
894
895         if (pos)
896                 return -ESPIPE;
897         pos = &filp->f_pos;
898
899         if (*pos < 0 ||
900             check_add_overflow((loff_t)len, *pos, &requested_length))
901                 return -EINVAL;
902
903         mutex_lock(&migf->mvdev->state_mutex);
904         mutex_lock(&migf->lock);
905         if (migf->state == MLX5_MIGF_STATE_ERROR) {
906                 ret = -ENODEV;
907                 goto out_unlock;
908         }
909
910         while (len || has_work) {
911                 has_work = false;
912                 switch (migf->load_state) {
913                 case MLX5_VF_LOAD_STATE_READ_HEADER:
914                         ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
915                                                         &buf, &len, pos,
916                                                         &done, &has_work);
917                         if (ret)
918                                 goto out_unlock;
919                         break;
920                 case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
921                         if (vhca_buf_header->allocated_length < migf->record_size) {
922                                 mlx5vf_free_data_buffer(vhca_buf_header);
923
924                                 migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
925                                                 migf->record_size, DMA_NONE);
926                                 if (IS_ERR(migf->buf_header[0])) {
927                                         ret = PTR_ERR(migf->buf_header[0]);
928                                         migf->buf_header[0] = NULL;
929                                         goto out_unlock;
930                                 }
931
932                                 vhca_buf_header = migf->buf_header[0];
933                         }
934
935                         vhca_buf_header->start_pos = migf->max_pos;
936                         migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
937                         break;
938                 case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
939                         ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
940                                                         &buf, &len, pos, &done);
941                         if (ret)
942                                 goto out_unlock;
943                         break;
944                 case MLX5_VF_LOAD_STATE_PREP_IMAGE:
945                 {
946                         u64 size = max(migf->record_size,
947                                        migf->stop_copy_prep_size);
948
949                         if (vhca_buf->allocated_length < size) {
950                                 mlx5vf_free_data_buffer(vhca_buf);
951
952                                 migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
953                                                         size, DMA_TO_DEVICE);
954                                 if (IS_ERR(migf->buf[0])) {
955                                         ret = PTR_ERR(migf->buf[0]);
956                                         migf->buf[0] = NULL;
957                                         goto out_unlock;
958                                 }
959
960                                 vhca_buf = migf->buf[0];
961                         }
962
963                         vhca_buf->start_pos = migf->max_pos;
964                         migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
965                         break;
966                 }
967                 case MLX5_VF_LOAD_STATE_READ_IMAGE:
968                         ret = mlx5vf_resume_read_image(migf, vhca_buf,
969                                                 migf->record_size,
970                                                 &buf, &len, pos, &done, &has_work);
971                         if (ret)
972                                 goto out_unlock;
973                         break;
974                 case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
975                         ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
976                         if (ret)
977                                 goto out_unlock;
978                         migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
979
980                         /* prep header buf for next image */
981                         vhca_buf_header->length = 0;
982                         /* prep data buf for next image */
983                         vhca_buf->length = 0;
984
985                         break;
986                 default:
987                         break;
988                 }
989         }
990
991 out_unlock:
992         if (ret)
993                 migf->state = MLX5_MIGF_STATE_ERROR;
994         mutex_unlock(&migf->lock);
995         mlx5vf_state_mutex_unlock(migf->mvdev);
996         return ret ? ret : done;
997 }
998
999 static const struct file_operations mlx5vf_resume_fops = {
1000         .owner = THIS_MODULE,
1001         .write = mlx5vf_resume_write,
1002         .release = mlx5vf_release_file,
1003         .llseek = no_llseek,
1004 };
1005
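/*
 * Create the resuming migration file along with an initial data buffer and
 * a header buffer, ready to parse the incoming stream.
 */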
1006 static struct mlx5_vf_migration_file *
1007 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
1008 {
1009         struct mlx5_vf_migration_file *migf;
1010         struct mlx5_vhca_data_buffer *buf;
1011         int ret;
1012
1013         migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
1014         if (!migf)
1015                 return ERR_PTR(-ENOMEM);
1016
1017         migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
1018                                         O_WRONLY);
1019         if (IS_ERR(migf->filp)) {
1020                 ret = PTR_ERR(migf->filp);
1021                 goto end;
1022         }
1023
1024         migf->mvdev = mvdev;
1025         ret = mlx5vf_cmd_alloc_pd(migf);
1026         if (ret)
1027                 goto out_free;
1028
1029         buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
1030         if (IS_ERR(buf)) {
1031                 ret = PTR_ERR(buf);
1032                 goto out_pd;
1033         }
1034
1035         migf->buf[0] = buf;
1036         buf = mlx5vf_alloc_data_buffer(migf,
1037                 sizeof(struct mlx5_vf_migration_header), DMA_NONE);
1038         if (IS_ERR(buf)) {
1039                 ret = PTR_ERR(buf);
1040                 goto out_buf;
1041         }
1042
1043         migf->buf_header[0] = buf;
1044         migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
1045
1046         stream_open(migf->filp->f_inode, migf->filp);
1047         mutex_init(&migf->lock);
1048         INIT_LIST_HEAD(&migf->buf_list);
1049         INIT_LIST_HEAD(&migf->avail_list);
1050         spin_lock_init(&migf->list_lock);
1051         return migf;
1052 out_buf:
1053         mlx5vf_free_data_buffer(migf->buf[0]);
1054 out_pd:
1055         mlx5vf_cmd_dealloc_pd(migf);
1056 out_free:
1057         fput(migf->filp);
1058 end:
1059         kfree(migf);
1060         return ERR_PTR(ret);
1061 }
1062
1063 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
1064                         enum mlx5_vf_migf_state *last_save_state)
1065 {
1066         if (mvdev->resuming_migf) {
1067                 mlx5vf_disable_fd(mvdev->resuming_migf);
1068                 mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
1069                 fput(mvdev->resuming_migf->filp);
1070                 mvdev->resuming_migf = NULL;
1071         }
1072         if (mvdev->saving_migf) {
1073                 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
1074                 cancel_work_sync(&mvdev->saving_migf->async_data.work);
1075                 if (last_save_state)
1076                         *last_save_state = mvdev->saving_migf->state;
1077                 mlx5vf_disable_fd(mvdev->saving_migf);
1078                 wake_up_interruptible(&mvdev->saving_migf->poll_wait);
1079                 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
1080                 fput(mvdev->saving_migf->filp);
1081                 mvdev->saving_migf = NULL;
1082         }
1083 }
1084
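/*
 * Execute a single arc of the VFIO device migration state machine and
 * return the data transfer FD when the arc produces one.
 */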
1085 static struct file *
1086 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
1087                                     u32 new)
1088 {
1089         u32 cur = mvdev->mig_state;
1090         int ret;
1091
1092         if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1093                 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1094                         MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1095                 if (ret)
1096                         return ERR_PTR(ret);
1097                 return NULL;
1098         }
1099
1100         if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1101                 ret = mlx5vf_cmd_resume_vhca(mvdev,
1102                         MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1103                 if (ret)
1104                         return ERR_PTR(ret);
1105                 return NULL;
1106         }
1107
1108         if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1109             (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1110                 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1111                         MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1112                 if (ret)
1113                         return ERR_PTR(ret);
1114                 return NULL;
1115         }
1116
1117         if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1118             (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1119                 ret = mlx5vf_cmd_resume_vhca(mvdev,
1120                         MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1121                 if (ret)
1122                         return ERR_PTR(ret);
1123                 return NULL;
1124         }
1125
1126         if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1127                 struct mlx5_vf_migration_file *migf;
1128
1129                 migf = mlx5vf_pci_save_device_data(mvdev, false);
1130                 if (IS_ERR(migf))
1131                         return ERR_CAST(migf);
1132                 get_file(migf->filp);
1133                 mvdev->saving_migf = migf;
1134                 return migf->filp;
1135         }
1136
1137         if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
1138                 mlx5vf_disable_fds(mvdev, NULL);
1139                 return NULL;
1140         }
1141
1142         if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1143             (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1144              new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1145                 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
1146                 struct mlx5_vhca_data_buffer *buf;
1147                 enum mlx5_vf_migf_state state;
1148                 size_t size;
1149
1150                 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
1151                                         MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
1152                 if (ret)
1153                         return ERR_PTR(ret);
1154                 buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
1155                 if (IS_ERR(buf))
1156                         return ERR_CAST(buf);
1157                 /* pre_copy cleanup */
1158                 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
1159                 if (ret) {
1160                         mlx5vf_put_data_buffer(buf);
1161                         return ERR_PTR(ret);
1162                 }
1163                 mlx5vf_disable_fds(mvdev, &state);
1164                 return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
1165         }
1166
1167         if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1168                 struct mlx5_vf_migration_file *migf;
1169
1170                 migf = mlx5vf_pci_resume_device_data(mvdev);
1171                 if (IS_ERR(migf))
1172                         return ERR_CAST(migf);
1173                 get_file(migf->filp);
1174                 mvdev->resuming_migf = migf;
1175                 return migf->filp;
1176         }
1177
1178         if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1179                 mlx5vf_disable_fds(mvdev, NULL);
1180                 return NULL;
1181         }
1182
1183         if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1184             (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1185              new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1186                 struct mlx5_vf_migration_file *migf;
1187
1188                 migf = mlx5vf_pci_save_device_data(mvdev, true);
1189                 if (IS_ERR(migf))
1190                         return ERR_CAST(migf);
1191                 get_file(migf->filp);
1192                 mvdev->saving_migf = migf;
1193                 return migf->filp;
1194         }
1195
1196         if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1197                 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1198                         MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1199                 if (ret)
1200                         return ERR_PTR(ret);
1201                 ret = mlx5vf_pci_save_device_inc_data(mvdev);
1202                 return ret ? ERR_PTR(ret) : NULL;
1203         }
1204
1205         /*
1206          * vfio_mig_get_next_state() does not use arcs other than the above
1207          */
1208         WARN_ON(true);
1209         return ERR_PTR(-EINVAL);
1210 }
1211
1212 /*
1213  * This function is called in all state_mutex unlock cases to
1214  * handle a 'deferred_reset', if one exists.
1215  */
1216 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1217 {
1218 again:
1219         spin_lock(&mvdev->reset_lock);
1220         if (mvdev->deferred_reset) {
1221                 mvdev->deferred_reset = false;
1222                 spin_unlock(&mvdev->reset_lock);
1223                 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1224                 mlx5vf_disable_fds(mvdev, NULL);
1225                 goto again;
1226         }
1227         mutex_unlock(&mvdev->state_mutex);
1228         spin_unlock(&mvdev->reset_lock);
1229 }
1230
1231 static struct file *
1232 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
1233                             enum vfio_device_mig_state new_state)
1234 {
1235         struct mlx5vf_pci_core_device *mvdev = container_of(
1236                 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1237         enum vfio_device_mig_state next_state;
1238         struct file *res = NULL;
1239         int ret;
1240
1241         mutex_lock(&mvdev->state_mutex);
1242         while (new_state != mvdev->mig_state) {
1243                 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1244                                               new_state, &next_state);
1245                 if (ret) {
1246                         res = ERR_PTR(ret);
1247                         break;
1248                 }
1249                 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1250                 if (IS_ERR(res))
1251                         break;
1252                 mvdev->mig_state = next_state;
1253                 if (WARN_ON(res && new_state != mvdev->mig_state)) {
1254                         fput(res);
1255                         res = ERR_PTR(-EINVAL);
1256                         break;
1257                 }
1258         }
1259         mlx5vf_state_mutex_unlock(mvdev);
1260         return res;
1261 }
1262
1263 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1264                                     unsigned long *stop_copy_length)
1265 {
1266         struct mlx5vf_pci_core_device *mvdev = container_of(
1267                 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1268         size_t state_size;
1269         u64 total_size;
1270         int ret;
1271
1272         mutex_lock(&mvdev->state_mutex);
1273         ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
1274                                                     &total_size, 0);
1275         if (!ret)
1276                 *stop_copy_length = total_size;
1277         mlx5vf_state_mutex_unlock(mvdev);
1278         return ret;
1279 }
1280
1281 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1282                                        enum vfio_device_mig_state *curr_state)
1283 {
1284         struct mlx5vf_pci_core_device *mvdev = container_of(
1285                 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1286
1287         mutex_lock(&mvdev->state_mutex);
1288         *curr_state = mvdev->mig_state;
1289         mlx5vf_state_mutex_unlock(mvdev);
1290         return 0;
1291 }
1292
1293 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1294 {
1295         struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1296
1297         if (!mvdev->migrate_cap)
1298                 return;
1299
1300         /*
1301          * The higher VFIO layers hold locks across reset and use those same
1302          * locks together with the mm_lock, so we must prevent an ABBA deadlock
1303          * between the state_mutex and the mm_lock.
1304          * If the state_mutex is already taken, defer the cleanup work to the
1305          * unlock flow of the other running context.
1306          */
1307         spin_lock(&mvdev->reset_lock);
1308         mvdev->deferred_reset = true;
1309         if (!mutex_trylock(&mvdev->state_mutex)) {
1310                 spin_unlock(&mvdev->reset_lock);
1311                 return;
1312         }
1313         spin_unlock(&mvdev->reset_lock);
1314         mlx5vf_state_mutex_unlock(mvdev);
1315 }
1316
1317 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1318 {
1319         struct mlx5vf_pci_core_device *mvdev = container_of(
1320                 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1321         struct vfio_pci_core_device *vdev = &mvdev->core_device;
1322         int ret;
1323
1324         ret = vfio_pci_core_enable(vdev);
1325         if (ret)
1326                 return ret;
1327
1328         if (mvdev->migrate_cap)
1329                 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1330         vfio_pci_core_finish_enable(vdev);
1331         return 0;
1332 }
1333
1334 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1335 {
1336         struct mlx5vf_pci_core_device *mvdev = container_of(
1337                 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1338
1339         mlx5vf_cmd_close_migratable(mvdev);
1340         vfio_pci_core_close_device(core_vdev);
1341 }
1342
1343 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1344         .migration_set_state = mlx5vf_pci_set_device_state,
1345         .migration_get_state = mlx5vf_pci_get_device_state,
1346         .migration_get_data_size = mlx5vf_pci_get_data_size,
1347 };
1348
1349 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1350         .log_start = mlx5vf_start_page_tracker,
1351         .log_stop = mlx5vf_stop_page_tracker,
1352         .log_read_and_clear = mlx5vf_tracker_read_and_clear,
1353 };
1354
1355 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1356 {
1357         struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1358                         struct mlx5vf_pci_core_device, core_device.vdev);
1359         int ret;
1360
1361         ret = vfio_pci_core_init_dev(core_vdev);
1362         if (ret)
1363                 return ret;
1364
1365         mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1366                                   &mlx5vf_pci_log_ops);
1367
1368         return 0;
1369 }
1370
1371 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1372 {
1373         struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1374                         struct mlx5vf_pci_core_device, core_device.vdev);
1375
1376         mlx5vf_cmd_remove_migratable(mvdev);
1377         vfio_pci_core_release_dev(core_vdev);
1378 }
1379
1380 static const struct vfio_device_ops mlx5vf_pci_ops = {
1381         .name = "mlx5-vfio-pci",
1382         .init = mlx5vf_pci_init_dev,
1383         .release = mlx5vf_pci_release_dev,
1384         .open_device = mlx5vf_pci_open_device,
1385         .close_device = mlx5vf_pci_close_device,
1386         .ioctl = vfio_pci_core_ioctl,
1387         .device_feature = vfio_pci_core_ioctl_feature,
1388         .read = vfio_pci_core_read,
1389         .write = vfio_pci_core_write,
1390         .mmap = vfio_pci_core_mmap,
1391         .request = vfio_pci_core_request,
1392         .match = vfio_pci_core_match,
1393         .bind_iommufd = vfio_iommufd_physical_bind,
1394         .unbind_iommufd = vfio_iommufd_physical_unbind,
1395         .attach_ioas = vfio_iommufd_physical_attach_ioas,
1396         .detach_ioas = vfio_iommufd_physical_detach_ioas,
1397 };
1398
1399 static int mlx5vf_pci_probe(struct pci_dev *pdev,
1400                             const struct pci_device_id *id)
1401 {
1402         struct mlx5vf_pci_core_device *mvdev;
1403         int ret;
1404
1405         mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1406                                   &pdev->dev, &mlx5vf_pci_ops);
1407         if (IS_ERR(mvdev))
1408                 return PTR_ERR(mvdev);
1409
1410         dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1411         ret = vfio_pci_core_register_device(&mvdev->core_device);
1412         if (ret)
1413                 goto out_put_vdev;
1414         return 0;
1415
1416 out_put_vdev:
1417         vfio_put_device(&mvdev->core_device.vdev);
1418         return ret;
1419 }
1420
1421 static void mlx5vf_pci_remove(struct pci_dev *pdev)
1422 {
1423         struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1424
1425         vfio_pci_core_unregister_device(&mvdev->core_device);
1426         vfio_put_device(&mvdev->core_device.vdev);
1427 }
1428
1429 static const struct pci_device_id mlx5vf_pci_table[] = {
1430         { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1431         {}
1432 };
1433
1434 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1435
1436 static const struct pci_error_handlers mlx5vf_err_handlers = {
1437         .reset_done = mlx5vf_pci_aer_reset_done,
1438         .error_detected = vfio_pci_core_aer_err_detected,
1439 };
1440
1441 static struct pci_driver mlx5vf_pci_driver = {
1442         .name = KBUILD_MODNAME,
1443         .id_table = mlx5vf_pci_table,
1444         .probe = mlx5vf_pci_probe,
1445         .remove = mlx5vf_pci_remove,
1446         .err_handler = &mlx5vf_err_handlers,
1447         .driver_managed_dma = true,
1448 };
1449
1450 module_pci_driver(mlx5vf_pci_driver);
1451
1452 MODULE_IMPORT_NS(IOMMUFD);
1453 MODULE_LICENSE("GPL");
1454 MODULE_AUTHOR("Max Gurtovoy <[email protected]>");
1455 MODULE_AUTHOR("Yishai Hadas <[email protected]>");
1456 MODULE_DESCRIPTION(
1457         "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");