Git Repo - linux.git/commitdiff
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg...
author Linus Torvalds <[email protected]>
Thu, 2 Nov 2023 02:44:56 +0000 (16:44 -1000)
committer Linus Torvalds <[email protected]>
Thu, 2 Nov 2023 02:44:56 +0000 (16:44 -1000)
Pull iommufd updates from Jason Gunthorpe:
 "This brings three new iommufd capabilities:

   - Dirty tracking for DMA.

     AMD/ARM/Intel IOMMUs can now record, in the IOPTEs of the IO page
     table, whether a DMA has written to a page. This can be used to
     generate a record of what memory is being dirtied by DMA activity
     during a VM migration process. A VMM like QEMU will combine the
     IOMMU dirty bits with the CPU's dirty log to determine what
     memory to transfer.

     VFIO already has a DMA dirty tracking framework that requires PCI
     devices to implement tracking HW internally. The iommufd version
     provides an alternative that the VMM can select, if available. The
     two are designed to have very similar APIs.

   - Userspace-controlled attributes for hardware page tables
     (HWPT/iommu_domain). There are currently a few generic attributes
     for HWPTs (support for dirty tracking, and being the parent of a
     nest). This is an entry point for the userspace IOMMU driver to
     control the HW in detail.

   - Nested translation support for HWPTs. This is a 2D translation
     scheme, similar to the CPU's, where a DMA goes through a first
     stage to determine an intermediate address, which is then
     translated through a second stage to a physical address.

     As with CPU translation, the first stage table exists in
     VM-controlled memory while the second stage is in the kernel and
     matches the VM's guest-to-physical map.

     As every IOMMU has a unique set of parameters describing the S1
     IO page table, the userspace IOMMU driver has to marshal that
     information into the correct format.

     This is 1/3 of the feature: it allows creating the nested
     translation and binding it to VFIO devices. However, the APIs to
     support IOTLB and ATC invalidation of the stage 1 IO page table,
     and to forward IO faults, are still in progress.

  The series includes AMD and Intel support for dirty tracking, and
  Intel support for nested translation.

  Along the way are a number of internal items:

   - New iommu core items: ops->domain_alloc_user(),
     ops->set_dirty_tracking(), ops->read_and_clear_dirty(),
     IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user()

   - Use-after-free (UAF) fix in iopt_area_split()

   - Spelling fixes and some test suite improvements"
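
The dirty tracking described above is driven from userspace through two
new iommufd ioctls, IOMMU_HWPT_SET_DIRTY_TRACKING and
IOMMU_HWPT_GET_DIRTY_BITMAP. Below is a minimal, hypothetical sketch of
how a VMM might use them on a HWPT that was allocated with the
IOMMU_HWPT_ALLOC_DIRTY_TRACKING flag; the struct layouts are assumed
from the uapi added by this merge and should be checked against
include/uapi/linux/iommufd.h rather than taken from this sketch.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/*
 * Enable dirty tracking on hwpt_id, then (later) harvest the dirty
 * bitmap for one IOVA range.  'bitmap' must hold one bit per page_size
 * unit of 'length'.  Returns 0 on success, -1 on the first failing
 * ioctl.  Illustrative only.
 */
static int pull_dirty_bitmap(int iommufd, uint32_t hwpt_id,
                             uint64_t iova, uint64_t length,
                             uint64_t page_size, void *bitmap)
{
        struct iommu_hwpt_set_dirty_tracking set = {
                .size = sizeof(set),
                .flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE,
                .hwpt_id = hwpt_id,
        };
        struct iommu_hwpt_get_dirty_bitmap get = {
                .size = sizeof(get),
                .hwpt_id = hwpt_id,
                .iova = iova,
                .length = length,
                .page_size = page_size,
                .data = (uintptr_t)bitmap,
        };

        if (ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set))
                return -1;
        /* ... let the guest run, then pull what DMA dirtied ... */
        return ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get) ? -1 : 0;
}

A VMM would repeat the GET step on each migration iteration and OR the
result into the dirty bitmap it already maintains from the CPU dirty log.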

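The nested translation and the user-controlled HWPT attributes meet in
IOMMU_HWPT_ALLOC: a nest-parent (stage 2) HWPT is allocated first, then
a stage 1 HWPT is nested on top of it with driver-specific data. The
sketch below assumes the Intel VT-d data type (IOMMU_HWPT_DATA_VTD_S1
and struct iommu_hwpt_vtd_s1) added by this merge; it is illustrative
only, and the exact field contents should be taken from the uapi
header. IOTLB/ATC invalidation and IO fault forwarding are not shown
since, as noted above, those APIs are still in progress.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/*
 * Allocate a nest-parent HWPT over an IOAS (the S2, GPA->HPA), then
 * nest a stage 1 HWPT on it whose page table lives in guest memory at
 * s1_pgtbl_gpa.  Illustrative only; error unwinding is omitted.
 */
static int alloc_vtd_nested(int iommufd, uint32_t dev_id, uint32_t ioas_id,
                            uint64_t s1_pgtbl_gpa, uint32_t addr_width,
                            uint32_t *out_nested_id)
{
        struct iommu_hwpt_vtd_s1 s1 = {
                .pgtbl_addr = s1_pgtbl_gpa,     /* guest-physical address */
                .addr_width = addr_width,       /* e.g. 48 or 57 */
        };
        struct iommu_hwpt_alloc parent = {
                .size = sizeof(parent),
                .flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
                .dev_id = dev_id,
                .pt_id = ioas_id,
        };
        struct iommu_hwpt_alloc nested = {
                .size = sizeof(nested),
                .dev_id = dev_id,
                .data_type = IOMMU_HWPT_DATA_VTD_S1,
                .data_len = sizeof(s1),
                .data_uptr = (uintptr_t)&s1,
        };

        if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &parent))
                return -1;

        nested.pt_id = parent.out_hwpt_id;      /* nest on the S2 parent */
        if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &nested))
                return -1;

        *out_nested_id = nested.out_hwpt_id;
        return 0;
}
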
* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
  iommufd: Organize the mock domain alloc functions closer to Joerg's tree
  iommufd/selftest: Fix page-size check in iommufd_test_dirty()
  iommufd: Add iopt_area_alloc()
  iommufd: Fix missing update of domains_itree after splitting iopt_area
  iommu/vt-d: Disallow read-only mappings to nest parent domain
  iommu/vt-d: Add nested domain allocation
  iommu/vt-d: Set the nested domain to a device
  iommu/vt-d: Make domain attach helpers to be extern
  iommu/vt-d: Add helper to setup pasid nested translation
  iommu/vt-d: Add helper for nested domain allocation
  iommu/vt-d: Extend dmar_domain to support nested domain
  iommufd: Add data structure for Intel VT-d stage-1 domain allocation
  iommu/vt-d: Enhance capability check for nested parent domain allocation
  iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
  iommufd/selftest: Add nested domain allocation for mock domain
  iommu: Add iommu_copy_struct_from_user helper
  iommufd: Add a nested HW pagetable object
  iommu: Pass in parent domain with user_data to domain_alloc_user op
  iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
  iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
  ...

drivers/iommu/Kconfig
drivers/iommu/intel/Kconfig
drivers/iommu/intel/iommu.c
drivers/iommu/intel/iommu.h
drivers/vfio/pci/mlx5/main.c
drivers/vfio/pci/pds/Kconfig
drivers/vfio/vfio_main.c

diff --combined drivers/iommu/Kconfig
index 7f04491ca5f01fddd258e93faf3c27db69507457,5cc869db1b79fc3c7d6034cde5ba724970e55f2f..ee9e2a2edbf563efa22ad2902ac9d5d23077c073
@@@ -7,6 -7,10 +7,10 @@@ config IOMMU_IOV
  config IOMMU_API
        bool
  
+ config IOMMUFD_DRIVER
+       bool
+       default n
  menuconfig IOMMU_SUPPORT
        bool "IOMMU Hardware Support"
        depends on MMU
@@@ -91,7 -95,7 +95,7 @@@ config IOMMU_DEBUGF
  choice
        prompt "IOMMU default domain type"
        depends on IOMMU_API
 -      default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64
 +      default IOMMU_DEFAULT_DMA_LAZY if X86
        default IOMMU_DEFAULT_DMA_STRICT
        help
          Choose the type of IOMMU domain used to manage DMA API usage by
@@@ -146,7 -150,7 +150,7 @@@ config OF_IOMM
  
  # IOMMU-agnostic DMA-mapping layer
  config IOMMU_DMA
 -      def_bool ARM64 || IA64 || X86
 +      def_bool ARM64 || X86
        select DMA_OPS
        select IOMMU_API
        select IOMMU_IOVA
index 119d2c57a48ed5203af0e28f1330a0b86ac071aa,f5348b80652b65bdc043c2a01168789c65a2e626..012cd2541a68a62b8360591f2c680fe14104eec1
@@@ -11,10 -11,11 +11,11 @@@ config DMAR_DEBU
  
  config INTEL_IOMMU
        bool "Support for Intel IOMMU using DMA Remapping Devices"
 -      depends on PCI_MSI && ACPI && (X86 || IA64)
 +      depends on PCI_MSI && ACPI && X86
        select DMA_OPS
        select IOMMU_API
        select IOMMU_IOVA
+       select IOMMUFD_DRIVER if IOMMUFD
        select NEED_DMA_MAP_STATE
        select DMAR_TABLE
        select SWIOTLB
index 3685ba90ec88e81baac849f1693f507e005f4a21,a2c429855cc08393b4ddfc31819884a4a9e2b9d4..d1037280abf7a2bc4fd51d5e6de5ce0932c66424
@@@ -282,7 -282,6 +282,6 @@@ static LIST_HEAD(dmar_satc_units)
  #define for_each_rmrr_units(rmrr) \
        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
  
- static void device_block_translation(struct device *dev);
  static void intel_iommu_domain_free(struct iommu_domain *domain);
  
  int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
@@@ -300,6 -299,7 +299,7 @@@ static int iommu_skip_te_disable
  #define IDENTMAP_AZALIA               4
  
  const struct iommu_ops intel_iommu_ops;
+ const struct iommu_dirty_ops intel_dirty_ops;
  
  static bool translation_pre_enabled(struct intel_iommu *iommu)
  {
@@@ -560,7 -560,7 +560,7 @@@ static unsigned long domain_super_pgsiz
  }
  
  /* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
+ void domain_update_iommu_cap(struct dmar_domain *domain)
  {
        domain_update_iommu_coherency(domain);
        domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
@@@ -1778,8 -1778,7 +1778,7 @@@ static struct dmar_domain *alloc_domain
        return domain;
  }
  
- static int domain_attach_iommu(struct dmar_domain *domain,
-                              struct intel_iommu *iommu)
+ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
  {
        struct iommu_domain_info *info, *curr;
        unsigned long ndomains;
@@@ -1828,8 -1827,7 +1827,7 @@@ err_unlock
        return ret;
  }
  
- static void domain_detach_iommu(struct dmar_domain *domain,
-                               struct intel_iommu *iommu)
+ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
  {
        struct iommu_domain_info *info;
  
@@@ -2196,6 -2194,11 +2194,11 @@@ __domain_mapping(struct dmar_domain *do
        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;
  
+       if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
+               pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+               return -EINVAL;
+       }
        attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
        attr |= DMA_FL_PTE_PRESENT;
        if (domain->use_first_level) {
@@@ -2998,6 -3001,13 +3001,6 @@@ static int iommu_suspend(void
        struct intel_iommu *iommu = NULL;
        unsigned long flag;
  
 -      for_each_active_iommu(iommu, drhd) {
 -              iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
 -                                           GFP_KERNEL);
 -              if (!iommu->iommu_state)
 -                      goto nomem;
 -      }
 -
        iommu_flush_all();
  
        for_each_active_iommu(iommu, drhd) {
                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
        }
        return 0;
 -
 -nomem:
 -      for_each_active_iommu(iommu, drhd)
 -              kfree(iommu->iommu_state);
 -
 -      return -ENOMEM;
  }
  
  static void iommu_resume(void)
  
                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
        }
 -
 -      for_each_active_iommu(iommu, drhd)
 -              kfree(iommu->iommu_state);
  }
  
  static struct syscore_ops iommu_syscore_ops = {
@@@ -3958,7 -3977,7 +3961,7 @@@ static void dmar_remove_one_dev_info(st
   * all DMA requests without PASID from the device are blocked. If the page
   * table has been set, clean up the data structures.
   */
static void device_block_translation(struct device *dev)
+ void device_block_translation(struct device *dev)
  {
        struct device_domain_info *info = dev_iommu_priv_get(dev);
        struct intel_iommu *iommu = info->iommu;
@@@ -4058,14 -4077,62 +4061,62 @@@ static struct iommu_domain *intel_iommu
        return NULL;
  }
  
+ static struct iommu_domain *
+ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
+                             struct iommu_domain *parent,
+                             const struct iommu_user_data *user_data)
+ {
+       struct device_domain_info *info = dev_iommu_priv_get(dev);
+       bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+       bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+       struct intel_iommu *iommu = info->iommu;
+       struct iommu_domain *domain;
+       /* Must be NESTING domain */
+       if (parent) {
+               if (!nested_supported(iommu) || flags)
+                       return ERR_PTR(-EOPNOTSUPP);
+               return intel_nested_domain_alloc(parent, user_data);
+       }
+       if (flags &
+           (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+               return ERR_PTR(-EOPNOTSUPP);
+       if (nested_parent && !nested_supported(iommu))
+               return ERR_PTR(-EOPNOTSUPP);
+       if (user_data || (dirty_tracking && !ssads_supported(iommu)))
+               return ERR_PTR(-EOPNOTSUPP);
+       /*
+        * domain_alloc_user op needs to fully initialize a domain before
+        * return, so uses iommu_domain_alloc() here for simple.
+        */
+       domain = iommu_domain_alloc(dev->bus);
+       if (!domain)
+               return ERR_PTR(-ENOMEM);
+       if (nested_parent)
+               to_dmar_domain(domain)->nested_parent = true;
+       if (dirty_tracking) {
+               if (to_dmar_domain(domain)->use_first_level) {
+                       iommu_domain_free(domain);
+                       return ERR_PTR(-EOPNOTSUPP);
+               }
+               domain->dirty_ops = &intel_dirty_ops;
+       }
+       return domain;
+ }
  static void intel_iommu_domain_free(struct iommu_domain *domain)
  {
        if (domain != &si_domain->domain && domain != &blocking_domain)
                domain_exit(to_dmar_domain(domain));
  }
  
static int prepare_domain_attach_device(struct iommu_domain *domain,
-                                       struct device *dev)
+ int prepare_domain_attach_device(struct iommu_domain *domain,
+                                struct device *dev)
  {
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        struct intel_iommu *iommu;
        if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
                return -EINVAL;
  
+       if (domain->dirty_ops && !ssads_supported(iommu))
+               return -EINVAL;
        /* check if this iommu agaw is sufficient for max mapped address */
        addr_width = agaw_to_width(iommu->agaw);
        if (addr_width > cap_mgaw(iommu->cap))
@@@ -4332,6 -4402,8 +4386,8 @@@ static bool intel_iommu_capable(struct 
                return dmar_platform_optin();
        case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
                return ecap_sc_support(info->iommu->ecap);
+       case IOMMU_CAP_DIRTY_TRACKING:
+               return ssads_supported(info->iommu);
        default:
                return false;
        }
@@@ -4729,6 -4801,9 +4785,9 @@@ static int intel_iommu_set_dev_pasid(st
        if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
                return -EOPNOTSUPP;
  
+       if (domain->dirty_ops)
+               return -EINVAL;
        if (context_copied(iommu, info->bus, info->devfn))
                return -EBUSY;
  
@@@ -4780,6 -4855,7 +4839,7 @@@ static void *intel_iommu_hw_info(struc
        if (!vtd)
                return ERR_PTR(-ENOMEM);
  
+       vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
        vtd->cap_reg = iommu->cap;
        vtd->ecap_reg = iommu->ecap;
        *length = sizeof(*vtd);
        return vtd;
  }
  
+ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+                                         bool enable)
+ {
+       struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+       struct device_domain_info *info;
+       int ret;
+       spin_lock(&dmar_domain->lock);
+       if (dmar_domain->dirty_tracking == enable)
+               goto out_unlock;
+       list_for_each_entry(info, &dmar_domain->devices, link) {
+               ret = intel_pasid_setup_dirty_tracking(info->iommu,
+                                                      info->domain, info->dev,
+                                                      IOMMU_NO_PASID, enable);
+               if (ret)
+                       goto err_unwind;
+       }
+       dmar_domain->dirty_tracking = enable;
+ out_unlock:
+       spin_unlock(&dmar_domain->lock);
+       return 0;
+ err_unwind:
+       list_for_each_entry(info, &dmar_domain->devices, link)
+               intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
+                                                info->dev, IOMMU_NO_PASID,
+                                                dmar_domain->dirty_tracking);
+       spin_unlock(&dmar_domain->lock);
+       return ret;
+ }
+ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+                                           unsigned long iova, size_t size,
+                                           unsigned long flags,
+                                           struct iommu_dirty_bitmap *dirty)
+ {
+       struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+       unsigned long end = iova + size - 1;
+       unsigned long pgsize;
+       /*
+        * IOMMUFD core calls into a dirty tracking disabled domain without an
+        * IOVA bitmap set in order to clean dirty bits in all PTEs that might
+        * have occurred when we stopped dirty tracking. This ensures that we
+        * never inherit dirtied bits from a previous cycle.
+        */
+       if (!dmar_domain->dirty_tracking && dirty->bitmap)
+               return -EINVAL;
+       do {
+               struct dma_pte *pte;
+               int lvl = 0;
+               pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
+                                    GFP_ATOMIC);
+               pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
+               if (!pte || !dma_pte_present(pte)) {
+                       iova += pgsize;
+                       continue;
+               }
+               if (dma_sl_pte_test_and_clear_dirty(pte, flags))
+                       iommu_dirty_bitmap_record(dirty, iova, pgsize);
+               iova += pgsize;
+       } while (iova < end);
+       return 0;
+ }
+ const struct iommu_dirty_ops intel_dirty_ops = {
+       .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+       .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
+ };
  const struct iommu_ops intel_iommu_ops = {
        .capable                = intel_iommu_capable,
        .hw_info                = intel_iommu_hw_info,
        .domain_alloc           = intel_iommu_domain_alloc,
+       .domain_alloc_user      = intel_iommu_domain_alloc_user,
        .probe_device           = intel_iommu_probe_device,
        .probe_finalize         = intel_iommu_probe_finalize,
        .release_device         = intel_iommu_release_device,
index 7dac94f62b4ec661af7030b475103ef4ac184fee,ba9be915eb844ca24ed8e312858993ec96d813f9..d796d0d9b114a4cf29bda9202636a05df091421d
@@@ -25,6 -25,7 +25,7 @@@
  
  #include <asm/cacheflush.h>
  #include <asm/iommu.h>
+ #include <uapi/linux/iommufd.h>
  
  /*
   * VT-d hardware uses 4KiB page size regardless of host page size.
@@@ -48,6 -49,9 +49,9 @@@
  #define DMA_FL_PTE_DIRTY      BIT_ULL(6)
  #define DMA_FL_PTE_XD         BIT_ULL(63)
  
+ #define DMA_SL_PTE_DIRTY_BIT  9
+ #define DMA_SL_PTE_DIRTY      BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
  #define ADDR_WIDTH_5LEVEL     (57)
  #define ADDR_WIDTH_4LEVEL     (48)
  
@@@ -539,6 -543,10 +543,10 @@@ enum 
  #define sm_supported(iommu)   (intel_iommu_sm && ecap_smts((iommu)->ecap))
  #define pasid_supported(iommu)        (sm_supported(iommu) &&                 \
                                 ecap_pasid((iommu)->ecap))
+ #define ssads_supported(iommu) (sm_supported(iommu) &&                 \
+                               ecap_slads((iommu)->ecap))
+ #define nested_supported(iommu)       (sm_supported(iommu) &&                 \
+                                ecap_nest((iommu)->ecap))
  
  struct pasid_entry;
  struct pasid_state_entry;
@@@ -592,20 -600,45 +600,45 @@@ struct dmar_domain 
                                         * otherwise, goes through the second
                                         * level.
                                         */
+       u8 dirty_tracking:1;            /* Dirty tracking is enabled */
+       u8 nested_parent:1;             /* Has other domains nested on it */
  
        spinlock_t lock;                /* Protect device tracking lists */
        struct list_head devices;       /* all devices' list */
        struct list_head dev_pasids;    /* all attached pasids */
  
-       struct dma_pte  *pgd;           /* virtual address */
-       int             gaw;            /* max guest address width */
-       /* adjusted guest address width, 0 is level 2 30-bit */
-       int             agaw;
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-       u64             max_addr;       /* maximum mapped address */
+       union {
+               /* DMA remapping domain */
+               struct {
+                       /* virtual address */
+                       struct dma_pte  *pgd;
+                       /* max guest address width */
+                       int             gaw;
+                       /*
+                        * adjusted guest address width:
+                        *   0: level 2 30-bit
+                        *   1: level 3 39-bit
+                        *   2: level 4 48-bit
+                        *   3: level 5 57-bit
+                        */
+                       int             agaw;
+                       /* maximum mapped address */
+                       u64             max_addr;
+               };
+               /* Nested user domain */
+               struct {
+                       /* parent page table which the user domain is nested on */
+                       struct dmar_domain *s2_domain;
+                       /* user page table pointer (in GPA) */
+                       unsigned long s1_pgtbl;
+                       /* page table attributes */
+                       struct iommu_hwpt_vtd_s1 s1_cfg;
+               };
+       };
  
        struct iommu_domain domain;     /* generic domain data structure for
                                           iommu core */
@@@ -681,7 -714,7 +714,7 @@@ struct intel_iommu 
        struct iopf_queue *iopf_queue;
        unsigned char iopfq_name[16];
        struct q_inval  *qi;            /* Queued invalidation info */
 -      u32 *iommu_state; /* Store iommu states between suspend and resume.*/
 +      u32 iommu_state[MAX_SR_DMAR_REGS]; /* Store iommu states between suspend and resume.*/
  
  #ifdef CONFIG_IRQ_REMAP
        struct ir_table *ir_table;      /* Interrupt remapping info */
@@@ -781,6 -814,16 +814,16 @@@ static inline bool dma_pte_present(stru
        return (pte->val & 3) != 0;
  }
  
+ static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
+                                                  unsigned long flags)
+ {
+       if (flags & IOMMU_DIRTY_NO_CLEAR)
+               return (pte->val & DMA_SL_PTE_DIRTY) != 0;
+       return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
+                                 (unsigned long *)&pte->val);
+ }
  static inline bool dma_pte_superpage(struct dma_pte *pte)
  {
        return (pte->val & DMA_PTE_LARGE_PAGE);
@@@ -836,12 -879,21 +879,21 @@@ int qi_submit_sync(struct intel_iommu *
   */
  #define QI_OPT_WAIT_DRAIN             BIT(0)
  
+ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+ void device_block_translation(struct device *dev);
+ int prepare_domain_attach_device(struct iommu_domain *domain,
+                                struct device *dev);
+ void domain_update_iommu_cap(struct dmar_domain *domain);
  int dmar_ir_support(void);
  
  void *alloc_pgtable_page(int node, gfp_t gfp);
  void free_pgtable_page(void *vaddr);
  void iommu_flush_write_buffer(struct intel_iommu *iommu);
  struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
+ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+                                              const struct iommu_user_data *user_data);
  
  #ifdef CONFIG_INTEL_IOMMU_SVM
  void intel_svm_check(struct intel_iommu *iommu);
index b6ac66c5008d970a664389eb9cf9eb9101cadd76,5cf2b491d15a01467cc82a5df624ffc494da8b20..fe09a8c8af95e8dedac6e08a4fba74379d1c4b5d
@@@ -24,8 -24,6 +24,8 @@@
  /* Device specification max LOAD size */
  #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
  
 +#define MAX_CHUNK_SIZE SZ_8M
 +
  static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
  {
        struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
@@@ -160,41 -158,6 +160,41 @@@ end
        return found ? buf : NULL;
  }
  
 +static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
 +{
 +      struct mlx5_vf_migration_file *migf = vhca_buf->migf;
 +
 +      if (vhca_buf->stop_copy_chunk_num) {
 +              bool is_header = vhca_buf->dma_dir == DMA_NONE;
 +              u8 chunk_num = vhca_buf->stop_copy_chunk_num;
 +              size_t next_required_umem_size = 0;
 +
 +              if (is_header)
 +                      migf->buf_header[chunk_num - 1] = vhca_buf;
 +              else
 +                      migf->buf[chunk_num - 1] = vhca_buf;
 +
 +              spin_lock_irq(&migf->list_lock);
 +              list_del_init(&vhca_buf->buf_elm);
 +              if (!is_header) {
 +                      next_required_umem_size =
 +                              migf->next_required_umem_size;
 +                      migf->next_required_umem_size = 0;
 +                      migf->num_ready_chunks--;
 +              }
 +              spin_unlock_irq(&migf->list_lock);
 +              if (next_required_umem_size)
 +                      mlx5vf_mig_file_set_save_work(migf, chunk_num,
 +                                                    next_required_umem_size);
 +              return;
 +      }
 +
 +      spin_lock_irq(&migf->list_lock);
 +      list_del_init(&vhca_buf->buf_elm);
 +      list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
 +      spin_unlock_irq(&migf->list_lock);
 +}
 +
  static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
                               char __user **buf, size_t *len, loff_t *pos)
  {
                copy_len -= page_len;
        }
  
 -      if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
 -              spin_lock_irq(&vhca_buf->migf->list_lock);
 -              list_del_init(&vhca_buf->buf_elm);
 -              list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
 -              spin_unlock_irq(&vhca_buf->migf->list_lock);
 -      }
 +      if (*pos >= vhca_buf->start_pos + vhca_buf->length)
 +              mlx5vf_buf_read_done(vhca_buf);
  
        return done;
  }
@@@ -337,75 -304,7 +337,75 @@@ static void mlx5vf_mark_err(struct mlx5
        wake_up_interruptible(&migf->poll_wait);
  }
  
 -static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
 +void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
 +                                 u8 chunk_num, size_t next_required_umem_size)
 +{
 +      migf->save_data[chunk_num - 1].next_required_umem_size =
 +                      next_required_umem_size;
 +      migf->save_data[chunk_num - 1].migf = migf;
 +      get_file(migf->filp);
 +      queue_work(migf->mvdev->cb_wq,
 +                 &migf->save_data[chunk_num - 1].work);
 +}
 +
 +static struct mlx5_vhca_data_buffer *
 +mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
 +                                u8 index, size_t required_length)
 +{
 +      struct mlx5_vhca_data_buffer *buf = migf->buf[index];
 +      u8 chunk_num;
 +
 +      WARN_ON(!buf);
 +      chunk_num = buf->stop_copy_chunk_num;
 +      buf->migf->buf[index] = NULL;
 +      /* Checking whether the pre-allocated buffer can fit */
 +      if (buf->allocated_length >= required_length)
 +              return buf;
 +
 +      mlx5vf_put_data_buffer(buf);
 +      buf = mlx5vf_get_data_buffer(buf->migf, required_length,
 +                                   DMA_FROM_DEVICE);
 +      if (IS_ERR(buf))
 +              return buf;
 +
 +      buf->stop_copy_chunk_num = chunk_num;
 +      return buf;
 +}
 +
 +static void mlx5vf_mig_file_save_work(struct work_struct *_work)
 +{
 +      struct mlx5vf_save_work_data *save_data = container_of(_work,
 +              struct mlx5vf_save_work_data, work);
 +      struct mlx5_vf_migration_file *migf = save_data->migf;
 +      struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
 +      struct mlx5_vhca_data_buffer *buf;
 +
 +      mutex_lock(&mvdev->state_mutex);
 +      if (migf->state == MLX5_MIGF_STATE_ERROR)
 +              goto end;
 +
 +      buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
 +                              save_data->chunk_num - 1,
 +                              save_data->next_required_umem_size);
 +      if (IS_ERR(buf))
 +              goto err;
 +
 +      if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
 +              goto err_save;
 +
 +      goto end;
 +
 +err_save:
 +      mlx5vf_put_data_buffer(buf);
 +err:
 +      mlx5vf_mark_err(migf);
 +end:
 +      mlx5vf_state_mutex_unlock(mvdev);
 +      fput(migf->filp);
 +}
 +
 +static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
 +                                     bool track)
  {
        size_t size = sizeof(struct mlx5_vf_migration_header) +
                sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
        to_buff = kmap_local_page(page);
        memcpy(to_buff, &header, sizeof(header));
        header_buf->length = sizeof(header);
 -      data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
 +      data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
        memcpy(to_buff + sizeof(header), &data, sizeof(data));
        header_buf->length += sizeof(data);
        kunmap_local(to_buff);
        spin_lock_irqsave(&migf->list_lock, flags);
        list_add_tail(&header_buf->buf_elm, &migf->buf_list);
        spin_unlock_irqrestore(&migf->list_lock, flags);
 -      migf->pre_copy_initial_bytes = size;
 +      if (track)
 +              migf->pre_copy_initial_bytes = size;
        return 0;
  err:
        mlx5vf_put_data_buffer(header_buf);
        return ret;
  }
  
 -static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
 -                               size_t state_size)
 +static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
 +                               struct mlx5_vf_migration_file *migf,
 +                               size_t state_size, u64 full_size,
 +                               bool track)
  {
        struct mlx5_vhca_data_buffer *buf;
        size_t inc_state_size;
 +      int num_chunks;
        int ret;
 +      int i;
  
 -      /* let's be ready for stop_copy size that might grow by 10 percents */
 -      if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
 -              inc_state_size = state_size;
 +      if (mvdev->chunk_mode) {
 +              size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
  
 -      buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
 -      if (IS_ERR(buf))
 -              return PTR_ERR(buf);
 +              /* from firmware perspective at least 'state_size' buffer should be set */
 +              inc_state_size = max(state_size, chunk_size);
 +      } else {
 +              if (track) {
 +                      /* let's be ready for stop_copy size that might grow by 10 percents */
 +                      if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
 +                              inc_state_size = state_size;
 +              } else {
 +                      inc_state_size = state_size;
 +              }
 +      }
  
 -      migf->buf = buf;
 -      buf = mlx5vf_get_data_buffer(migf,
 -                      sizeof(struct mlx5_vf_migration_header), DMA_NONE);
 -      if (IS_ERR(buf)) {
 -              ret = PTR_ERR(buf);
 -              goto err;
 +      /* let's not overflow the device specification max SAVE size */
 +      inc_state_size = min_t(size_t, inc_state_size,
 +              (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
 +
 +      num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
 +      for (i = 0; i < num_chunks; i++) {
 +              buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
 +              if (IS_ERR(buf)) {
 +                      ret = PTR_ERR(buf);
 +                      goto err;
 +              }
 +
 +              migf->buf[i] = buf;
 +              buf = mlx5vf_get_data_buffer(migf,
 +                              sizeof(struct mlx5_vf_migration_header), DMA_NONE);
 +              if (IS_ERR(buf)) {
 +                      ret = PTR_ERR(buf);
 +                      goto err;
 +              }
 +              migf->buf_header[i] = buf;
 +              if (mvdev->chunk_mode) {
 +                      migf->buf[i]->stop_copy_chunk_num = i + 1;
 +                      migf->buf_header[i]->stop_copy_chunk_num = i + 1;
 +                      INIT_WORK(&migf->save_data[i].work,
 +                                mlx5vf_mig_file_save_work);
 +                      migf->save_data[i].chunk_num = i + 1;
 +              }
        }
  
 -      migf->buf_header = buf;
 -      ret = mlx5vf_add_stop_copy_header(migf);
 +      ret = mlx5vf_add_stop_copy_header(migf, track);
        if (ret)
 -              goto err_header;
 +              goto err;
        return 0;
  
 -err_header:
 -      mlx5vf_put_data_buffer(migf->buf_header);
 -      migf->buf_header = NULL;
  err:
 -      mlx5vf_put_data_buffer(migf->buf);
 -      migf->buf = NULL;
 +      for (i = 0; i < num_chunks; i++) {
 +              if (migf->buf[i]) {
 +                      mlx5vf_put_data_buffer(migf->buf[i]);
 +                      migf->buf[i] = NULL;
 +              }
 +              if (migf->buf_header[i]) {
 +                      mlx5vf_put_data_buffer(migf->buf_header[i]);
 +                      migf->buf_header[i] = NULL;
 +              }
 +      }
 +
        return ret;
  }
  
@@@ -567,7 -428,7 +567,7 @@@ static long mlx5vf_precopy_ioctl(struc
                 * As so, the other code below is safe with the proper locks.
                 */
                ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
 -                                                          MLX5VF_QUERY_INC);
 +                                                          NULL, MLX5VF_QUERY_INC);
                if (ret)
                        goto err_state_unlock;
        }
@@@ -644,15 -505,21 +644,15 @@@ static int mlx5vf_pci_save_device_inc_d
        if (migf->state == MLX5_MIGF_STATE_ERROR)
                return -ENODEV;
  
 -      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
 +      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
                                MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
        if (ret)
                goto err;
  
 -      /* Checking whether we have a matching pre-allocated buffer that can fit */
 -      if (migf->buf && migf->buf->allocated_length >= length) {
 -              buf = migf->buf;
 -              migf->buf = NULL;
 -      } else {
 -              buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
 -              if (IS_ERR(buf)) {
 -                      ret = PTR_ERR(buf);
 -                      goto err;
 -              }
 +      buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
 +      if (IS_ERR(buf)) {
 +              ret = PTR_ERR(buf);
 +              goto err;
        }
  
        ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
@@@ -674,7 -541,6 +674,7 @@@ mlx5vf_pci_save_device_data(struct mlx5
        struct mlx5_vf_migration_file *migf;
        struct mlx5_vhca_data_buffer *buf;
        size_t length;
 +      u64 full_size;
        int ret;
  
        migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
        INIT_LIST_HEAD(&migf->buf_list);
        INIT_LIST_HEAD(&migf->avail_list);
        spin_lock_init(&migf->list_lock);
 -      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
 +      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
 +      if (ret)
 +              goto out_pd;
 +
 +      ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
        if (ret)
                goto out_pd;
  
        if (track) {
 -              ret = mlx5vf_prep_stop_copy(migf, length);
 -              if (ret)
 +              /* leave the allocated buffer ready for the stop-copy phase */
 +              buf = mlx5vf_alloc_data_buffer(migf,
 +                      migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
 +              if (IS_ERR(buf)) {
 +                      ret = PTR_ERR(buf);
                        goto out_pd;
 -      }
 -
 -      buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
 -      if (IS_ERR(buf)) {
 -              ret = PTR_ERR(buf);
 -              goto out_pd;
 +              }
 +      } else {
 +              buf = migf->buf[0];
 +              migf->buf[0] = NULL;
        }
  
        ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
@@@ -959,8 -820,8 +959,8 @@@ static ssize_t mlx5vf_resume_write(stru
                                   size_t len, loff_t *pos)
  {
        struct mlx5_vf_migration_file *migf = filp->private_data;
 -      struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
 -      struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
 +      struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
 +      struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
        loff_t requested_length;
        bool has_work = false;
        ssize_t done = 0;
                        if (vhca_buf_header->allocated_length < migf->record_size) {
                                mlx5vf_free_data_buffer(vhca_buf_header);
  
 -                              migf->buf_header = mlx5vf_alloc_data_buffer(migf,
 +                              migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
                                                migf->record_size, DMA_NONE);
 -                              if (IS_ERR(migf->buf_header)) {
 -                                      ret = PTR_ERR(migf->buf_header);
 -                                      migf->buf_header = NULL;
 +                              if (IS_ERR(migf->buf_header[0])) {
 +                                      ret = PTR_ERR(migf->buf_header[0]);
 +                                      migf->buf_header[0] = NULL;
                                        goto out_unlock;
                                }
  
 -                              vhca_buf_header = migf->buf_header;
 +                              vhca_buf_header = migf->buf_header[0];
                        }
  
                        vhca_buf_header->start_pos = migf->max_pos;
                        if (vhca_buf->allocated_length < size) {
                                mlx5vf_free_data_buffer(vhca_buf);
  
 -                              migf->buf = mlx5vf_alloc_data_buffer(migf,
 +                              migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
                                                        size, DMA_TO_DEVICE);
 -                              if (IS_ERR(migf->buf)) {
 -                                      ret = PTR_ERR(migf->buf);
 -                                      migf->buf = NULL;
 +                              if (IS_ERR(migf->buf[0])) {
 +                                      ret = PTR_ERR(migf->buf[0]);
 +                                      migf->buf[0] = NULL;
                                        goto out_unlock;
                                }
  
 -                              vhca_buf = migf->buf;
 +                              vhca_buf = migf->buf[0];
                        }
  
                        vhca_buf->start_pos = migf->max_pos;
@@@ -1113,7 -974,7 +1113,7 @@@ mlx5vf_pci_resume_device_data(struct ml
                goto out_pd;
        }
  
 -      migf->buf = buf;
 +      migf->buf[0] = buf;
        if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
                buf = mlx5vf_alloc_data_buffer(migf,
                        sizeof(struct mlx5_vf_migration_header), DMA_NONE);
                        goto out_buf;
                }
  
 -              migf->buf_header = buf;
 +              migf->buf_header[0] = buf;
                migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
        } else {
                /* Initial state will be to read the image */
        spin_lock_init(&migf->list_lock);
        return migf;
  out_buf:
 -      mlx5vf_free_data_buffer(migf->buf);
 +      mlx5vf_free_data_buffer(migf->buf[0]);
  out_pd:
        mlx5vf_cmd_dealloc_pd(migf);
  out_free:
@@@ -1158,7 -1019,6 +1158,7 @@@ void mlx5vf_disable_fds(struct mlx5vf_p
                mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
                cancel_work_sync(&mvdev->saving_migf->async_data.work);
                mlx5vf_disable_fd(mvdev->saving_migf);
 +              wake_up_interruptible(&mvdev->saving_migf->poll_wait);
                mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
                fput(mvdev->saving_migf->filp);
                mvdev->saving_migf = NULL;
@@@ -1240,7 -1100,7 +1240,7 @@@ mlx5vf_pci_step_device_state_locked(str
                if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
                        ret = mlx5vf_cmd_load_vhca_state(mvdev,
                                                         mvdev->resuming_migf,
 -                                                       mvdev->resuming_migf->buf);
 +                                                       mvdev->resuming_migf->buf[0]);
                        if (ret)
                                return ERR_PTR(ret);
                }
@@@ -1334,14 -1194,13 +1334,14 @@@ static int mlx5vf_pci_get_data_size(str
        struct mlx5vf_pci_core_device *mvdev = container_of(
                vdev, struct mlx5vf_pci_core_device, core_device.vdev);
        size_t state_size;
 +      u64 total_size;
        int ret;
  
        mutex_lock(&mvdev->state_mutex);
 -      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
 -                                                  &state_size, 0);
 +      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
 +                                                  &total_size, 0);
        if (!ret)
 -              *stop_copy_length = state_size;
 +              *stop_copy_length = total_size;
        mlx5vf_state_mutex_unlock(mvdev);
        return ret;
  }
@@@ -1517,6 -1376,7 +1517,7 @@@ static struct pci_driver mlx5vf_pci_dri
  
  module_pci_driver(mlx5vf_pci_driver);
  
+ MODULE_IMPORT_NS(IOMMUFD);
  MODULE_LICENSE("GPL");
  MODULE_AUTHOR("Max Gurtovoy <[email protected]>");
  MODULE_AUTHOR("Yishai Hadas <[email protected]>");
index 6eceef7b028aae9b8b7a8cb49614e88525f4bade,fff368a8183b25455a534a7a9dffe7db4b0070eb..fec9b167c7b9ac98ae24dddd9265e30d95942e7d
@@@ -3,8 -3,9 +3,9 @@@
  
  config PDS_VFIO_PCI
        tristate "VFIO support for PDS PCI devices"
 -      depends on PDS_CORE
 +      depends on PDS_CORE && PCI_IOV
        select VFIO_PCI_CORE
+       select IOMMUFD_DRIVER
        help
          This provides generic PCI support for PDS devices using the VFIO
          framework.
diff --combined drivers/vfio/vfio_main.c
index e31e1952d7b8f1a49a27f005e05653e3b6f23976,a96d97da367daa87a9e5920f36216d64f2c1afc0..8d4995ada74a01848ce8e7becf61120cc10ec33a
@@@ -946,11 -946,6 +946,11 @@@ void vfio_combine_iova_ranges(struct rb
                unsigned long last;
  
                comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
 +
 +              /* Empty list */
 +              if (WARN_ON_ONCE(!comb_start))
 +                      return;
 +
                curr = comb_start;
                while (curr) {
                        last = curr->last;
                        prev = curr;
                        curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
                }
 +
 +              /* Empty list or no nodes to combine */
 +              if (WARN_ON_ONCE(min_gap == ULONG_MAX))
 +                      break;
 +
                comb_start->last = comb_end->last;
                interval_tree_remove(comb_end, root);
                cur_nodes--;
@@@ -1703,6 -1693,7 +1703,7 @@@ static void __exit vfio_cleanup(void
  module_init(vfio_init);
  module_exit(vfio_cleanup);
  
+ MODULE_IMPORT_NS(IOMMUFD);
  MODULE_VERSION(DRIVER_VERSION);
  MODULE_LICENSE("GPL v2");
  MODULE_AUTHOR(DRIVER_AUTHOR);