Git Repo - linux.git/commitdiff
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg...
author Linus Torvalds <[email protected]>
Thu, 2 Nov 2023 02:44:56 +0000 (16:44 -1000)
committer Linus Torvalds <[email protected]>
Thu, 2 Nov 2023 02:44:56 +0000 (16:44 -1000)
Pull iommufd updates from Jason Gunthorpe:
 "This brings three new iommufd capabilities:

   - Dirty tracking for DMA.

     AMD/ARM/Intel IOMMUs can now record, in the IOPTEs of the IO page
     table, whether a DMA has written to a page. This can be used to
     generate a record of what memory is being dirtied by DMA activity
     during a VM migration process. A VMM like QEMU will combine the
     IOMMU dirty bits with the CPU's dirty log to determine what
     memory to transfer.

     VFIO already has a DMA dirty tracking framework that requires PCI
     devices to implement tracking HW internally. The iommufd version
     provides an alternative that the VMM can select, if available. The
     two are designed to have very similar APIs.

   - Userspace-controlled attributes for hardware page tables
     (HWPT/iommu_domain). There are currently a few generic attributes
     for HWPTs (support for dirty tracking, and being the parent of a
     nest). This is an entry point for the userspace IOMMU driver to
     control the HW in detail.

   - Nested translation support for HWPTs. This is a 2D translation
     scheme, similar to the CPU's, where a DMA goes through a first
     stage to determine an intermediate address, which is then
     translated through a second stage to a physical address.

     As with CPU translation, the first stage table exists in
     VM-controlled memory while the second stage is in the kernel and
     matches the VM's guest-to-physical map.

     As every IOMMU has a unique set of parameters describing the S1
     IO page table, the userspace IOMMU driver has to marshal that
     information into the correct format.

     This is 1/3 of the feature: it allows creating the nested
     translation and binding it to VFIO devices. However, the APIs to
     support IOTLB and ATC invalidation of the stage 1 IO page table,
     and to forward IO faults, are still in progress.

  The series includes AMD and Intel support for dirty tracking, and
  Intel support for nested translation.

  Along the way are a number of internal items:

   - New iommu core items: ops->domain_alloc_user(),
     ops->set_dirty_tracking(), ops->read_and_clear_dirty(),
     IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user()

   - Use-after-free (UAF) fix in iopt_area_split()

   - Spelling fixes and some test suite improvements"
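
The dirty tracking described above is driven from userspace through two
new iommufd ioctls, IOMMU_HWPT_SET_DIRTY_TRACKING and
IOMMU_HWPT_GET_DIRTY_BITMAP. Below is a minimal, hypothetical sketch of
how a VMM might use them on a HWPT that was allocated with the
IOMMU_HWPT_ALLOC_DIRTY_TRACKING flag; the struct layouts are assumed
from the uapi added by this merge and should be checked against
include/uapi/linux/iommufd.h rather than taken from this sketch.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/*
 * Enable dirty tracking on hwpt_id, then (later) harvest the dirty
 * bitmap for one IOVA range.  'bitmap' must hold one bit per page_size
 * unit of 'length'.  Returns 0 on success, -1 on the first failing
 * ioctl.  Illustrative only.
 */
static int pull_dirty_bitmap(int iommufd, uint32_t hwpt_id,
                             uint64_t iova, uint64_t length,
                             uint64_t page_size, void *bitmap)
{
        struct iommu_hwpt_set_dirty_tracking set = {
                .size = sizeof(set),
                .flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE,
                .hwpt_id = hwpt_id,
        };
        struct iommu_hwpt_get_dirty_bitmap get = {
                .size = sizeof(get),
                .hwpt_id = hwpt_id,
                .iova = iova,
                .length = length,
                .page_size = page_size,
                .data = (uintptr_t)bitmap,
        };

        if (ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set))
                return -1;
        /* ... let the guest run, then pull what DMA dirtied ... */
        return ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get) ? -1 : 0;
}

A VMM would repeat the GET step on each migration iteration and OR the
result into the dirty bitmap it already maintains from the CPU dirty log.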

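The nested translation and the user-controlled HWPT attributes meet in
IOMMU_HWPT_ALLOC: a nest-parent (stage 2) HWPT is allocated first, then
a stage 1 HWPT is nested on top of it with driver-specific data. The
sketch below assumes the Intel VT-d data type (IOMMU_HWPT_DATA_VTD_S1
and struct iommu_hwpt_vtd_s1) added by this merge; it is illustrative
only, and the exact field contents should be taken from the uapi
header. IOTLB/ATC invalidation and IO fault forwarding are not shown
since, as noted above, those APIs are still in progress.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/*
 * Allocate a nest-parent HWPT over an IOAS (the S2, GPA->HPA), then
 * nest a stage 1 HWPT on it whose page table lives in guest memory at
 * s1_pgtbl_gpa.  Illustrative only; error unwinding is omitted.
 */
static int alloc_vtd_nested(int iommufd, uint32_t dev_id, uint32_t ioas_id,
                            uint64_t s1_pgtbl_gpa, uint32_t addr_width,
                            uint32_t *out_nested_id)
{
        struct iommu_hwpt_vtd_s1 s1 = {
                .pgtbl_addr = s1_pgtbl_gpa,     /* guest-physical address */
                .addr_width = addr_width,       /* e.g. 48 or 57 */
        };
        struct iommu_hwpt_alloc parent = {
                .size = sizeof(parent),
                .flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
                .dev_id = dev_id,
                .pt_id = ioas_id,
        };
        struct iommu_hwpt_alloc nested = {
                .size = sizeof(nested),
                .dev_id = dev_id,
                .data_type = IOMMU_HWPT_DATA_VTD_S1,
                .data_len = sizeof(s1),
                .data_uptr = (uintptr_t)&s1,
        };

        if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &parent))
                return -1;

        nested.pt_id = parent.out_hwpt_id;      /* nest on the S2 parent */
        if (ioctl(iommufd, IOMMU_HWPT_ALLOC, &nested))
                return -1;

        *out_nested_id = nested.out_hwpt_id;
        return 0;
}
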
* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
  iommufd: Organize the mock domain alloc functions closer to Joerg's tree
  iommufd/selftest: Fix page-size check in iommufd_test_dirty()
  iommufd: Add iopt_area_alloc()
  iommufd: Fix missing update of domains_itree after splitting iopt_area
  iommu/vt-d: Disallow read-only mappings to nest parent domain
  iommu/vt-d: Add nested domain allocation
  iommu/vt-d: Set the nested domain to a device
  iommu/vt-d: Make domain attach helpers to be extern
  iommu/vt-d: Add helper to setup pasid nested translation
  iommu/vt-d: Add helper for nested domain allocation
  iommu/vt-d: Extend dmar_domain to support nested domain
  iommufd: Add data structure for Intel VT-d stage-1 domain allocation
  iommu/vt-d: Enhance capability check for nested parent domain allocation
  iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
  iommufd/selftest: Add nested domain allocation for mock domain
  iommu: Add iommu_copy_struct_from_user helper
  iommufd: Add a nested HW pagetable object
  iommu: Pass in parent domain with user_data to domain_alloc_user op
  iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
  iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
  ...

drivers/iommu/Kconfig
drivers/iommu/intel/Kconfig
drivers/iommu/intel/iommu.c
drivers/iommu/intel/iommu.h
drivers/vfio/pci/mlx5/main.c
drivers/vfio/pci/pds/Kconfig
drivers/vfio/vfio_main.c

diff --combined drivers/iommu/Kconfig
index 7f04491ca5f01fddd258e93faf3c27db69507457,5cc869db1b79fc3c7d6034cde5ba724970e55f2f..ee9e2a2edbf563efa22ad2902ac9d5d23077c073
@@@ -7,6 -7,10 +7,10 @@@ config IOMMU_IOV
  config IOMMU_API
        bool
  
+ config IOMMUFD_DRIVER
+       bool
+       default n
  menuconfig IOMMU_SUPPORT
        bool "IOMMU Hardware Support"
        depends on MMU
@@@ -91,7 -95,7 +95,7 @@@ config IOMMU_DEBUGF
  choice
        prompt "IOMMU default domain type"
        depends on IOMMU_API
 -      default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64
 +      default IOMMU_DEFAULT_DMA_LAZY if X86
        default IOMMU_DEFAULT_DMA_STRICT
        help
          Choose the type of IOMMU domain used to manage DMA API usage by
@@@ -146,7 -150,7 +150,7 @@@ config OF_IOMM
  
  # IOMMU-agnostic DMA-mapping layer
  config IOMMU_DMA
 -      def_bool ARM64 || IA64 || X86
 +      def_bool ARM64 || X86
        select DMA_OPS
        select IOMMU_API
        select IOMMU_IOVA
index 119d2c57a48ed5203af0e28f1330a0b86ac071aa,f5348b80652b65bdc043c2a01168789c65a2e626..012cd2541a68a62b8360591f2c680fe14104eec1
@@@ -11,10 -11,11 +11,11 @@@ config DMAR_DEBU
  
  config INTEL_IOMMU
        bool "Support for Intel IOMMU using DMA Remapping Devices"
 -      depends on PCI_MSI && ACPI && (X86 || IA64)
 +      depends on PCI_MSI && ACPI && X86
        select DMA_OPS
        select IOMMU_API
        select IOMMU_IOVA
+       select IOMMUFD_DRIVER if IOMMUFD
        select NEED_DMA_MAP_STATE
        select DMAR_TABLE
        select SWIOTLB
index 3685ba90ec88e81baac849f1693f507e005f4a21,a2c429855cc08393b4ddfc31819884a4a9e2b9d4..d1037280abf7a2bc4fd51d5e6de5ce0932c66424
@@@ -282,7 -282,6 +282,6 @@@ static LIST_HEAD(dmar_satc_units)
  #define for_each_rmrr_units(rmrr) \
        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
  
- static void device_block_translation(struct device *dev);
  static void intel_iommu_domain_free(struct iommu_domain *domain);
  
  int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
@@@ -300,6 -299,7 +299,7 @@@ static int iommu_skip_te_disable
  #define IDENTMAP_AZALIA               4
  
  const struct iommu_ops intel_iommu_ops;
+ const struct iommu_dirty_ops intel_dirty_ops;
  
  static bool translation_pre_enabled(struct intel_iommu *iommu)
  {
@@@ -560,7 -560,7 +560,7 @@@ static unsigned long domain_super_pgsiz
  }
  
  /* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
+ void domain_update_iommu_cap(struct dmar_domain *domain)
  {
        domain_update_iommu_coherency(domain);
        domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
@@@ -1778,8 -1778,7 +1778,7 @@@ static struct dmar_domain *alloc_domain
        return domain;
  }
  
- static int domain_attach_iommu(struct dmar_domain *domain,
-                              struct intel_iommu *iommu)
+ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
  {
        struct iommu_domain_info *info, *curr;
        unsigned long ndomains;
@@@ -1828,8 -1827,7 +1827,7 @@@ err_unlock
        return ret;
  }
  
- static void domain_detach_iommu(struct dmar_domain *domain,
-                               struct intel_iommu *iommu)
+ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
  {
        struct iommu_domain_info *info;
  
@@@ -2196,6 -2194,11 +2194,11 @@@ __domain_mapping(struct dmar_domain *do
        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;
  
+       if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
+               pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+               return -EINVAL;
+       }
        attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
        attr |= DMA_FL_PTE_PRESENT;
        if (domain->use_first_level) {
@@@ -2998,6 -3001,13 +3001,6 @@@ static int iommu_suspend(void
        struct intel_iommu *iommu = NULL;
        unsigned long flag;
  
 -      for_each_active_iommu(iommu, drhd) {
 -              iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
 -                                           GFP_KERNEL);
 -              if (!iommu->iommu_state)
 -                      goto nomem;
 -      }
 -
        iommu_flush_all();
  
        for_each_active_iommu(iommu, drhd) {
                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
        }
        return 0;
 -
 -nomem:
 -      for_each_active_iommu(iommu, drhd)
 -              kfree(iommu->iommu_state);
 -
 -      return -ENOMEM;
  }
  
  static void iommu_resume(void)
  
                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
        }
 -
 -      for_each_active_iommu(iommu, drhd)
 -              kfree(iommu->iommu_state);
  }
  
  static struct syscore_ops iommu_syscore_ops = {
@@@ -3958,7 -3977,7 +3961,7 @@@ static void dmar_remove_one_dev_info(st
   * all DMA requests without PASID from the device are blocked. If the page
   * table has been set, clean up the data structures.
   */
static void device_block_translation(struct device *dev)
+ void device_block_translation(struct device *dev)
  {
        struct device_domain_info *info = dev_iommu_priv_get(dev);
        struct intel_iommu *iommu = info->iommu;
@@@ -4058,14 -4077,62 +4061,62 @@@ static struct iommu_domain *intel_iommu
        return NULL;
  }
  
+ static struct iommu_domain *
+ intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
+                             struct iommu_domain *parent,
+                             const struct iommu_user_data *user_data)
+ {
+       struct device_domain_info *info = dev_iommu_priv_get(dev);
+       bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+       bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+       struct intel_iommu *iommu = info->iommu;
+       struct iommu_domain *domain;
+       /* Must be NESTING domain */
+       if (parent) {
+               if (!nested_supported(iommu) || flags)
+                       return ERR_PTR(-EOPNOTSUPP);
+               return intel_nested_domain_alloc(parent, user_data);
+       }
+       if (flags &
+           (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+               return ERR_PTR(-EOPNOTSUPP);
+       if (nested_parent && !nested_supported(iommu))
+               return ERR_PTR(-EOPNOTSUPP);
+       if (user_data || (dirty_tracking && !ssads_supported(iommu)))
+               return ERR_PTR(-EOPNOTSUPP);
+       /*
+        * domain_alloc_user op needs to fully initialize a domain before
+        * return, so uses iommu_domain_alloc() here for simple.
+        */
+       domain = iommu_domain_alloc(dev->bus);
+       if (!domain)
+               return ERR_PTR(-ENOMEM);
+       if (nested_parent)
+               to_dmar_domain(domain)->nested_parent = true;
+       if (dirty_tracking) {
+               if (to_dmar_domain(domain)->use_first_level) {
+                       iommu_domain_free(domain);
+                       return ERR_PTR(-EOPNOTSUPP);
+               }
+               domain->dirty_ops = &intel_dirty_ops;
+       }
+       return domain;
+ }
  static void intel_iommu_domain_free(struct iommu_domain *domain)
  {
        if (domain != &si_domain->domain && domain != &blocking_domain)
                domain_exit(to_dmar_domain(domain));
  }
  
static int prepare_domain_attach_device(struct iommu_domain *domain,
-                                       struct device *dev)
+ int prepare_domain_attach_device(struct iommu_domain *domain,
+                                struct device *dev)
  {
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        struct intel_iommu *iommu;
        if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
                return -EINVAL;
  
+       if (domain->dirty_ops && !ssads_supported(iommu))
+               return -EINVAL;
        /* check if this iommu agaw is sufficient for max mapped address */
        addr_width = agaw_to_width(iommu->agaw);
        if (addr_width > cap_mgaw(iommu->cap))
@@@ -4332,6 -4402,8 +4386,8 @@@ static bool intel_iommu_capable(struct 
                return dmar_platform_optin();
        case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
                return ecap_sc_support(info->iommu->ecap);
+       case IOMMU_CAP_DIRTY_TRACKING:
+               return ssads_supported(info->iommu);
        default:
                return false;
        }
@@@ -4729,6 -4801,9 +4785,9 @@@ static int intel_iommu_set_dev_pasid(st
        if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
                return -EOPNOTSUPP;
  
+       if (domain->dirty_ops)
+               return -EINVAL;
        if (context_copied(iommu, info->bus, info->devfn))
                return -EBUSY;
  
@@@ -4780,6 -4855,7 +4839,7 @@@ static void *intel_iommu_hw_info(struc
        if (!vtd)
                return ERR_PTR(-ENOMEM);
  
+       vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
        vtd->cap_reg = iommu->cap;
        vtd->ecap_reg = iommu->ecap;
        *length = sizeof(*vtd);
        return vtd;
  }
  
+ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+                                         bool enable)
+ {
+       struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+       struct device_domain_info *info;
+       int ret;
+       spin_lock(&dmar_domain->lock);
+       if (dmar_domain->dirty_tracking == enable)
+               goto out_unlock;
+       list_for_each_entry(info, &dmar_domain->devices, link) {
+               ret = intel_pasid_setup_dirty_tracking(info->iommu,
+                                                      info->domain, info->dev,
+                                                      IOMMU_NO_PASID, enable);
+               if (ret)
+                       goto err_unwind;
+       }
+       dmar_domain->dirty_tracking = enable;
+ out_unlock:
+       spin_unlock(&dmar_domain->lock);
+       return 0;
+ err_unwind:
+       list_for_each_entry(info, &dmar_domain->devices, link)
+               intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
+                                                info->dev, IOMMU_NO_PASID,
+                                                dmar_domain->dirty_tracking);
+       spin_unlock(&dmar_domain->lock);
+       return ret;
+ }
+ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+                                           unsigned long iova, size_t size,
+                                           unsigned long flags,
+                                           struct iommu_dirty_bitmap *dirty)
+ {
+       struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+       unsigned long end = iova + size - 1;
+       unsigned long pgsize;
+       /*
+        * IOMMUFD core calls into a dirty tracking disabled domain without an
+        * IOVA bitmap set in order to clean dirty bits in all PTEs that might
+        * have occurred when we stopped dirty tracking. This ensures that we
+        * never inherit dirtied bits from a previous cycle.
+        */
+       if (!dmar_domain->dirty_tracking && dirty->bitmap)
+               return -EINVAL;
+       do {
+               struct dma_pte *pte;
+               int lvl = 0;
+               pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
+                                    GFP_ATOMIC);
+               pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
+               if (!pte || !dma_pte_present(pte)) {
+                       iova += pgsize;
+                       continue;
+               }
+               if (dma_sl_pte_test_and_clear_dirty(pte, flags))
+                       iommu_dirty_bitmap_record(dirty, iova, pgsize);
+               iova += pgsize;
+       } while (iova < end);
+       return 0;
+ }
+ const struct iommu_dirty_ops intel_dirty_ops = {
+       .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+       .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
+ };
  const struct iommu_ops intel_iommu_ops = {
        .capable                = intel_iommu_capable,
        .hw_info                = intel_iommu_hw_info,
        .domain_alloc           = intel_iommu_domain_alloc,
+       .domain_alloc_user      = intel_iommu_domain_alloc_user,
        .probe_device           = intel_iommu_probe_device,
        .probe_finalize         = intel_iommu_probe_finalize,
        .release_device         = intel_iommu_release_device,
index 7dac94f62b4ec661af7030b475103ef4ac184fee,ba9be915eb844ca24ed8e312858993ec96d813f9..d796d0d9b114a4cf29bda9202636a05df091421d
@@@ -25,6 -25,7 +25,7 @@@
  
  #include <asm/cacheflush.h>
  #include <asm/iommu.h>
+ #include <uapi/linux/iommufd.h>
  
  /*
   * VT-d hardware uses 4KiB page size regardless of host page size.
@@@ -48,6 -49,9 +49,9 @@@
  #define DMA_FL_PTE_DIRTY      BIT_ULL(6)
  #define DMA_FL_PTE_XD         BIT_ULL(63)
  
+ #define DMA_SL_PTE_DIRTY_BIT  9
+ #define DMA_SL_PTE_DIRTY      BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
  #define ADDR_WIDTH_5LEVEL     (57)
  #define ADDR_WIDTH_4LEVEL     (48)
  
@@@ -539,6 -543,10 +543,10 @@@ enum 
  #define sm_supported(iommu)   (intel_iommu_sm && ecap_smts((iommu)->ecap))
  #define pasid_supported(iommu)        (sm_supported(iommu) &&                 \
                                 ecap_pasid((iommu)->ecap))
+ #define ssads_supported(iommu) (sm_supported(iommu) &&                 \
+                               ecap_slads((iommu)->ecap))
+ #define nested_supported(iommu)       (sm_supported(iommu) &&                 \
+                                ecap_nest((iommu)->ecap))
  
  struct pasid_entry;
  struct pasid_state_entry;
@@@ -592,20 -600,45 +600,45 @@@ struct dmar_domain 
                                         * otherwise, goes through the second
                                         * level.
                                         */
+       u8 dirty_tracking:1;            /* Dirty tracking is enabled */
+       u8 nested_parent:1;             /* Has other domains nested on it */
  
        spinlock_t lock;                /* Protect device tracking lists */
        struct list_head devices;       /* all devices' list */
        struct list_head dev_pasids;    /* all attached pasids */
  
-       struct dma_pte  *pgd;           /* virtual address */
-       int             gaw;            /* max guest address width */
-       /* adjusted guest address width, 0 is level 2 30-bit */
-       int             agaw;
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-       u64             max_addr;       /* maximum mapped address */
+       union {
+               /* DMA remapping domain */
+               struct {
+                       /* virtual address */
+                       struct dma_pte  *pgd;
+                       /* max guest address width */
+                       int             gaw;
+                       /*
+                        * adjusted guest address width:
+                        *   0: level 2 30-bit
+                        *   1: level 3 39-bit
+                        *   2: level 4 48-bit
+                        *   3: level 5 57-bit
+                        */
+                       int             agaw;
+                       /* maximum mapped address */
+                       u64             max_addr;
+               };
+               /* Nested user domain */
+               struct {
+                       /* parent page table which the user domain is nested on */
+                       struct dmar_domain *s2_domain;
+                       /* user page table pointer (in GPA) */
+                       unsigned long s1_pgtbl;
+                       /* page table attributes */
+                       struct iommu_hwpt_vtd_s1 s1_cfg;
+               };
+       };
  
        struct iommu_domain domain;     /* generic domain data structure for
                                           iommu core */
@@@ -681,7 -714,7 +714,7 @@@ struct intel_iommu 
        struct iopf_queue *iopf_queue;
        unsigned char iopfq_name[16];
        struct q_inval  *qi;            /* Queued invalidation info */
 -      u32 *iommu_state; /* Store iommu states between suspend and resume.*/
 +      u32 iommu_state[MAX_SR_DMAR_REGS]; /* Store iommu states between suspend and resume.*/
  
  #ifdef CONFIG_IRQ_REMAP
        struct ir_table *ir_table;      /* Interrupt remapping info */
@@@ -781,6 -814,16 +814,16 @@@ static inline bool dma_pte_present(stru
        return (pte->val & 3) != 0;
  }
  
+ static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
+                                                  unsigned long flags)
+ {
+       if (flags & IOMMU_DIRTY_NO_CLEAR)
+               return (pte->val & DMA_SL_PTE_DIRTY) != 0;
+       return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
+                                 (unsigned long *)&pte->val);
+ }
  static inline bool dma_pte_superpage(struct dma_pte *pte)
  {
        return (pte->val & DMA_PTE_LARGE_PAGE);
@@@ -836,12 -879,21 +879,21 @@@ int qi_submit_sync(struct intel_iommu *
   */
  #define QI_OPT_WAIT_DRAIN             BIT(0)
  
+ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+ void device_block_translation(struct device *dev);
+ int prepare_domain_attach_device(struct iommu_domain *domain,
+                                struct device *dev);
+ void domain_update_iommu_cap(struct dmar_domain *domain);
  int dmar_ir_support(void);
  
  void *alloc_pgtable_page(int node, gfp_t gfp);
  void free_pgtable_page(void *vaddr);
  void iommu_flush_write_buffer(struct intel_iommu *iommu);
  struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
+ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+                                              const struct iommu_user_data *user_data);
  
  #ifdef CONFIG_INTEL_IOMMU_SVM
  void intel_svm_check(struct intel_iommu *iommu);
index b6ac66c5008d970a664389eb9cf9eb9101cadd76,5cf2b491d15a01467cc82a5df624ffc494da8b20..fe09a8c8af95e8dedac6e08a4fba74379d1c4b5d
@@@ -24,8 -24,6 +24,8 @@@
  /* Device specification max LOAD size */
  #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
  
 +#define MAX_CHUNK_SIZE SZ_8M
 +
  static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
  {
        struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
@@@ -160,41 -158,6 +160,41 @@@ end
        return found ? buf : NULL;
  }
  
 +static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
 +{
 +      struct mlx5_vf_migration_file *migf = vhca_buf->migf;
 +
 +      if (vhca_buf->stop_copy_chunk_num) {
 +              bool is_header = vhca_buf->dma_dir == DMA_NONE;
 +              u8 chunk_num = vhca_buf->stop_copy_chunk_num;
 +              size_t next_required_umem_size = 0;
 +
 +              if (is_header)
 +                      migf->buf_header[chunk_num - 1] = vhca_buf;
 +              else
 +                      migf->buf[chunk_num - 1] = vhca_buf;
 +
 +              spin_lock_irq(&migf->list_lock);
 +              list_del_init(&vhca_buf->buf_elm);
 +              if (!is_header) {
 +                      next_required_umem_size =
 +                              migf->next_required_umem_size;
 +                      migf->next_required_umem_size = 0;
 +                      migf->num_ready_chunks--;
 +              }
 +              spin_unlock_irq(&migf->list_lock);
 +              if (next_required_umem_size)
 +                      mlx5vf_mig_file_set_save_work(migf, chunk_num,
 +                                                    next_required_umem_size);
 +              return;
 +      }
 +
 +      spin_lock_irq(&migf->list_lock);
 +      list_del_init(&vhca_buf->buf_elm);
 +      list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
 +      spin_unlock_irq(&migf->list_lock);
 +}
 +
  static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
                               char __user **buf, size_t *len, loff_t *pos)
  {
                copy_len -= page_len;
        }
  
 -      if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
 -              spin_lock_irq(&vhca_buf->migf->list_lock);
 -              list_del_init(&vhca_buf->buf_elm);
 -              list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
 -              spin_unlock_irq(&vhca_buf->migf->list_lock);
 -      }
 +      if (*pos >= vhca_buf->start_pos + vhca_buf->length)
 +              mlx5vf_buf_read_done(vhca_buf);
  
        return done;
  }
@@@ -337,75 -304,7 +337,75 @@@ static void mlx5vf_mark_err(struct mlx5
        wake_up_interruptible(&migf->poll_wait);
  }
  
 -static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
 +void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
 +                                 u8 chunk_num, size_t next_required_umem_size)
 +{
 +      migf->save_data[chunk_num - 1].next_required_umem_size =
 +                      next_required_umem_size;
 +      migf->save_data[chunk_num - 1].migf = migf;
 +      get_file(migf->filp);
 +      queue_work(migf->mvdev->cb_wq,
 +                 &migf->save_data[chunk_num - 1].work);
 +}
 +
 +static struct mlx5_vhca_data_buffer *
 +mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
 +                                u8 index, size_t required_length)
 +{
 +      struct mlx5_vhca_data_buffer *buf = migf->buf[index];
 +      u8 chunk_num;
 +
 +      WARN_ON(!buf);
 +      chunk_num = buf->stop_copy_chunk_num;
 +      buf->migf->buf[index] = NULL;
 +      /* Checking whether the pre-allocated buffer can fit */
 +      if (buf->allocated_length >= required_length)
 +              return buf;
 +
 +      mlx5vf_put_data_buffer(buf);
 +      buf = mlx5vf_get_data_buffer(buf->migf, required_length,
 +                                   DMA_FROM_DEVICE);
 +      if (IS_ERR(buf))
 +              return buf;
 +
 +      buf->stop_copy_chunk_num = chunk_num;
 +      return buf;
 +}
 +
 +static void mlx5vf_mig_file_save_work(struct work_struct *_work)
 +{
 +      struct mlx5vf_save_work_data *save_data = container_of(_work,
 +              struct mlx5vf_save_work_data, work);
 +      struct mlx5_vf_migration_file *migf = save_data->migf;
 +      struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
 +      struct mlx5_vhca_data_buffer *buf;
 +
 +      mutex_lock(&mvdev->state_mutex);
 +      if (migf->state == MLX5_MIGF_STATE_ERROR)
 +              goto end;
 +
 +      buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
 +                              save_data->chunk_num - 1,
 +                              save_data->next_required_umem_size);
 +      if (IS_ERR(buf))
 +              goto err;
 +
 +      if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
 +              goto err_save;
 +
 +      goto end;
 +
 +err_save:
 +      mlx5vf_put_data_buffer(buf);
 +err:
 +      mlx5vf_mark_err(migf);
 +end:
 +      mlx5vf_state_mutex_unlock(mvdev);
 +      fput(migf->filp);
 +}
 +
 +static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
 +                                     bool track)
  {
        size_t size = sizeof(struct mlx5_vf_migration_header) +
                sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
        to_buff = kmap_local_page(page);
        memcpy(to_buff, &header, sizeof(header));
        header_buf->length = sizeof(header);
 -      data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
 +      data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
        memcpy(to_buff + sizeof(header), &data, sizeof(data));
        header_buf->length += sizeof(data);
        kunmap_local(to_buff);
        spin_lock_irqsave(&migf->list_lock, flags);
        list_add_tail(&header_buf->buf_elm, &migf->buf_list);
        spin_unlock_irqrestore(&migf->list_lock, flags);
 -      migf->pre_copy_initial_bytes = size;
 +      if (track)
 +              migf->pre_copy_initial_bytes = size;
        return 0;
  err:
        mlx5vf_put_data_buffer(header_buf);
        return ret;
  }
  
 -static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
 -                               size_t state_size)
 +static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
 +                               struct mlx5_vf_migration_file *migf,
 +                               size_t state_size, u64 full_size,
 +                               bool track)
  {
        struct mlx5_vhca_data_buffer *buf;
        size_t inc_state_size;
 +      int num_chunks;
        int ret;
 +      int i;
  
 -      /* let's be ready for stop_copy size that might grow by 10 percents */
 -      if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
 -              inc_state_size = state_size;
 +      if (mvdev->chunk_mode) {
 +              size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
  
 -      buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
 -      if (IS_ERR(buf))
 -              return PTR_ERR(buf);
 +              /* from firmware perspective at least 'state_size' buffer should be set */
 +              inc_state_size = max(state_size, chunk_size);
 +      } else {
 +              if (track) {
 +                      /* let's be ready for stop_copy size that might grow by 10 percents */
 +                      if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
 +                              inc_state_size = state_size;
 +              } else {
 +                      inc_state_size = state_size;
 +              }
 +      }
  
 -      migf->buf = buf;
 -      buf = mlx5vf_get_data_buffer(migf,
 -                      sizeof(struct mlx5_vf_migration_header), DMA_NONE);
 -      if (IS_ERR(buf)) {
 -              ret = PTR_ERR(buf);
 -              goto err;
 +      /* let's not overflow the device specification max SAVE size */
 +      inc_state_size = min_t(size_t, inc_state_size,
 +              (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
 +
 +      num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
 +      for (i = 0; i < num_chunks; i++) {
 +              buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
 +              if (IS_ERR(buf)) {
 +                      ret = PTR_ERR(buf);
 +                      goto err;
 +              }
 +
 +              migf->buf[i] = buf;
 +              buf = mlx5vf_get_data_buffer(migf,
 +                              sizeof(struct mlx5_vf_migration_header), DMA_NONE);
 +              if (IS_ERR(buf)) {
 +                      ret = PTR_ERR(buf);
 +                      goto err;
 +              }
 +              migf->buf_header[i] = buf;
 +              if (mvdev->chunk_mode) {
 +                      migf->buf[i]->stop_copy_chunk_num = i + 1;
 +                      migf->buf_header[i]->stop_copy_chunk_num = i + 1;
 +                      INIT_WORK(&migf->save_data[i].work,
 +                                mlx5vf_mig_file_save_work);
 +                      migf->save_data[i].chunk_num = i + 1;
 +              }
        }
  
 -      migf->buf_header = buf;
 -      ret = mlx5vf_add_stop_copy_header(migf);
 +      ret = mlx5vf_add_stop_copy_header(migf, track);
        if (ret)
 -              goto err_header;
 +              goto err;
        return 0;
  
 -err_header:
 -      mlx5vf_put_data_buffer(migf->buf_header);
 -      migf->buf_header = NULL;
  err:
 -      mlx5vf_put_data_buffer(migf->buf);
 -      migf->buf = NULL;
 +      for (i = 0; i < num_chunks; i++) {
 +              if (migf->buf[i]) {
 +                      mlx5vf_put_data_buffer(migf->buf[i]);
 +                      migf->buf[i] = NULL;
 +              }
 +              if (migf->buf_header[i]) {
 +                      mlx5vf_put_data_buffer(migf->buf_header[i]);
 +                      migf->buf_header[i] = NULL;
 +              }
 +      }
 +
        return ret;
  }
  
@@@ -567,7 -428,7 +567,7 @@@ static long mlx5vf_precopy_ioctl(struc
                 * As so, the other code below is safe with the proper locks.
                 */
                ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
 -                                                          MLX5VF_QUERY_INC);
 +                                                          NULL, MLX5VF_QUERY_INC);
                if (ret)
                        goto err_state_unlock;
        }
@@@ -644,15 -505,21 +644,15 @@@ static int mlx5vf_pci_save_device_inc_d
        if (migf->state == MLX5_MIGF_STATE_ERROR)
                return -ENODEV;
  
 -      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
 +      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
                                MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
        if (ret)
                goto err;
  
 -      /* Checking whether we have a matching pre-allocated buffer that can fit */
 -      if (migf->buf && migf->buf->allocated_length >= length) {
 -              buf = migf->buf;
 -              migf->buf = NULL;
 -      } else {
 -              buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
 -              if (IS_ERR(buf)) {
 -                      ret = PTR_ERR(buf);
 -                      goto err;
 -              }
 +      buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
 +      if (IS_ERR(buf)) {
 +              ret = PTR_ERR(buf);
 +              goto err;
        }
  
        ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
@@@ -674,7 -541,6 +674,7 @@@ mlx5vf_pci_save_device_data(struct mlx5
        struct mlx5_vf_migration_file *migf;
        struct mlx5_vhca_data_buffer *buf;
        size_t length;
 +      u64 full_size;
        int ret;
  
        migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
        INIT_LIST_HEAD(&migf->buf_list);
        INIT_LIST_HEAD(&migf->avail_list);
        spin_lock_init(&migf->list_lock);
 -      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
 +      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
 +      if (ret)
 +              goto out_pd;
 +
 +      ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
        if (ret)
                goto out_pd;
  
        if (track) {
 -              ret = mlx5vf_prep_stop_copy(migf, length);
 -              if (ret)
 +              /* leave the allocated buffer ready for the stop-copy phase */
 +              buf = mlx5vf_alloc_data_buffer(migf,
 +                      migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
 +              if (IS_ERR(buf)) {
 +                      ret = PTR_ERR(buf);
                        goto out_pd;
 -      }
 -
 -      buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
 -      if (IS_ERR(buf)) {
 -              ret = PTR_ERR(buf);
 -              goto out_pd;
 +              }
 +      } else {
 +              buf = migf->buf[0];
 +              migf->buf[0] = NULL;
        }
  
        ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
@@@ -959,8 -820,8 +959,8 @@@ static ssize_t mlx5vf_resume_write(stru
                                   size_t len, loff_t *pos)
  {
        struct mlx5_vf_migration_file *migf = filp->private_data;
 -      struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
 -      struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
 +      struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
 +      struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
        loff_t requested_length;
        bool has_work = false;
        ssize_t done = 0;
                        if (vhca_buf_header->allocated_length < migf->record_size) {
                                mlx5vf_free_data_buffer(vhca_buf_header);
  
 -                              migf->buf_header = mlx5vf_alloc_data_buffer(migf,
 +                              migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
                                                migf->record_size, DMA_NONE);
 -                              if (IS_ERR(migf->buf_header)) {
 -                                      ret = PTR_ERR(migf->buf_header);
 -                                      migf->buf_header = NULL;
 +                              if (IS_ERR(migf->buf_header[0])) {
 +                                      ret = PTR_ERR(migf->buf_header[0]);
 +                                      migf->buf_header[0] = NULL;
                                        goto out_unlock;
                                }
  
 -                              vhca_buf_header = migf->buf_header;
 +                              vhca_buf_header = migf->buf_header[0];
                        }
  
                        vhca_buf_header->start_pos = migf->max_pos;
                        if (vhca_buf->allocated_length < size) {
                                mlx5vf_free_data_buffer(vhca_buf);
  
 -                              migf->buf = mlx5vf_alloc_data_buffer(migf,
 +                              migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
                                                        size, DMA_TO_DEVICE);
 -                              if (IS_ERR(migf->buf)) {
 -                                      ret = PTR_ERR(migf->buf);
 -                                      migf->buf = NULL;
 +                              if (IS_ERR(migf->buf[0])) {
 +                                      ret = PTR_ERR(migf->buf[0]);
 +                                      migf->buf[0] = NULL;
                                        goto out_unlock;
                                }
  
 -                              vhca_buf = migf->buf;
 +                              vhca_buf = migf->buf[0];
                        }
  
                        vhca_buf->start_pos = migf->max_pos;
@@@ -1113,7 -974,7 +1113,7 @@@ mlx5vf_pci_resume_device_data(struct ml
                goto out_pd;
        }
  
 -      migf->buf = buf;
 +      migf->buf[0] = buf;
        if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
                buf = mlx5vf_alloc_data_buffer(migf,
                        sizeof(struct mlx5_vf_migration_header), DMA_NONE);
                        goto out_buf;
                }
  
 -              migf->buf_header = buf;
 +              migf->buf_header[0] = buf;
                migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
        } else {
                /* Initial state will be to read the image */
        spin_lock_init(&migf->list_lock);
        return migf;
  out_buf:
 -      mlx5vf_free_data_buffer(migf->buf);
 +      mlx5vf_free_data_buffer(migf->buf[0]);
  out_pd:
        mlx5vf_cmd_dealloc_pd(migf);
  out_free:
@@@ -1158,7 -1019,6 +1158,7 @@@ void mlx5vf_disable_fds(struct mlx5vf_p
                mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
                cancel_work_sync(&mvdev->saving_migf->async_data.work);
                mlx5vf_disable_fd(mvdev->saving_migf);
 +              wake_up_interruptible(&mvdev->saving_migf->poll_wait);
                mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
                fput(mvdev->saving_migf->filp);
                mvdev->saving_migf = NULL;
@@@ -1240,7 -1100,7 +1240,7 @@@ mlx5vf_pci_step_device_state_locked(str
                if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
                        ret = mlx5vf_cmd_load_vhca_state(mvdev,
                                                         mvdev->resuming_migf,
 -                                                       mvdev->resuming_migf->buf);
 +                                                       mvdev->resuming_migf->buf[0]);
                        if (ret)
                                return ERR_PTR(ret);
                }
@@@ -1334,14 -1194,13 +1334,14 @@@ static int mlx5vf_pci_get_data_size(str
        struct mlx5vf_pci_core_device *mvdev = container_of(
                vdev, struct mlx5vf_pci_core_device, core_device.vdev);
        size_t state_size;
 +      u64 total_size;
        int ret;
  
        mutex_lock(&mvdev->state_mutex);
 -      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
 -                                                  &state_size, 0);
 +      ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
 +                                                  &total_size, 0);
        if (!ret)
 -              *stop_copy_length = state_size;
 +              *stop_copy_length = total_size;
        mlx5vf_state_mutex_unlock(mvdev);
        return ret;
  }
@@@ -1517,6 -1376,7 +1517,7 @@@ static struct pci_driver mlx5vf_pci_dri
  
  module_pci_driver(mlx5vf_pci_driver);
  
+ MODULE_IMPORT_NS(IOMMUFD);
  MODULE_LICENSE("GPL");
  MODULE_AUTHOR("Max Gurtovoy <[email protected]>");
  MODULE_AUTHOR("Yishai Hadas <[email protected]>");
index 6eceef7b028aae9b8b7a8cb49614e88525f4bade,fff368a8183b25455a534a7a9dffe7db4b0070eb..fec9b167c7b9ac98ae24dddd9265e30d95942e7d
@@@ -3,8 -3,9 +3,9 @@@
  
  config PDS_VFIO_PCI
        tristate "VFIO support for PDS PCI devices"
 -      depends on PDS_CORE
 +      depends on PDS_CORE && PCI_IOV
        select VFIO_PCI_CORE
+       select IOMMUFD_DRIVER
        help
          This provides generic PCI support for PDS devices using the VFIO
          framework.
diff --combined drivers/vfio/vfio_main.c
index e31e1952d7b8f1a49a27f005e05653e3b6f23976,a96d97da367daa87a9e5920f36216d64f2c1afc0..8d4995ada74a01848ce8e7becf61120cc10ec33a
@@@ -946,11 -946,6 +946,11 @@@ void vfio_combine_iova_ranges(struct rb
                unsigned long last;
  
                comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
 +
 +              /* Empty list */
 +              if (WARN_ON_ONCE(!comb_start))
 +                      return;
 +
                curr = comb_start;
                while (curr) {
                        last = curr->last;
                        prev = curr;
                        curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
                }
 +
 +              /* Empty list or no nodes to combine */
 +              if (WARN_ON_ONCE(min_gap == ULONG_MAX))
 +                      break;
 +
                comb_start->last = comb_end->last;
                interval_tree_remove(comb_end, root);
                cur_nodes--;
@@@ -1703,6 -1693,7 +1703,7 @@@ static void __exit vfio_cleanup(void
  module_init(vfio_init);
  module_exit(vfio_cleanup);
  
+ MODULE_IMPORT_NS(IOMMUFD);
  MODULE_VERSION(DRIVER_VERSION);
  MODULE_LICENSE("GPL v2");
  MODULE_AUTHOR(DRIVER_AUTHOR);