Merge tag 'powerpc-4.5-4' into next
author Michael Ellerman <[email protected]>
Thu, 25 Feb 2016 10:52:58 +0000 (21:52 +1100)
committer Michael Ellerman <[email protected]>
Thu, 25 Feb 2016 10:52:58 +0000 (21:52 +1100)
Pull in our current fixes from 4.5; in particular, the "Fix Multi hit
ERAT" bug is causing folks some grief when testing next.

arch/powerpc/Kconfig
arch/powerpc/kernel/eeh_driver.c
arch/powerpc/platforms/powernv/eeh-powernv.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci.c
arch/powerpc/platforms/powernv/pci.h
mm/huge_memory.c

diff --combined arch/powerpc/Kconfig
index e4824fd04bb7449d262c1a7697b5f539b03f6bab,5ead6a31854bf5987994e6e118ece6aebf8f920f..9faa18c4f3f702adceb4f555b05b72bc8437cf6c
@@@ -108,6 -108,7 +108,6 @@@ config PP
        select HAVE_ARCH_TRACEHOOK
        select HAVE_MEMBLOCK
        select HAVE_MEMBLOCK_NODE_MAP
 -      select HAVE_DMA_ATTRS
        select HAVE_DMA_API_DEBUG
        select HAVE_OPROFILE
        select HAVE_DEBUG_KMEMLEAK
        select ARCH_HAS_DMA_SET_COHERENT_MASK
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select HAVE_ARCH_SECCOMP_FILTER
 +      select ARCH_HAS_UBSAN_SANITIZE_ALL
  
  config GENERIC_CSUM
        def_bool CPU_LITTLE_ENDIAN
@@@ -557,7 -557,7 +557,7 @@@ choic
  
  config PPC_4K_PAGES
        bool "4k page size"
-       select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+       select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
  
  config PPC_16K_PAGES
        bool "16k page size"
  config PPC_64K_PAGES
        bool "64k page size"
        depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64)
-       select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+       select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
  
  config PPC_256K_PAGES
        bool "256k page size"
index 938742135ee08fc8dd058df690cfba7eacdabc0b,52c1e273f8cd5d5d641e1f2bc7591ff8b0efdbd9..650cfb31ea3d9b3cee0301948bd85e54400cb951
@@@ -400,7 -400,7 +400,7 @@@ static void *eeh_rmv_device(void *data
         * support EEH. So we just care about PCI devices for
         * simplicity here.
         */
 -      if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
 +      if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
                return NULL;
  
        /*
                eeh_pcid_put(dev);
                if (driver->err_handler &&
                    driver->err_handler->error_detected &&
-                   driver->err_handler->slot_reset &&
-                   driver->err_handler->resume)
+                   driver->err_handler->slot_reset)
                        return NULL;
        }
  
@@@ -564,6 -563,7 +563,7 @@@ static int eeh_reset_device(struct eeh_
         */
        eeh_pe_state_mark(pe, EEH_PE_KEEP);
        if (bus) {
+               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                pci_lock_rescan_remove();
                pcibios_remove_pci_devices(bus);
                pci_unlock_rescan_remove();
@@@ -803,6 -803,7 +803,7 @@@ perm_error
         * the their PCI config any more.
         */
        if (frozen_bus) {
+               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
  
                pci_lock_rescan_remove();
@@@ -886,6 -887,7 +887,7 @@@ static void eeh_handle_special_event(vo
                                        continue;
  
                                /* Notify all devices to be down */
+                               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                                bus = eeh_pe_bus_get(phb_pe);
                                eeh_pe_dev_traverse(pe,
                                        eeh_report_failure, NULL);
index 3f1cb35d9cdf05fa91b00d09be32189fd0ebbac3,87f47e55aab65ac234df1d67c926ca17b1517f06..811917219bf11317ec8d4ee32df3ddd49041ce5c
@@@ -167,26 -167,42 +167,26 @@@ static int pnv_eeh_dbgfs_get(void *data
        return 0;
  }
  
 -static int pnv_eeh_outb_dbgfs_set(void *data, u64 val)
 -{
 -      return pnv_eeh_dbgfs_set(data, 0xD10, val);
 -}
 -
 -static int pnv_eeh_outb_dbgfs_get(void *data, u64 *val)
 -{
 -      return pnv_eeh_dbgfs_get(data, 0xD10, val);
 -}
 -
 -static int pnv_eeh_inbA_dbgfs_set(void *data, u64 val)
 -{
 -      return pnv_eeh_dbgfs_set(data, 0xD90, val);
 -}
 -
 -static int pnv_eeh_inbA_dbgfs_get(void *data, u64 *val)
 -{
 -      return pnv_eeh_dbgfs_get(data, 0xD90, val);
 -}
 -
 -static int pnv_eeh_inbB_dbgfs_set(void *data, u64 val)
 -{
 -      return pnv_eeh_dbgfs_set(data, 0xE10, val);
 -}
 -
 -static int pnv_eeh_inbB_dbgfs_get(void *data, u64 *val)
 -{
 -      return pnv_eeh_dbgfs_get(data, 0xE10, val);
 -}
 +#define PNV_EEH_DBGFS_ENTRY(name, reg)                                \
 +static int pnv_eeh_dbgfs_set_##name(void *data, u64 val)      \
 +{                                                             \
 +      return pnv_eeh_dbgfs_set(data, reg, val);               \
 +}                                                             \
 +                                                              \
 +static int pnv_eeh_dbgfs_get_##name(void *data, u64 *val)     \
 +{                                                             \
 +      return pnv_eeh_dbgfs_get(data, reg, val);               \
 +}                                                             \
 +                                                              \
 +DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_dbgfs_ops_##name,             \
 +                      pnv_eeh_dbgfs_get_##name,               \
 +                        pnv_eeh_dbgfs_set_##name,             \
 +                      "0x%llx\n")
 +
 +PNV_EEH_DBGFS_ENTRY(outb, 0xD10);
 +PNV_EEH_DBGFS_ENTRY(inbA, 0xD90);
 +PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
  
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_outb_dbgfs_ops, pnv_eeh_outb_dbgfs_get,
 -                      pnv_eeh_outb_dbgfs_set, "0x%llx\n");
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbA_dbgfs_ops, pnv_eeh_inbA_dbgfs_get,
 -                      pnv_eeh_inbA_dbgfs_set, "0x%llx\n");
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbB_dbgfs_ops, pnv_eeh_inbB_dbgfs_get,
 -                      pnv_eeh_inbB_dbgfs_set, "0x%llx\n");
  #endif /* CONFIG_DEBUG_FS */
  
  /**
@@@ -252,13 -268,13 +252,13 @@@ static int pnv_eeh_post_init(void
  
                debugfs_create_file("err_injct_outbound", 0600,
                                    phb->dbgfs, hose,
 -                                  &pnv_eeh_outb_dbgfs_ops);
 +                                  &pnv_eeh_dbgfs_ops_outb);
                debugfs_create_file("err_injct_inboundA", 0600,
                                    phb->dbgfs, hose,
 -                                  &pnv_eeh_inbA_dbgfs_ops);
 +                                  &pnv_eeh_dbgfs_ops_inbA);
                debugfs_create_file("err_injct_inboundB", 0600,
                                    phb->dbgfs, hose,
 -                                  &pnv_eeh_inbB_dbgfs_ops);
 +                                  &pnv_eeh_dbgfs_ops_inbB);
  #endif /* CONFIG_DEBUG_FS */
        }
  
@@@ -428,9 -444,12 +428,12 @@@ static void *pnv_eeh_probe(struct pci_d
         * PCI devices of the PE are expected to be removed prior
         * to PE reset.
         */
-       if (!edev->pe->bus)
+       if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
                edev->pe->bus = pci_find_bus(hose->global_number,
                                             pdn->busno);
+               if (edev->pe->bus)
+                       edev->pe->state |= EEH_PE_PRI_BUS;
+       }
  
        /*
         * Enable EEH explicitly so that we will do EEH check
index dc868586315d02378e8506589ec107e7d35801de,f90dc04395bf47bcc0e662a7a1c17526220ccda2..c5baaf3cc4e5ef565bcaadeb125bd6ac2138a4c9
@@@ -872,6 -872,9 +872,6 @@@ static int pnv_pci_vf_resource_shift(st
                if (!res->flags || !res->parent)
                        continue;
  
 -              if (!pnv_pci_is_mem_pref_64(res->flags))
 -                      continue;
 -
                /*
                 * The actual IOV BAR range is determined by the start address
                 * and the actual size for num_vfs VFs BAR.  This check is to
                if (!res->flags || !res->parent)
                        continue;
  
 -              if (!pnv_pci_is_mem_pref_64(res->flags))
 -                      continue;
 -
                size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
                res2 = *res;
                res->start += size * offset;
@@@ -1190,36 -1196,29 +1190,36 @@@ static void pnv_pci_ioda_setup_PEs(void
  }
  
  #ifdef CONFIG_PCI_IOV
 -static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
 +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
  {
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        int                    i, j;
 +      int                    m64_bars;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        phb = hose->private_data;
        pdn = pci_get_pdn(pdev);
  
 +      if (pdn->m64_single_mode)
 +              m64_bars = num_vfs;
 +      else
 +              m64_bars = 1;
 +
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
 -              for (j = 0; j < M64_PER_IOV; j++) {
 -                      if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
 +              for (j = 0; j < m64_bars; j++) {
 +                      if (pdn->m64_map[j][i] == IODA_INVALID_M64)
                                continue;
                        opal_pci_phb_mmio_enable(phb->opal_id,
 -                              OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
 -                      clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
 -                      pdn->m64_wins[i][j] = IODA_INVALID_M64;
 +                              OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
 +                      clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
 +                      pdn->m64_map[j][i] = IODA_INVALID_M64;
                }
  
 +      kfree(pdn->m64_map);
        return 0;
  }
  
@@@ -1236,7 -1235,8 +1236,7 @@@ static int pnv_pci_vf_assign_m64(struc
        int                    total_vfs;
        resource_size_t        size, start;
        int                    pe_num;
 -      int                    vf_groups;
 -      int                    vf_per_group;
 +      int                    m64_bars;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        pdn = pci_get_pdn(pdev);
        total_vfs = pci_sriov_get_totalvfs(pdev);
  
 -      /* Initialize the m64_wins to IODA_INVALID_M64 */
 -      for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
 -              for (j = 0; j < M64_PER_IOV; j++)
 -                      pdn->m64_wins[i][j] = IODA_INVALID_M64;
 +      if (pdn->m64_single_mode)
 +              m64_bars = num_vfs;
 +      else
 +              m64_bars = 1;
 +
 +      pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
 +      if (!pdn->m64_map)
 +              return -ENOMEM;
 +      /* Initialize the m64_map to IODA_INVALID_M64 */
 +      for (i = 0; i < m64_bars ; i++)
 +              for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
 +                      pdn->m64_map[i][j] = IODA_INVALID_M64;
  
 -      if (pdn->m64_per_iov == M64_PER_IOV) {
 -              vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
 -              vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
 -                      roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -      } else {
 -              vf_groups = 1;
 -              vf_per_group = 1;
 -      }
  
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || !res->parent)
                        continue;
  
 -              if (!pnv_pci_is_mem_pref_64(res->flags))
 -                      continue;
 -
 -              for (j = 0; j < vf_groups; j++) {
 +              for (j = 0; j < m64_bars; j++) {
                        do {
                                win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
                                                phb->ioda.m64_bar_idx + 1, 0);
                                        goto m64_failed;
                        } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
  
 -                      pdn->m64_wins[i][j] = win;
 +                      pdn->m64_map[j][i] = win;
  
 -                      if (pdn->m64_per_iov == M64_PER_IOV) {
 +                      if (pdn->m64_single_mode) {
                                size = pci_iov_resource_size(pdev,
                                                        PCI_IOV_RESOURCES + i);
 -                              size = size * vf_per_group;
                                start = res->start + size * j;
                        } else {
                                size = resource_size(res);
                        }
  
                        /* Map the M64 here */
 -                      if (pdn->m64_per_iov == M64_PER_IOV) {
 -                              pe_num = pdn->offset + j;
 +                      if (pdn->m64_single_mode) {
 +                              pe_num = pdn->pe_num_map[j];
                                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                                pe_num, OPAL_M64_WINDOW_TYPE,
 -                                              pdn->m64_wins[i][j], 0);
 +                                              pdn->m64_map[j][i], 0);
                        }
  
                        rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                                 OPAL_M64_WINDOW_TYPE,
 -                                               pdn->m64_wins[i][j],
 +                                               pdn->m64_map[j][i],
                                                 start,
                                                 0, /* unused */
                                                 size);
                                goto m64_failed;
                        }
  
 -                      if (pdn->m64_per_iov == M64_PER_IOV)
 +                      if (pdn->m64_single_mode)
                                rc = opal_pci_phb_mmio_enable(phb->opal_id,
 -                                   OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
 +                                   OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
                        else
                                rc = opal_pci_phb_mmio_enable(phb->opal_id,
 -                                   OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
 +                                   OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
  
                        if (rc != OPAL_SUCCESS) {
                                dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
        return 0;
  
  m64_failed:
 -      pnv_pci_vf_release_m64(pdev);
 +      pnv_pci_vf_release_m64(pdev, num_vfs);
        return -EBUSY;
  }
  
@@@ -1349,13 -1353,15 +1349,13 @@@ static void pnv_pci_ioda2_release_dma_p
        iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
  }
  
 -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
  {
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pnv_ioda_pe    *pe, *pe_n;
        struct pci_dn         *pdn;
 -      u16                    vf_index;
 -      int64_t                rc;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        if (!pdev->is_physfn)
                return;
  
 -      if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
 -              int   vf_group;
 -              int   vf_per_group;
 -              int   vf_index1;
 -
 -              vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -
 -              for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
 -                      for (vf_index = vf_group * vf_per_group;
 -                              vf_index < (vf_group + 1) * vf_per_group &&
 -                              vf_index < num_vfs;
 -                              vf_index++)
 -                              for (vf_index1 = vf_group * vf_per_group;
 -                                      vf_index1 < (vf_group + 1) * vf_per_group &&
 -                                      vf_index1 < num_vfs;
 -                                      vf_index1++){
 -
 -                                      rc = opal_pci_set_peltv(phb->opal_id,
 -                                              pdn->offset + vf_index,
 -                                              pdn->offset + vf_index1,
 -                                              OPAL_REMOVE_PE_FROM_DOMAIN);
 -
 -                                      if (rc)
 -                                          dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
 -                                              __func__,
 -                                              pdn->offset + vf_index1, rc);
 -                              }
 -      }
 -
        list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
                if (pe->parent_dev != pdev)
                        continue;
@@@ -1389,7 -1424,7 +1389,7 @@@ void pnv_pci_sriov_disable(struct pci_d
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        struct pci_sriov      *iov;
 -      u16 num_vfs;
 +      u16                    num_vfs, i;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        num_vfs = pdn->num_vfs;
  
        /* Release VF PEs */
 -      pnv_ioda_release_vf_PE(pdev, num_vfs);
 +      pnv_ioda_release_vf_PE(pdev);
  
        if (phb->type == PNV_PHB_IODA2) {
 -              if (pdn->m64_per_iov == 1)
 -                      pnv_pci_vf_resource_shift(pdev, -pdn->offset);
 +              if (!pdn->m64_single_mode)
 +                      pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
  
                /* Release M64 windows */
 -              pnv_pci_vf_release_m64(pdev);
 +              pnv_pci_vf_release_m64(pdev, num_vfs);
  
                /* Release PE numbers */
 -              bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
 -              pdn->offset = 0;
 +              if (pdn->m64_single_mode) {
 +                      for (i = 0; i < num_vfs; i++) {
 +                              if (pdn->pe_num_map[i] != IODA_INVALID_PE)
 +                                      pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
 +                      }
 +              } else
 +                      bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
 +              /* Releasing pe_num_map */
 +              kfree(pdn->pe_num_map);
        }
  }
  
@@@ -1432,6 -1460,7 +1432,6 @@@ static void pnv_ioda_setup_vf_PE(struc
        int                    pe_num;
        u16                    vf_index;
        struct pci_dn         *pdn;
 -      int64_t                rc;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
  
        /* Reserve PE for each VF */
        for (vf_index = 0; vf_index < num_vfs; vf_index++) {
 -              pe_num = pdn->offset + vf_index;
 +              if (pdn->m64_single_mode)
 +                      pe_num = pdn->pe_num_map[vf_index];
 +              else
 +                      pe_num = *pdn->pe_num_map + vf_index;
  
                pe = &phb->ioda.pe_array[pe_num];
                pe->pe_number = pe_num;
  
                pnv_pci_ioda2_setup_dma_pe(phb, pe);
        }
 -
 -      if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
 -              int   vf_group;
 -              int   vf_per_group;
 -              int   vf_index1;
 -
 -              vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -
 -              for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
 -                      for (vf_index = vf_group * vf_per_group;
 -                           vf_index < (vf_group + 1) * vf_per_group &&
 -                           vf_index < num_vfs;
 -                           vf_index++) {
 -                              for (vf_index1 = vf_group * vf_per_group;
 -                                   vf_index1 < (vf_group + 1) * vf_per_group &&
 -                                   vf_index1 < num_vfs;
 -                                   vf_index1++) {
 -
 -                                      rc = opal_pci_set_peltv(phb->opal_id,
 -                                              pdn->offset + vf_index,
 -                                              pdn->offset + vf_index1,
 -                                              OPAL_ADD_PE_TO_DOMAIN);
 -
 -                                      if (rc)
 -                                          dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
 -                                              __func__,
 -                                              pdn->offset + vf_index1, rc);
 -                              }
 -                      }
 -              }
 -      }
  }
  
  int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        int                    ret;
 +      u16                    i;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        pdn = pci_get_pdn(pdev);
  
        if (phb->type == PNV_PHB_IODA2) {
 +              if (!pdn->vfs_expanded) {
 +                      dev_info(&pdev->dev, "don't support this SRIOV device"
 +                              " with non 64bit-prefetchable IOV BAR\n");
 +                      return -ENOSPC;
 +              }
 +
 +              /*
 +               * When M64 BARs functions in Single PE mode, the number of VFs
 +               * could be enabled must be less than the number of M64 BARs.
 +               */
 +              if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
 +                      dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
 +                      return -EBUSY;
 +              }
 +
 +              /* Allocating pe_num_map */
 +              if (pdn->m64_single_mode)
 +                      pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map) * num_vfs,
 +                                      GFP_KERNEL);
 +              else
 +                      pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
 +
 +              if (!pdn->pe_num_map)
 +                      return -ENOMEM;
 +
 +              if (pdn->m64_single_mode)
 +                      for (i = 0; i < num_vfs; i++)
 +                              pdn->pe_num_map[i] = IODA_INVALID_PE;
 +
                /* Calculate available PE for required VFs */
 -              mutex_lock(&phb->ioda.pe_alloc_mutex);
 -              pdn->offset = bitmap_find_next_zero_area(
 -                      phb->ioda.pe_alloc, phb->ioda.total_pe,
 -                      0, num_vfs, 0);
 -              if (pdn->offset >= phb->ioda.total_pe) {
 +              if (pdn->m64_single_mode) {
 +                      for (i = 0; i < num_vfs; i++) {
 +                              pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb);
 +                              if (pdn->pe_num_map[i] == IODA_INVALID_PE) {
 +                                      ret = -EBUSY;
 +                                      goto m64_failed;
 +                              }
 +                      }
 +              } else {
 +                      mutex_lock(&phb->ioda.pe_alloc_mutex);
 +                      *pdn->pe_num_map = bitmap_find_next_zero_area(
 +                              phb->ioda.pe_alloc, phb->ioda.total_pe,
 +                              0, num_vfs, 0);
 +                      if (*pdn->pe_num_map >= phb->ioda.total_pe) {
 +                              mutex_unlock(&phb->ioda.pe_alloc_mutex);
 +                              dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
 +                              kfree(pdn->pe_num_map);
 +                              return -EBUSY;
 +                      }
 +                      bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
                        mutex_unlock(&phb->ioda.pe_alloc_mutex);
 -                      dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
 -                      pdn->offset = 0;
 -                      return -EBUSY;
                }
 -              bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
                pdn->num_vfs = num_vfs;
 -              mutex_unlock(&phb->ioda.pe_alloc_mutex);
  
                /* Assign M64 window accordingly */
                ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
                 * the IOV BAR according to the PE# allocated to the VFs.
                 * Otherwise, the PE# for the VF will conflict with others.
                 */
 -              if (pdn->m64_per_iov == 1) {
 -                      ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
 +              if (!pdn->m64_single_mode) {
 +                      ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
                        if (ret)
                                goto m64_failed;
                }
        return 0;
  
  m64_failed:
 -      bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
 -      pdn->offset = 0;
 +      if (pdn->m64_single_mode) {
 +              for (i = 0; i < num_vfs; i++) {
 +                      if (pdn->pe_num_map[i] != IODA_INVALID_PE)
 +                              pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
 +              }
 +      } else
 +              bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
 +
 +      /* Releasing pe_num_map */
 +      kfree(pdn->pe_num_map);
  
        return ret;
  }
@@@ -1603,7 -1612,8 +1603,7 @@@ int pcibios_sriov_enable(struct pci_de
        /* Allocate PCI data */
        add_dev_pci_data(pdev);
  
 -      pnv_pci_sriov_enable(pdev, num_vfs);
 -      return 0;
 +      return pnv_pci_sriov_enable(pdev, num_vfs);
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -2841,58 -2851,45 +2841,58 @@@ static void pnv_pci_init_ioda_msis(stru
  #ifdef CONFIG_PCI_IOV
  static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
  {
 -      struct pci_controller *hose;
 -      struct pnv_phb *phb;
 +      struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 +      struct pnv_phb *phb = hose->private_data;
 +      const resource_size_t gate = phb->ioda.m64_segsize >> 2;
        struct resource *res;
        int i;
 -      resource_size_t size;
 +      resource_size_t size, total_vf_bar_sz;
        struct pci_dn *pdn;
        int mul, total_vfs;
  
        if (!pdev->is_physfn || pdev->is_added)
                return;
  
 -      hose = pci_bus_to_host(pdev->bus);
 -      phb = hose->private_data;
 -
        pdn = pci_get_pdn(pdev);
        pdn->vfs_expanded = 0;
 +      pdn->m64_single_mode = false;
  
        total_vfs = pci_sriov_get_totalvfs(pdev);
 -      pdn->m64_per_iov = 1;
        mul = phb->ioda.total_pe;
 +      total_vf_bar_sz = 0;
  
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || res->parent)
                        continue;
                if (!pnv_pci_is_mem_pref_64(res->flags)) {
 -                      dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
 +                      dev_warn(&pdev->dev, "Don't support SR-IOV with"
 +                                      " non M64 VF BAR%d: %pR. \n",
                                 i, res);
 -                      continue;
 +                      goto truncate_iov;
                }
  
 -              size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 +              total_vf_bar_sz += pci_iov_resource_size(pdev,
 +                              i + PCI_IOV_RESOURCES);
  
 -              /* bigger than 64M */
 -              if (size > (1 << 26)) {
 -                      dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
 -                               i, res);
 -                      pdn->m64_per_iov = M64_PER_IOV;
 +              /*
 +               * If bigger than quarter of M64 segment size, just round up
 +               * power of two.
 +               *
 +               * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
 +               * with other devices, IOV BAR size is expanded to be
 +               * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
 +               * segment size , the expanded size would equal to half of the
 +               * whole M64 space size, which will exhaust the M64 Space and
 +               * limit the system flexibility.  This is a design decision to
 +               * set the boundary to quarter of the M64 segment size.
 +               */
 +              if (total_vf_bar_sz > gate) {
                        mul = roundup_pow_of_two(total_vfs);
 +                      dev_info(&pdev->dev,
 +                              "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
 +                              total_vf_bar_sz, gate, mul);
 +                      pdn->m64_single_mode = true;
                        break;
                }
        }
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || res->parent)
                        continue;
 -              if (!pnv_pci_is_mem_pref_64(res->flags)) {
 -                      dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
 -                               i, res);
 -                      continue;
 -              }
  
 -              dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
                size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 +              /*
 +               * On PHB3, the minimum size alignment of M64 BAR in single
 +               * mode is 32MB.
 +               */
 +              if (pdn->m64_single_mode && (size < SZ_32M))
 +                      goto truncate_iov;
 +              dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
                res->end = res->start + size * mul - 1;
                dev_dbg(&pdev->dev, "                       %pR\n", res);
                dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
                         i, res, mul);
        }
        pdn->vfs_expanded = mul;
 +
 +      return;
 +
 +truncate_iov:
 +      /* To save MMIO space, IOV BAR is truncated. */
 +      for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 +              res = &pdev->resource[i + PCI_IOV_RESOURCES];
 +              res->flags = 0;
 +              res->end = res->start - 1;
 +      }
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -3139,35 -3125,18 +3139,35 @@@ static resource_size_t pnv_pci_window_a
  static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
                                                      int resno)
  {
 +      struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 +      struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(pdev);
 -      resource_size_t align, iov_align;
 -
 -      iov_align = resource_size(&pdev->resource[resno]);
 -      if (iov_align)
 -              return iov_align;
 +      resource_size_t align;
  
 +      /*
 +       * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
 +       * SR-IOV. While from hardware perspective, the range mapped by M64
 +       * BAR should be size aligned.
 +       *
 +       * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
 +       * powernv-specific hardware restriction is gone. But if just use the
 +       * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
 +       * in one segment of M64 #15, which introduces the PE conflict between
 +       * PF and VF. Based on this, the minimum alignment of an IOV BAR is
 +       * m64_segsize.
 +       *
 +       * This function returns the total IOV BAR size if M64 BAR is in
 +       * Shared PE mode or just VF BAR size if not.
 +       * If the M64 BAR is in Single PE mode, return the VF BAR size or
 +       * M64 segment size if IOV BAR size is less.
 +       */
        align = pci_iov_resource_size(pdev, resno);
 -      if (pdn->vfs_expanded)
 -              return pdn->vfs_expanded * align;
 +      if (!pdn->vfs_expanded)
 +              return align;
 +      if (pdn->m64_single_mode)
 +              return max(align, (resource_size_t)phb->ioda.m64_segsize);
  
 -      return align;
 +      return pdn->vfs_expanded * align;
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -3211,6 -3180,7 +3211,7 @@@ static void pnv_pci_ioda_shutdown(struc
  
  static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
         .dma_dev_setup = pnv_pci_dma_dev_setup,
+        .dma_bus_setup = pnv_pci_dma_bus_setup,
  #ifdef CONFIG_PCI_MSI
         .setup_msi_irqs = pnv_setup_msi_irqs,
         .teardown_msi_irqs = pnv_teardown_msi_irqs,
index 8de0140332b2a0a15af57d56a0431e50594c489f,b1ef84a6c9d13cff03c2d4a5234e57265ef5bc75..73c8dc2a353fdd540b9a05a5aa208b3e50970e47
@@@ -380,7 -380,10 +380,7 @@@ static void pnv_pci_config_check_eeh(st
         */
        pe_no = pdn->pe_number;
        if (pe_no == IODA_INVALID_PE) {
 -              if (phb->type == PNV_PHB_P5IOC2)
 -                      pe_no = 0;
 -              else
 -                      pe_no = phb->ioda.reserved_pe;
 +              pe_no = phb->ioda.reserved_pe;
        }
  
        /*
@@@ -596,6 -599,9 +596,9 @@@ int pnv_tce_build(struct iommu_table *t
        u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
        long i;
  
+       if (proto_tce & TCE_PCI_WRITE)
+               proto_tce |= TCE_PCI_READ;
        for (i = 0; i < npages; i++) {
                unsigned long newtce = proto_tce |
                        ((rpn + i) << tbl->it_page_shift);
@@@ -617,6 -623,9 +620,9 @@@ int pnv_tce_xchg(struct iommu_table *tb
  
        BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
  
+       if (newtce & TCE_PCI_WRITE)
+               newtce |= TCE_PCI_READ;
        oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
        *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
        *direction = iommu_tce_direction(oldtce);
@@@ -757,6 -766,26 +763,26 @@@ void pnv_pci_dma_dev_setup(struct pci_d
                phb->dma_dev_setup(phb, pdev);
  }
  
+ void pnv_pci_dma_bus_setup(struct pci_bus *bus)
+ {
+       struct pci_controller *hose = bus->sysdata;
+       struct pnv_phb *phb = hose->private_data;
+       struct pnv_ioda_pe *pe;
+       list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+               if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+                       continue;
+               if (!pe->pbus)
+                       continue;
+               if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+                       pe->pbus = bus;
+                       break;
+               }
+       }
+ }
  void pnv_pci_shutdown(void)
  {
        struct pci_controller *hose;
@@@ -776,6 -805,7 +802,6 @@@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_I
  void __init pnv_pci_init(void)
  {
        struct device_node *np;
 -      bool found_ioda = false;
  
        pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
  
        if (!firmware_has_feature(FW_FEATURE_OPAL))
                return;
  
 -      /* Look for IODA IO-Hubs. We don't support mixing IODA
 -       * and p5ioc2 due to the need to change some global
 -       * probing flags
 -       */
 +      /* Look for IODA IO-Hubs. */
        for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
                pnv_pci_init_ioda_hub(np);
 -              found_ioda = true;
        }
  
 -      /* Look for p5ioc2 IO-Hubs */
 -      if (!found_ioda)
 -              for_each_compatible_node(np, NULL, "ibm,p5ioc2")
 -                      pnv_pci_init_p5ioc2_hub(np);
 -
        /* Look for ioda2 built-in PHB3's */
        for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
                pnv_pci_init_ioda2_phb(np);
index 32cae3d8e01185e0cf72e59381e77a0857a0ba58,00691a9b99af67b09967c73b39b652b973392f06..3f814f382b2e793bf0b5374abb74bebf35246a98
@@@ -4,14 -4,16 +4,14 @@@
  struct pci_dn;
  
  enum pnv_phb_type {
 -      PNV_PHB_P5IOC2  = 0,
 -      PNV_PHB_IODA1   = 1,
 -      PNV_PHB_IODA2   = 2,
 -      PNV_PHB_NPU     = 3,
 +      PNV_PHB_IODA1   = 0,
 +      PNV_PHB_IODA2   = 1,
 +      PNV_PHB_NPU     = 2,
  };
  
  /* Precise PHB model for error management */
  enum pnv_phb_model {
        PNV_PHB_MODEL_UNKNOWN,
 -      PNV_PHB_MODEL_P5IOC2,
        PNV_PHB_MODEL_P7IOC,
        PNV_PHB_MODEL_PHB3,
        PNV_PHB_MODEL_NPU,
@@@ -119,74 -121,81 +119,74 @@@ struct pnv_phb 
        void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
        int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
  
 -      union {
 -              struct {
 -                      struct iommu_table iommu_table;
 -                      struct iommu_table_group table_group;
 -              } p5ioc2;
 -
 -              struct {
 -                      /* Global bridge info */
 -                      unsigned int            total_pe;
 -                      unsigned int            reserved_pe;
 -
 -                      /* 32-bit MMIO window */
 -                      unsigned int            m32_size;
 -                      unsigned int            m32_segsize;
 -                      unsigned int            m32_pci_base;
 -
 -                      /* 64-bit MMIO window */
 -                      unsigned int            m64_bar_idx;
 -                      unsigned long           m64_size;
 -                      unsigned long           m64_segsize;
 -                      unsigned long           m64_base;
 -                      unsigned long           m64_bar_alloc;
 -
 -                      /* IO ports */
 -                      unsigned int            io_size;
 -                      unsigned int            io_segsize;
 -                      unsigned int            io_pci_base;
 -
 -                      /* PE allocation bitmap */
 -                      unsigned long           *pe_alloc;
 -                      /* PE allocation mutex */
 -                      struct mutex            pe_alloc_mutex;
 -
 -                      /* M32 & IO segment maps */
 -                      unsigned int            *m32_segmap;
 -                      unsigned int            *io_segmap;
 -                      struct pnv_ioda_pe      *pe_array;
 -
 -                      /* IRQ chip */
 -                      int                     irq_chip_init;
 -                      struct irq_chip         irq_chip;
 -
 -                      /* Sorted list of used PE's based
 -                       * on the sequence of creation
 -                       */
 -                      struct list_head        pe_list;
 -                      struct mutex            pe_list_mutex;
 -
 -                      /* Reverse map of PEs, will have to extend if
 -                       * we are to support more than 256 PEs, indexed
 -                       * bus { bus, devfn }
 -                       */
 -                      unsigned char           pe_rmap[0x10000];
 -
 -                      /* 32-bit TCE tables allocation */
 -                      unsigned long           tce32_count;
 -
 -                      /* Total "weight" for the sake of DMA resources
 -                       * allocation
 -                       */
 -                      unsigned int            dma_weight;
 -                      unsigned int            dma_pe_count;
 -
 -                      /* Sorted list of used PE's, sorted at
 -                       * boot for resource allocation purposes
 -                       */
 -                      struct list_head        pe_dma_list;
 -
 -                      /* TCE cache invalidate registers (physical and
 -                       * remapped)
 -                       */
 -                      phys_addr_t             tce_inval_reg_phys;
 -                      __be64 __iomem          *tce_inval_reg;
 -              } ioda;
 -      };
 +      struct {
 +              /* Global bridge info */
 +              unsigned int            total_pe;
 +              unsigned int            reserved_pe;
 +
 +              /* 32-bit MMIO window */
 +              unsigned int            m32_size;
 +              unsigned int            m32_segsize;
 +              unsigned int            m32_pci_base;
 +
 +              /* 64-bit MMIO window */
 +              unsigned int            m64_bar_idx;
 +              unsigned long           m64_size;
 +              unsigned long           m64_segsize;
 +              unsigned long           m64_base;
 +              unsigned long           m64_bar_alloc;
 +
 +              /* IO ports */
 +              unsigned int            io_size;
 +              unsigned int            io_segsize;
 +              unsigned int            io_pci_base;
 +
 +              /* PE allocation bitmap */
 +              unsigned long           *pe_alloc;
 +              /* PE allocation mutex */
 +              struct mutex            pe_alloc_mutex;
 +
 +              /* M32 & IO segment maps */
 +              unsigned int            *m32_segmap;
 +              unsigned int            *io_segmap;
 +              struct pnv_ioda_pe      *pe_array;
 +
 +              /* IRQ chip */
 +              int                     irq_chip_init;
 +              struct irq_chip         irq_chip;
 +
 +              /* Sorted list of used PE's based
 +               * on the sequence of creation
 +               */
 +              struct list_head        pe_list;
 +              struct mutex            pe_list_mutex;
 +
 +              /* Reverse map of PEs, will have to extend if
 +               * we are to support more than 256 PEs, indexed
 +               * bus { bus, devfn }
 +               */
 +              unsigned char           pe_rmap[0x10000];
 +
 +              /* 32-bit TCE tables allocation */
 +              unsigned long           tce32_count;
 +
 +              /* Total "weight" for the sake of DMA resources
 +               * allocation
 +               */
 +              unsigned int            dma_weight;
 +              unsigned int            dma_pe_count;
 +
 +              /* Sorted list of used PE's, sorted at
 +               * boot for resource allocation purposes
 +               */
 +              struct list_head        pe_dma_list;
 +
 +              /* TCE cache invalidate registers (physical and
 +               * remapped)
 +               */
 +              phys_addr_t             tce_inval_reg_phys;
 +              __be64 __iomem          *tce_inval_reg;
 +      } ioda;
  
        /* PHB and hub status structure */
        union {
@@@ -223,6 -232,7 +223,6 @@@ extern void pnv_pci_unlink_table_and_gr
  extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
                                      void *tce_mem, u64 tce_size,
                                      u64 dma_offset, unsigned page_shift);
 -extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
  extern void pnv_pci_init_ioda_hub(struct device_node *np);
  extern void pnv_pci_init_ioda2_phb(struct device_node *np);
  extern void pnv_pci_init_npu_phb(struct device_node *np);
@@@ -232,6 -242,7 +232,7 @@@ extern void pnv_pci_reset_secondary_bus
  extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
  
  extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
+ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
  extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
  
diff --combined mm/huge_memory.c
index 08fc0ba2207e555a9c734524cd607dc8f53e644a,de3f43cde129f6da955b4477f97e19825a205712..aea8f7a42df97d7185f626d5bbc445c64f376eb1
@@@ -138,6 -138,9 +138,6 @@@ static struct khugepaged_scan khugepage
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  };
  
 -static DEFINE_SPINLOCK(split_queue_lock);
 -static LIST_HEAD(split_queue);
 -static unsigned long split_queue_len;
  static struct shrinker deferred_split_shrinker;
  
  static void set_recommended_min_free_kbytes(void)
@@@ -858,8 -861,7 +858,8 @@@ static bool set_huge_zero_page(pgtable_
                return false;
        entry = mk_pmd(zero_page, vma->vm_page_prot);
        entry = pmd_mkhuge(entry);
 -      pgtable_trans_huge_deposit(mm, pmd, pgtable);
 +      if (pgtable)
 +              pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
        atomic_long_inc(&mm->nr_ptes);
        return true;
@@@ -1037,15 -1039,13 +1037,15 @@@ int copy_huge_pmd(struct mm_struct *dst
        spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
        pmd_t pmd;
 -      pgtable_t pgtable;
 +      pgtable_t pgtable = NULL;
        int ret;
  
 -      ret = -ENOMEM;
 -      pgtable = pte_alloc_one(dst_mm, addr);
 -      if (unlikely(!pgtable))
 -              goto out;
 +      if (!vma_is_dax(vma)) {
 +              ret = -ENOMEM;
 +              pgtable = pte_alloc_one(dst_mm, addr);
 +              if (unlikely(!pgtable))
 +                      goto out;
 +      }
  
        dst_ptl = pmd_lock(dst_mm, dst_pmd);
        src_ptl = pmd_lockptr(src_mm, src_pmd);
                goto out_unlock;
        }
  
 -      if (pmd_trans_huge(pmd)) {
 +      if (!vma_is_dax(vma)) {
                /* thp accounting separate from pmd_devmap accounting */
                src_page = pmd_page(pmd);
                VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@@ -1560,8 -1560,7 +1560,8 @@@ int madvise_free_huge_pmd(struct mmu_ga
        struct mm_struct *mm = tlb->mm;
        int ret = 0;
  
 -      if (!pmd_trans_huge_lock(pmd, vma, &ptl))
 +      ptl = pmd_trans_huge_lock(pmd, vma);
 +      if (!ptl)
                goto out_unlocked;
  
        orig_pmd = *pmd;
@@@ -1628,8 -1627,7 +1628,8 @@@ int zap_huge_pmd(struct mmu_gather *tlb
        pmd_t orig_pmd;
        spinlock_t *ptl;
  
 -      if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
 +      ptl = __pmd_trans_huge_lock(pmd, vma);
 +      if (!ptl)
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
@@@ -1692,8 -1690,7 +1692,8 @@@ bool move_huge_pmd(struct vm_area_struc
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_sem prevents deadlock.
         */
 -      if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
 +      old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
 +      if (old_ptl) {
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@@ -1727,8 -1724,7 +1727,8 @@@ int change_huge_pmd(struct vm_area_stru
        spinlock_t *ptl;
        int ret = 0;
  
 -      if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
 +      ptl = __pmd_trans_huge_lock(pmd, vma);
 +      if (ptl) {
                pmd_t entry;
                bool preserve_write = prot_numa && pmd_write(*pmd);
                ret = 1;
   * Note that if it returns true, this routine returns without unlocking page
   * table lock. So callers must unlock it.
   */
 -bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
 -              spinlock_t **ptl)
 +spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
  {
 -      *ptl = pmd_lock(vma->vm_mm, pmd);
 +      spinlock_t *ptl;
 +      ptl = pmd_lock(vma->vm_mm, pmd);
        if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
 -              return true;
 -      spin_unlock(*ptl);
 -      return false;
 +              return ptl;
 +      spin_unlock(ptl);
 +      return NULL;
  }
  
  #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@@ -2072,7 -2068,7 +2072,7 @@@ static int __collapse_huge_page_isolate
        if (likely(writable)) {
                if (likely(referenced)) {
                        result = SCAN_SUCCEED;
 -                      trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
 +                      trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                                            referenced, writable, result);
                        return 1;
                }
  
  out:
        release_pte_pages(pte, _pte);
 -      trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
 +      trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                            referenced, writable, result);
        return 0;
  }
@@@ -2580,7 -2576,7 +2580,7 @@@ out_unmap
                collapse_huge_page(mm, address, hpage, vma, node);
        }
  out:
 -      trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
 +      trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
                                     none_or_zero, result);
        return ret;
  }
@@@ -2860,6 -2856,7 +2860,7 @@@ static void __split_huge_pmd_locked(str
        young = pmd_young(*pmd);
        dirty = pmd_dirty(*pmd);
  
+       pmdp_huge_split_prepare(vma, haddr, pmd);
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
  
@@@ -3358,11 -3355,9 +3359,11 @@@ int total_mapcount(struct page *page
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
        struct page *head = compound_head(page);
 +      struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
        struct anon_vma *anon_vma;
        int count, mapcount, ret;
        bool mlocked;
 +      unsigned long flags;
  
        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
        VM_BUG_ON_PAGE(!PageAnon(page), page);
                lru_add_drain();
  
        /* Prevent deferred_split_scan() touching ->_count */
 -      spin_lock(&split_queue_lock);
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        count = page_count(head);
        mapcount = total_mapcount(head);
        if (!mapcount && count == 1) {
                if (!list_empty(page_deferred_list(head))) {
 -                      split_queue_len--;
 +                      pgdata->split_queue_len--;
                        list_del(page_deferred_list(head));
                }
 -              spin_unlock(&split_queue_lock);
 +              spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                __split_huge_page(page, list);
                ret = 0;
        } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
 -              spin_unlock(&split_queue_lock);
 +              spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                pr_alert("total_mapcount: %u, page_count(): %u\n",
                                mapcount, count);
                if (PageTail(page))
                dump_page(page, "total_mapcount(head) > 0");
                BUG();
        } else {
 -              spin_unlock(&split_queue_lock);
 +              spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                unfreeze_page(anon_vma, head);
                ret = -EBUSY;
        }
  
  void free_transhuge_page(struct page *page)
  {
 +      struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
        unsigned long flags;
  
 -      spin_lock_irqsave(&split_queue_lock, flags);
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        if (!list_empty(page_deferred_list(page))) {
 -              split_queue_len--;
 +              pgdata->split_queue_len--;
                list_del(page_deferred_list(page));
        }
 -      spin_unlock_irqrestore(&split_queue_lock, flags);
 +      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
        free_compound_page(page);
  }
  
  void deferred_split_huge_page(struct page *page)
  {
 +      struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
        unsigned long flags;
  
        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  
 -      spin_lock_irqsave(&split_queue_lock, flags);
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        if (list_empty(page_deferred_list(page))) {
 -              list_add_tail(page_deferred_list(page), &split_queue);
 -              split_queue_len++;
 +              list_add_tail(page_deferred_list(page), &pgdata->split_queue);
 +              pgdata->split_queue_len++;
        }
 -      spin_unlock_irqrestore(&split_queue_lock, flags);
 +      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  }
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
                struct shrink_control *sc)
  {
 -      /*
 -       * Split a page from split_queue will free up at least one page,
 -       * at most HPAGE_PMD_NR - 1. We don't track exact number.
 -       * Let's use HPAGE_PMD_NR / 2 as ballpark.
 -       */
 -      return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
 +      struct pglist_data *pgdata = NODE_DATA(sc->nid);
 +      return ACCESS_ONCE(pgdata->split_queue_len);
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
  {
 +      struct pglist_data *pgdata = NODE_DATA(sc->nid);
        unsigned long flags;
        LIST_HEAD(list), *pos, *next;
        struct page *page;
        int split = 0;
  
 -      spin_lock_irqsave(&split_queue_lock, flags);
 -      list_splice_init(&split_queue, &list);
 -
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        /* Take pin on all head pages to avoid freeing them under us */
 -      list_for_each_safe(pos, next, &list) {
 +      list_for_each_safe(pos, next, &pgdata->split_queue) {
                page = list_entry((void *)pos, struct page, mapping);
                page = compound_head(page);
 -              /* race with put_compound_page() */
 -              if (!get_page_unless_zero(page)) {
 +              if (get_page_unless_zero(page)) {
 +                      list_move(page_deferred_list(page), &list);
 +              } else {
 +                      /* We lost race with put_compound_page() */
                        list_del_init(page_deferred_list(page));
 -                      split_queue_len--;
 +                      pgdata->split_queue_len--;
                }
 +              if (!--sc->nr_to_scan)
 +                      break;
        }
 -      spin_unlock_irqrestore(&split_queue_lock, flags);
 +      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  
        list_for_each_safe(pos, next, &list) {
                page = list_entry((void *)pos, struct page, mapping);
                put_page(page);
        }
  
 -      spin_lock_irqsave(&split_queue_lock, flags);
 -      list_splice_tail(&list, &split_queue);
 -      spin_unlock_irqrestore(&split_queue_lock, flags);
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 +      list_splice_tail(&list, &pgdata->split_queue);
 +      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  
 -      return split * HPAGE_PMD_NR / 2;
 +      /*
 +       * Stop shrinker if we didn't split any page, but the queue is empty.
 +       * This can happen if pages were freed under us.
 +       */
 +      if (!split && list_empty(&pgdata->split_queue))
 +              return SHRINK_STOP;
 +      return split;
  }
  
  static struct shrinker deferred_split_shrinker = {
        .count_objects = deferred_split_count,
        .scan_objects = deferred_split_scan,
        .seeks = DEFAULT_SEEKS,
 +      .flags = SHRINKER_NUMA_AWARE,
  };
  
  #ifdef CONFIG_DEBUG_FS