Merge tag 'powerpc-4.5-4' into next
author Michael Ellerman <[email protected]>
Thu, 25 Feb 2016 10:52:58 +0000 (21:52 +1100)
committer Michael Ellerman <[email protected]>
Thu, 25 Feb 2016 10:52:58 +0000 (21:52 +1100)
Pull in our current fixes from 4.5; in particular, the "Fix Multi hit
ERAT" bug is causing folks some grief when testing next.

arch/powerpc/Kconfig
arch/powerpc/kernel/eeh_driver.c
arch/powerpc/platforms/powernv/eeh-powernv.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci.c
arch/powerpc/platforms/powernv/pci.h
mm/huge_memory.c

diff --combined arch/powerpc/Kconfig
index e4824fd04bb7449d262c1a7697b5f539b03f6bab,5ead6a31854bf5987994e6e118ece6aebf8f920f..9faa18c4f3f702adceb4f555b05b72bc8437cf6c
@@@ -108,6 -108,7 +108,6 @@@ config PP
        select HAVE_ARCH_TRACEHOOK
        select HAVE_MEMBLOCK
        select HAVE_MEMBLOCK_NODE_MAP
 -      select HAVE_DMA_ATTRS
        select HAVE_DMA_API_DEBUG
        select HAVE_OPROFILE
        select HAVE_DEBUG_KMEMLEAK
        select ARCH_HAS_DMA_SET_COHERENT_MASK
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select HAVE_ARCH_SECCOMP_FILTER
 +      select ARCH_HAS_UBSAN_SANITIZE_ALL
  
  config GENERIC_CSUM
        def_bool CPU_LITTLE_ENDIAN
@@@ -557,7 -557,7 +557,7 @@@ choic
  
  config PPC_4K_PAGES
        bool "4k page size"
-       select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+       select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
  
  config PPC_16K_PAGES
        bool "16k page size"
  config PPC_64K_PAGES
        bool "64k page size"
        depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64)
-       select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+       select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
  
  config PPC_256K_PAGES
        bool "256k page size"
index 938742135ee08fc8dd058df690cfba7eacdabc0b,52c1e273f8cd5d5d641e1f2bc7591ff8b0efdbd9..650cfb31ea3d9b3cee0301948bd85e54400cb951
@@@ -400,7 -400,7 +400,7 @@@ static void *eeh_rmv_device(void *data
         * support EEH. So we just care about PCI devices for
         * simplicity here.
         */
 -      if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
 +      if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
                return NULL;
  
        /*
                eeh_pcid_put(dev);
                if (driver->err_handler &&
                    driver->err_handler->error_detected &&
-                   driver->err_handler->slot_reset &&
-                   driver->err_handler->resume)
+                   driver->err_handler->slot_reset)
                        return NULL;
        }
  
@@@ -564,6 -563,7 +563,7 @@@ static int eeh_reset_device(struct eeh_
         */
        eeh_pe_state_mark(pe, EEH_PE_KEEP);
        if (bus) {
+               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                pci_lock_rescan_remove();
                pcibios_remove_pci_devices(bus);
                pci_unlock_rescan_remove();
@@@ -803,6 -803,7 +803,7 @@@ perm_error
         * the their PCI config any more.
         */
        if (frozen_bus) {
+               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
  
                pci_lock_rescan_remove();
@@@ -886,6 -887,7 +887,7 @@@ static void eeh_handle_special_event(vo
                                        continue;
  
                                /* Notify all devices to be down */
+                               eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
                                bus = eeh_pe_bus_get(phb_pe);
                                eeh_pe_dev_traverse(pe,
                                        eeh_report_failure, NULL);
index 3f1cb35d9cdf05fa91b00d09be32189fd0ebbac3,87f47e55aab65ac234df1d67c926ca17b1517f06..811917219bf11317ec8d4ee32df3ddd49041ce5c
@@@ -167,26 -167,42 +167,26 @@@ static int pnv_eeh_dbgfs_get(void *data
        return 0;
  }
  
 -static int pnv_eeh_outb_dbgfs_set(void *data, u64 val)
 -{
 -      return pnv_eeh_dbgfs_set(data, 0xD10, val);
 -}
 -
 -static int pnv_eeh_outb_dbgfs_get(void *data, u64 *val)
 -{
 -      return pnv_eeh_dbgfs_get(data, 0xD10, val);
 -}
 -
 -static int pnv_eeh_inbA_dbgfs_set(void *data, u64 val)
 -{
 -      return pnv_eeh_dbgfs_set(data, 0xD90, val);
 -}
 -
 -static int pnv_eeh_inbA_dbgfs_get(void *data, u64 *val)
 -{
 -      return pnv_eeh_dbgfs_get(data, 0xD90, val);
 -}
 -
 -static int pnv_eeh_inbB_dbgfs_set(void *data, u64 val)
 -{
 -      return pnv_eeh_dbgfs_set(data, 0xE10, val);
 -}
 -
 -static int pnv_eeh_inbB_dbgfs_get(void *data, u64 *val)
 -{
 -      return pnv_eeh_dbgfs_get(data, 0xE10, val);
 -}
 +#define PNV_EEH_DBGFS_ENTRY(name, reg)                                \
 +static int pnv_eeh_dbgfs_set_##name(void *data, u64 val)      \
 +{                                                             \
 +      return pnv_eeh_dbgfs_set(data, reg, val);               \
 +}                                                             \
 +                                                              \
 +static int pnv_eeh_dbgfs_get_##name(void *data, u64 *val)     \
 +{                                                             \
 +      return pnv_eeh_dbgfs_get(data, reg, val);               \
 +}                                                             \
 +                                                              \
 +DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_dbgfs_ops_##name,             \
 +                      pnv_eeh_dbgfs_get_##name,               \
 +                        pnv_eeh_dbgfs_set_##name,             \
 +                      "0x%llx\n")
 +
 +PNV_EEH_DBGFS_ENTRY(outb, 0xD10);
 +PNV_EEH_DBGFS_ENTRY(inbA, 0xD90);
 +PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
  
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_outb_dbgfs_ops, pnv_eeh_outb_dbgfs_get,
 -                      pnv_eeh_outb_dbgfs_set, "0x%llx\n");
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbA_dbgfs_ops, pnv_eeh_inbA_dbgfs_get,
 -                      pnv_eeh_inbA_dbgfs_set, "0x%llx\n");
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbB_dbgfs_ops, pnv_eeh_inbB_dbgfs_get,
 -                      pnv_eeh_inbB_dbgfs_set, "0x%llx\n");
  #endif /* CONFIG_DEBUG_FS */
  
  /**
@@@ -252,13 -268,13 +252,13 @@@ static int pnv_eeh_post_init(void
  
                debugfs_create_file("err_injct_outbound", 0600,
                                    phb->dbgfs, hose,
 -                                  &pnv_eeh_outb_dbgfs_ops);
 +                                  &pnv_eeh_dbgfs_ops_outb);
                debugfs_create_file("err_injct_inboundA", 0600,
                                    phb->dbgfs, hose,
 -                                  &pnv_eeh_inbA_dbgfs_ops);
 +                                  &pnv_eeh_dbgfs_ops_inbA);
                debugfs_create_file("err_injct_inboundB", 0600,
                                    phb->dbgfs, hose,
 -                                  &pnv_eeh_inbB_dbgfs_ops);
 +                                  &pnv_eeh_dbgfs_ops_inbB);
  #endif /* CONFIG_DEBUG_FS */
        }
  
@@@ -428,9 -444,12 +428,12 @@@ static void *pnv_eeh_probe(struct pci_d
         * PCI devices of the PE are expected to be removed prior
         * to PE reset.
         */
-       if (!edev->pe->bus)
+       if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
                edev->pe->bus = pci_find_bus(hose->global_number,
                                             pdn->busno);
+               if (edev->pe->bus)
+                       edev->pe->state |= EEH_PE_PRI_BUS;
+       }
  
        /*
         * Enable EEH explicitly so that we will do EEH check
index dc868586315d02378e8506589ec107e7d35801de,f90dc04395bf47bcc0e662a7a1c17526220ccda2..c5baaf3cc4e5ef565bcaadeb125bd6ac2138a4c9
@@@ -872,6 -872,9 +872,6 @@@ static int pnv_pci_vf_resource_shift(st
                if (!res->flags || !res->parent)
                        continue;
  
 -              if (!pnv_pci_is_mem_pref_64(res->flags))
 -                      continue;
 -
                /*
                 * The actual IOV BAR range is determined by the start address
                 * and the actual size for num_vfs VFs BAR.  This check is to
                if (!res->flags || !res->parent)
                        continue;
  
 -              if (!pnv_pci_is_mem_pref_64(res->flags))
 -                      continue;
 -
                size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
                res2 = *res;
                res->start += size * offset;
@@@ -1190,36 -1196,29 +1190,36 @@@ static void pnv_pci_ioda_setup_PEs(void
  }
  
  #ifdef CONFIG_PCI_IOV
 -static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
 +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
  {
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        int                    i, j;
 +      int                    m64_bars;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        phb = hose->private_data;
        pdn = pci_get_pdn(pdev);
  
 +      if (pdn->m64_single_mode)
 +              m64_bars = num_vfs;
 +      else
 +              m64_bars = 1;
 +
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
 -              for (j = 0; j < M64_PER_IOV; j++) {
 -                      if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
 +              for (j = 0; j < m64_bars; j++) {
 +                      if (pdn->m64_map[j][i] == IODA_INVALID_M64)
                                continue;
                        opal_pci_phb_mmio_enable(phb->opal_id,
 -                              OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
 -                      clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
 -                      pdn->m64_wins[i][j] = IODA_INVALID_M64;
 +                              OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
 +                      clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
 +                      pdn->m64_map[j][i] = IODA_INVALID_M64;
                }
  
 +      kfree(pdn->m64_map);
        return 0;
  }
  
@@@ -1236,7 -1235,8 +1236,7 @@@ static int pnv_pci_vf_assign_m64(struc
        int                    total_vfs;
        resource_size_t        size, start;
        int                    pe_num;
 -      int                    vf_groups;
 -      int                    vf_per_group;
 +      int                    m64_bars;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        pdn = pci_get_pdn(pdev);
        total_vfs = pci_sriov_get_totalvfs(pdev);
  
 -      /* Initialize the m64_wins to IODA_INVALID_M64 */
 -      for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
 -              for (j = 0; j < M64_PER_IOV; j++)
 -                      pdn->m64_wins[i][j] = IODA_INVALID_M64;
 +      if (pdn->m64_single_mode)
 +              m64_bars = num_vfs;
 +      else
 +              m64_bars = 1;
 +
 +      pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
 +      if (!pdn->m64_map)
 +              return -ENOMEM;
 +      /* Initialize the m64_map to IODA_INVALID_M64 */
 +      for (i = 0; i < m64_bars ; i++)
 +              for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
 +                      pdn->m64_map[i][j] = IODA_INVALID_M64;
  
 -      if (pdn->m64_per_iov == M64_PER_IOV) {
 -              vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
 -              vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
 -                      roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -      } else {
 -              vf_groups = 1;
 -              vf_per_group = 1;
 -      }
  
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || !res->parent)
                        continue;
  
 -              if (!pnv_pci_is_mem_pref_64(res->flags))
 -                      continue;
 -
 -              for (j = 0; j < vf_groups; j++) {
 +              for (j = 0; j < m64_bars; j++) {
                        do {
                                win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
                                                phb->ioda.m64_bar_idx + 1, 0);
                                        goto m64_failed;
                        } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
  
 -                      pdn->m64_wins[i][j] = win;
 +                      pdn->m64_map[j][i] = win;
  
 -                      if (pdn->m64_per_iov == M64_PER_IOV) {
 +                      if (pdn->m64_single_mode) {
                                size = pci_iov_resource_size(pdev,
                                                        PCI_IOV_RESOURCES + i);
 -                              size = size * vf_per_group;
                                start = res->start + size * j;
                        } else {
                                size = resource_size(res);
                        }
  
                        /* Map the M64 here */
 -                      if (pdn->m64_per_iov == M64_PER_IOV) {
 -                              pe_num = pdn->offset + j;
 +                      if (pdn->m64_single_mode) {
 +                              pe_num = pdn->pe_num_map[j];
                                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                                pe_num, OPAL_M64_WINDOW_TYPE,
 -                                              pdn->m64_wins[i][j], 0);
 +                                              pdn->m64_map[j][i], 0);
                        }
  
                        rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                                 OPAL_M64_WINDOW_TYPE,
 -                                               pdn->m64_wins[i][j],
 +                                               pdn->m64_map[j][i],
                                                 start,
                                                 0, /* unused */
                                                 size);
                                goto m64_failed;
                        }
  
 -                      if (pdn->m64_per_iov == M64_PER_IOV)
 +                      if (pdn->m64_single_mode)
                                rc = opal_pci_phb_mmio_enable(phb->opal_id,
 -                                   OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
 +                                   OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
                        else
                                rc = opal_pci_phb_mmio_enable(phb->opal_id,
 -                                   OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
 +                                   OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
  
                        if (rc != OPAL_SUCCESS) {
                                dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
        return 0;
  
  m64_failed:
 -      pnv_pci_vf_release_m64(pdev);
 +      pnv_pci_vf_release_m64(pdev, num_vfs);
        return -EBUSY;
  }
  
@@@ -1349,13 -1353,15 +1349,13 @@@ static void pnv_pci_ioda2_release_dma_p
        iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
  }
  
 -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
  {
        struct pci_bus        *bus;
        struct pci_controller *hose;
        struct pnv_phb        *phb;
        struct pnv_ioda_pe    *pe, *pe_n;
        struct pci_dn         *pdn;
 -      u16                    vf_index;
 -      int64_t                rc;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        if (!pdev->is_physfn)
                return;
  
 -      if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
 -              int   vf_group;
 -              int   vf_per_group;
 -              int   vf_index1;
 -
 -              vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -
 -              for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
 -                      for (vf_index = vf_group * vf_per_group;
 -                              vf_index < (vf_group + 1) * vf_per_group &&
 -                              vf_index < num_vfs;
 -                              vf_index++)
 -                              for (vf_index1 = vf_group * vf_per_group;
 -                                      vf_index1 < (vf_group + 1) * vf_per_group &&
 -                                      vf_index1 < num_vfs;
 -                                      vf_index1++){
 -
 -                                      rc = opal_pci_set_peltv(phb->opal_id,
 -                                              pdn->offset + vf_index,
 -                                              pdn->offset + vf_index1,
 -                                              OPAL_REMOVE_PE_FROM_DOMAIN);
 -
 -                                      if (rc)
 -                                          dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
 -                                              __func__,
 -                                              pdn->offset + vf_index1, rc);
 -                              }
 -      }
 -
        list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
                if (pe->parent_dev != pdev)
                        continue;
@@@ -1389,7 -1424,7 +1389,7 @@@ void pnv_pci_sriov_disable(struct pci_d
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        struct pci_sriov      *iov;
 -      u16 num_vfs;
 +      u16                    num_vfs, i;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        num_vfs = pdn->num_vfs;
  
        /* Release VF PEs */
 -      pnv_ioda_release_vf_PE(pdev, num_vfs);
 +      pnv_ioda_release_vf_PE(pdev);
  
        if (phb->type == PNV_PHB_IODA2) {
 -              if (pdn->m64_per_iov == 1)
 -                      pnv_pci_vf_resource_shift(pdev, -pdn->offset);
 +              if (!pdn->m64_single_mode)
 +                      pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
  
                /* Release M64 windows */
 -              pnv_pci_vf_release_m64(pdev);
 +              pnv_pci_vf_release_m64(pdev, num_vfs);
  
                /* Release PE numbers */
 -              bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
 -              pdn->offset = 0;
 +              if (pdn->m64_single_mode) {
 +                      for (i = 0; i < num_vfs; i++) {
 +                              if (pdn->pe_num_map[i] != IODA_INVALID_PE)
 +                                      pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
 +                      }
 +              } else
 +                      bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
 +              /* Releasing pe_num_map */
 +              kfree(pdn->pe_num_map);
        }
  }
  
@@@ -1432,6 -1460,7 +1432,6 @@@ static void pnv_ioda_setup_vf_PE(struc
        int                    pe_num;
        u16                    vf_index;
        struct pci_dn         *pdn;
 -      int64_t                rc;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
  
        /* Reserve PE for each VF */
        for (vf_index = 0; vf_index < num_vfs; vf_index++) {
 -              pe_num = pdn->offset + vf_index;
 +              if (pdn->m64_single_mode)
 +                      pe_num = pdn->pe_num_map[vf_index];
 +              else
 +                      pe_num = *pdn->pe_num_map + vf_index;
  
                pe = &phb->ioda.pe_array[pe_num];
                pe->pe_number = pe_num;
  
                pnv_pci_ioda2_setup_dma_pe(phb, pe);
        }
 -
 -      if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
 -              int   vf_group;
 -              int   vf_per_group;
 -              int   vf_index1;
 -
 -              vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -
 -              for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
 -                      for (vf_index = vf_group * vf_per_group;
 -                           vf_index < (vf_group + 1) * vf_per_group &&
 -                           vf_index < num_vfs;
 -                           vf_index++) {
 -                              for (vf_index1 = vf_group * vf_per_group;
 -                                   vf_index1 < (vf_group + 1) * vf_per_group &&
 -                                   vf_index1 < num_vfs;
 -                                   vf_index1++) {
 -
 -                                      rc = opal_pci_set_peltv(phb->opal_id,
 -                                              pdn->offset + vf_index,
 -                                              pdn->offset + vf_index1,
 -                                              OPAL_ADD_PE_TO_DOMAIN);
 -
 -                                      if (rc)
 -                                          dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
 -                                              __func__,
 -                                              pdn->offset + vf_index1, rc);
 -                              }
 -                      }
 -              }
 -      }
  }
  
  int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
        struct pnv_phb        *phb;
        struct pci_dn         *pdn;
        int                    ret;
 +      u16                    i;
  
        bus = pdev->bus;
        hose = pci_bus_to_host(bus);
        pdn = pci_get_pdn(pdev);
  
        if (phb->type == PNV_PHB_IODA2) {
 +              if (!pdn->vfs_expanded) {
 +                      dev_info(&pdev->dev, "don't support this SRIOV device"
 +                              " with non 64bit-prefetchable IOV BAR\n");
 +                      return -ENOSPC;
 +              }
 +
 +              /*
 +               * When M64 BARs functions in Single PE mode, the number of VFs
 +               * could be enabled must be less than the number of M64 BARs.
 +               */
 +              if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
 +                      dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
 +                      return -EBUSY;
 +              }
 +
 +              /* Allocating pe_num_map */
 +              if (pdn->m64_single_mode)
 +                      pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map) * num_vfs,
 +                                      GFP_KERNEL);
 +              else
 +                      pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
 +
 +              if (!pdn->pe_num_map)
 +                      return -ENOMEM;
 +
 +              if (pdn->m64_single_mode)
 +                      for (i = 0; i < num_vfs; i++)
 +                              pdn->pe_num_map[i] = IODA_INVALID_PE;
 +
                /* Calculate available PE for required VFs */
 -              mutex_lock(&phb->ioda.pe_alloc_mutex);
 -              pdn->offset = bitmap_find_next_zero_area(
 -                      phb->ioda.pe_alloc, phb->ioda.total_pe,
 -                      0, num_vfs, 0);
 -              if (pdn->offset >= phb->ioda.total_pe) {
 +              if (pdn->m64_single_mode) {
 +                      for (i = 0; i < num_vfs; i++) {
 +                              pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb);
 +                              if (pdn->pe_num_map[i] == IODA_INVALID_PE) {
 +                                      ret = -EBUSY;
 +                                      goto m64_failed;
 +                              }
 +                      }
 +              } else {
 +                      mutex_lock(&phb->ioda.pe_alloc_mutex);
 +                      *pdn->pe_num_map = bitmap_find_next_zero_area(
 +                              phb->ioda.pe_alloc, phb->ioda.total_pe,
 +                              0, num_vfs, 0);
 +                      if (*pdn->pe_num_map >= phb->ioda.total_pe) {
 +                              mutex_unlock(&phb->ioda.pe_alloc_mutex);
 +                              dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
 +                              kfree(pdn->pe_num_map);
 +                              return -EBUSY;
 +                      }
 +                      bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
                        mutex_unlock(&phb->ioda.pe_alloc_mutex);
 -                      dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
 -                      pdn->offset = 0;
 -                      return -EBUSY;
                }
 -              bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
                pdn->num_vfs = num_vfs;
 -              mutex_unlock(&phb->ioda.pe_alloc_mutex);
  
                /* Assign M64 window accordingly */
                ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
                 * the IOV BAR according to the PE# allocated to the VFs.
                 * Otherwise, the PE# for the VF will conflict with others.
                 */
 -              if (pdn->m64_per_iov == 1) {
 -                      ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
 +              if (!pdn->m64_single_mode) {
 +                      ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
                        if (ret)
                                goto m64_failed;
                }
        return 0;
  
  m64_failed:
 -      bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
 -      pdn->offset = 0;
 +      if (pdn->m64_single_mode) {
 +              for (i = 0; i < num_vfs; i++) {
 +                      if (pdn->pe_num_map[i] != IODA_INVALID_PE)
 +                              pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
 +              }
 +      } else
 +              bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
 +
 +      /* Releasing pe_num_map */
 +      kfree(pdn->pe_num_map);
  
        return ret;
  }
@@@ -1603,7 -1612,8 +1603,7 @@@ int pcibios_sriov_enable(struct pci_de
        /* Allocate PCI data */
        add_dev_pci_data(pdev);
  
 -      pnv_pci_sriov_enable(pdev, num_vfs);
 -      return 0;
 +      return pnv_pci_sriov_enable(pdev, num_vfs);
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -2841,58 -2851,45 +2841,58 @@@ static void pnv_pci_init_ioda_msis(stru
  #ifdef CONFIG_PCI_IOV
  static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
  {
 -      struct pci_controller *hose;
 -      struct pnv_phb *phb;
 +      struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 +      struct pnv_phb *phb = hose->private_data;
 +      const resource_size_t gate = phb->ioda.m64_segsize >> 2;
        struct resource *res;
        int i;
 -      resource_size_t size;
 +      resource_size_t size, total_vf_bar_sz;
        struct pci_dn *pdn;
        int mul, total_vfs;
  
        if (!pdev->is_physfn || pdev->is_added)
                return;
  
 -      hose = pci_bus_to_host(pdev->bus);
 -      phb = hose->private_data;
 -
        pdn = pci_get_pdn(pdev);
        pdn->vfs_expanded = 0;
 +      pdn->m64_single_mode = false;
  
        total_vfs = pci_sriov_get_totalvfs(pdev);
 -      pdn->m64_per_iov = 1;
        mul = phb->ioda.total_pe;
 +      total_vf_bar_sz = 0;
  
        for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || res->parent)
                        continue;
                if (!pnv_pci_is_mem_pref_64(res->flags)) {
 -                      dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
 +                      dev_warn(&pdev->dev, "Don't support SR-IOV with"
 +                                      " non M64 VF BAR%d: %pR. \n",
                                 i, res);
 -                      continue;
 +                      goto truncate_iov;
                }
  
 -              size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 +              total_vf_bar_sz += pci_iov_resource_size(pdev,
 +                              i + PCI_IOV_RESOURCES);
  
 -              /* bigger than 64M */
 -              if (size > (1 << 26)) {
 -                      dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
 -                               i, res);
 -                      pdn->m64_per_iov = M64_PER_IOV;
 +              /*
 +               * If bigger than quarter of M64 segment size, just round up
 +               * power of two.
 +               *
 +               * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
 +               * with other devices, IOV BAR size is expanded to be
 +               * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
 +               * segment size , the expanded size would equal to half of the
 +               * whole M64 space size, which will exhaust the M64 Space and
 +               * limit the system flexibility.  This is a design decision to
 +               * set the boundary to quarter of the M64 segment size.
 +               */
 +              if (total_vf_bar_sz > gate) {
                        mul = roundup_pow_of_two(total_vfs);
 +                      dev_info(&pdev->dev,
 +                              "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
 +                              total_vf_bar_sz, gate, mul);
 +                      pdn->m64_single_mode = true;
                        break;
                }
        }
                res = &pdev->resource[i + PCI_IOV_RESOURCES];
                if (!res->flags || res->parent)
                        continue;
 -              if (!pnv_pci_is_mem_pref_64(res->flags)) {
 -                      dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
 -                               i, res);
 -                      continue;
 -              }
  
 -              dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
                size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 +              /*
 +               * On PHB3, the minimum size alignment of M64 BAR in single
 +               * mode is 32MB.
 +               */
 +              if (pdn->m64_single_mode && (size < SZ_32M))
 +                      goto truncate_iov;
 +              dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
                res->end = res->start + size * mul - 1;
                dev_dbg(&pdev->dev, "                       %pR\n", res);
                dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
                         i, res, mul);
        }
        pdn->vfs_expanded = mul;
 +
 +      return;
 +
 +truncate_iov:
 +      /* To save MMIO space, IOV BAR is truncated. */
 +      for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 +              res = &pdev->resource[i + PCI_IOV_RESOURCES];
 +              res->flags = 0;
 +              res->end = res->start - 1;
 +      }
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -3139,35 -3125,18 +3139,35 @@@ static resource_size_t pnv_pci_window_a
  static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
                                                      int resno)
  {
 +      struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 +      struct pnv_phb *phb = hose->private_data;
        struct pci_dn *pdn = pci_get_pdn(pdev);
 -      resource_size_t align, iov_align;
 -
 -      iov_align = resource_size(&pdev->resource[resno]);
 -      if (iov_align)
 -              return iov_align;
 +      resource_size_t align;
  
 +      /*
 +       * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
 +       * SR-IOV. While from hardware perspective, the range mapped by M64
 +       * BAR should be size aligned.
 +       *
 +       * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
 +       * powernv-specific hardware restriction is gone. But if just use the
 +       * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
 +       * in one segment of M64 #15, which introduces the PE conflict between
 +       * PF and VF. Based on this, the minimum alignment of an IOV BAR is
 +       * m64_segsize.
 +       *
 +       * This function returns the total IOV BAR size if M64 BAR is in
 +       * Shared PE mode or just VF BAR size if not.
 +       * If the M64 BAR is in Single PE mode, return the VF BAR size or
 +       * M64 segment size if IOV BAR size is less.
 +       */
        align = pci_iov_resource_size(pdev, resno);
 -      if (pdn->vfs_expanded)
 -              return pdn->vfs_expanded * align;
 +      if (!pdn->vfs_expanded)
 +              return align;
 +      if (pdn->m64_single_mode)
 +              return max(align, (resource_size_t)phb->ioda.m64_segsize);
  
 -      return align;
 +      return pdn->vfs_expanded * align;
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -3211,6 -3180,7 +3211,7 @@@ static void pnv_pci_ioda_shutdown(struc
  
  static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
         .dma_dev_setup = pnv_pci_dma_dev_setup,
+        .dma_bus_setup = pnv_pci_dma_bus_setup,
  #ifdef CONFIG_PCI_MSI
         .setup_msi_irqs = pnv_setup_msi_irqs,
         .teardown_msi_irqs = pnv_teardown_msi_irqs,
index 8de0140332b2a0a15af57d56a0431e50594c489f,b1ef84a6c9d13cff03c2d4a5234e57265ef5bc75..73c8dc2a353fdd540b9a05a5aa208b3e50970e47
@@@ -380,7 -380,10 +380,7 @@@ static void pnv_pci_config_check_eeh(st
         */
        pe_no = pdn->pe_number;
        if (pe_no == IODA_INVALID_PE) {
 -              if (phb->type == PNV_PHB_P5IOC2)
 -                      pe_no = 0;
 -              else
 -                      pe_no = phb->ioda.reserved_pe;
 +              pe_no = phb->ioda.reserved_pe;
        }
  
        /*
@@@ -596,6 -599,9 +596,9 @@@ int pnv_tce_build(struct iommu_table *t
        u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
        long i;
  
+       if (proto_tce & TCE_PCI_WRITE)
+               proto_tce |= TCE_PCI_READ;
        for (i = 0; i < npages; i++) {
                unsigned long newtce = proto_tce |
                        ((rpn + i) << tbl->it_page_shift);
@@@ -617,6 -623,9 +620,9 @@@ int pnv_tce_xchg(struct iommu_table *tb
  
        BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
  
+       if (newtce & TCE_PCI_WRITE)
+               newtce |= TCE_PCI_READ;
        oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
        *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
        *direction = iommu_tce_direction(oldtce);
@@@ -757,6 -766,26 +763,26 @@@ void pnv_pci_dma_dev_setup(struct pci_d
                phb->dma_dev_setup(phb, pdev);
  }
  
+ void pnv_pci_dma_bus_setup(struct pci_bus *bus)
+ {
+       struct pci_controller *hose = bus->sysdata;
+       struct pnv_phb *phb = hose->private_data;
+       struct pnv_ioda_pe *pe;
+       list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+               if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+                       continue;
+               if (!pe->pbus)
+                       continue;
+               if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+                       pe->pbus = bus;
+                       break;
+               }
+       }
+ }
  void pnv_pci_shutdown(void)
  {
        struct pci_controller *hose;
@@@ -776,6 -805,7 +802,6 @@@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_I
  void __init pnv_pci_init(void)
  {
        struct device_node *np;
 -      bool found_ioda = false;
  
        pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
  
        if (!firmware_has_feature(FW_FEATURE_OPAL))
                return;
  
 -      /* Look for IODA IO-Hubs. We don't support mixing IODA
 -       * and p5ioc2 due to the need to change some global
 -       * probing flags
 -       */
 +      /* Look for IODA IO-Hubs. */
        for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
                pnv_pci_init_ioda_hub(np);
 -              found_ioda = true;
        }
  
 -      /* Look for p5ioc2 IO-Hubs */
 -      if (!found_ioda)
 -              for_each_compatible_node(np, NULL, "ibm,p5ioc2")
 -                      pnv_pci_init_p5ioc2_hub(np);
 -
        /* Look for ioda2 built-in PHB3's */
        for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
                pnv_pci_init_ioda2_phb(np);
index 32cae3d8e01185e0cf72e59381e77a0857a0ba58,00691a9b99af67b09967c73b39b652b973392f06..3f814f382b2e793bf0b5374abb74bebf35246a98
@@@ -4,14 -4,16 +4,14 @@@
  struct pci_dn;
  
  enum pnv_phb_type {
 -      PNV_PHB_P5IOC2  = 0,
 -      PNV_PHB_IODA1   = 1,
 -      PNV_PHB_IODA2   = 2,
 -      PNV_PHB_NPU     = 3,
 +      PNV_PHB_IODA1   = 0,
 +      PNV_PHB_IODA2   = 1,
 +      PNV_PHB_NPU     = 2,
  };
  
  /* Precise PHB model for error management */
  enum pnv_phb_model {
        PNV_PHB_MODEL_UNKNOWN,
 -      PNV_PHB_MODEL_P5IOC2,
        PNV_PHB_MODEL_P7IOC,
        PNV_PHB_MODEL_PHB3,
        PNV_PHB_MODEL_NPU,
@@@ -119,74 -121,81 +119,74 @@@ struct pnv_phb 
        void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
        int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
  
 -      union {
 -              struct {
 -                      struct iommu_table iommu_table;
 -                      struct iommu_table_group table_group;
 -              } p5ioc2;
 -
 -              struct {
 -                      /* Global bridge info */
 -                      unsigned int            total_pe;
 -                      unsigned int            reserved_pe;
 -
 -                      /* 32-bit MMIO window */
 -                      unsigned int            m32_size;
 -                      unsigned int            m32_segsize;
 -                      unsigned int            m32_pci_base;
 -
 -                      /* 64-bit MMIO window */
 -                      unsigned int            m64_bar_idx;
 -                      unsigned long           m64_size;
 -                      unsigned long           m64_segsize;
 -                      unsigned long           m64_base;
 -                      unsigned long           m64_bar_alloc;
 -
 -                      /* IO ports */
 -                      unsigned int            io_size;
 -                      unsigned int            io_segsize;
 -                      unsigned int            io_pci_base;
 -
 -                      /* PE allocation bitmap */
 -                      unsigned long           *pe_alloc;
 -                      /* PE allocation mutex */
 -                      struct mutex            pe_alloc_mutex;
 -
 -                      /* M32 & IO segment maps */
 -                      unsigned int            *m32_segmap;
 -                      unsigned int            *io_segmap;
 -                      struct pnv_ioda_pe      *pe_array;
 -
 -                      /* IRQ chip */
 -                      int                     irq_chip_init;
 -                      struct irq_chip         irq_chip;
 -
 -                      /* Sorted list of used PE's based
 -                       * on the sequence of creation
 -                       */
 -                      struct list_head        pe_list;
 -                      struct mutex            pe_list_mutex;
 -
 -                      /* Reverse map of PEs, will have to extend if
 -                       * we are to support more than 256 PEs, indexed
 -                       * bus { bus, devfn }
 -                       */
 -                      unsigned char           pe_rmap[0x10000];
 -
 -                      /* 32-bit TCE tables allocation */
 -                      unsigned long           tce32_count;
 -
 -                      /* Total "weight" for the sake of DMA resources
 -                       * allocation
 -                       */
 -                      unsigned int            dma_weight;
 -                      unsigned int            dma_pe_count;
 -
 -                      /* Sorted list of used PE's, sorted at
 -                       * boot for resource allocation purposes
 -                       */
 -                      struct list_head        pe_dma_list;
 -
 -                      /* TCE cache invalidate registers (physical and
 -                       * remapped)
 -                       */
 -                      phys_addr_t             tce_inval_reg_phys;
 -                      __be64 __iomem          *tce_inval_reg;
 -              } ioda;
 -      };
 +      struct {
 +              /* Global bridge info */
 +              unsigned int            total_pe;
 +              unsigned int            reserved_pe;
 +
 +              /* 32-bit MMIO window */
 +              unsigned int            m32_size;
 +              unsigned int            m32_segsize;
 +              unsigned int            m32_pci_base;
 +
 +              /* 64-bit MMIO window */
 +              unsigned int            m64_bar_idx;
 +              unsigned long           m64_size;
 +              unsigned long           m64_segsize;
 +              unsigned long           m64_base;
 +              unsigned long           m64_bar_alloc;
 +
 +              /* IO ports */
 +              unsigned int            io_size;
 +              unsigned int            io_segsize;
 +              unsigned int            io_pci_base;
 +
 +              /* PE allocation bitmap */
 +              unsigned long           *pe_alloc;
 +              /* PE allocation mutex */
 +              struct mutex            pe_alloc_mutex;
 +
 +              /* M32 & IO segment maps */
 +              unsigned int            *m32_segmap;
 +              unsigned int            *io_segmap;
 +              struct pnv_ioda_pe      *pe_array;
 +
 +              /* IRQ chip */
 +              int                     irq_chip_init;
 +              struct irq_chip         irq_chip;
 +
 +              /* Sorted list of used PE's based
 +               * on the sequence of creation
 +               */
 +              struct list_head        pe_list;
 +              struct mutex            pe_list_mutex;
 +
 +              /* Reverse map of PEs, will have to extend if
 +               * we are to support more than 256 PEs, indexed
 +               * bus { bus, devfn }
 +               */
 +              unsigned char           pe_rmap[0x10000];
 +
 +              /* 32-bit TCE tables allocation */
 +              unsigned long           tce32_count;
 +
 +              /* Total "weight" for the sake of DMA resources
 +               * allocation
 +               */
 +              unsigned int            dma_weight;
 +              unsigned int            dma_pe_count;
 +
 +              /* Sorted list of used PE's, sorted at
 +               * boot for resource allocation purposes
 +               */
 +              struct list_head        pe_dma_list;
 +
 +              /* TCE cache invalidate registers (physical and
 +               * remapped)
 +               */
 +              phys_addr_t             tce_inval_reg_phys;
 +              __be64 __iomem          *tce_inval_reg;
 +      } ioda;
  
        /* PHB and hub status structure */
        union {
@@@ -223,6 -232,7 +223,6 @@@ extern void pnv_pci_unlink_table_and_gr
  extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
                                      void *tce_mem, u64 tce_size,
                                      u64 dma_offset, unsigned page_shift);
 -extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
  extern void pnv_pci_init_ioda_hub(struct device_node *np);
  extern void pnv_pci_init_ioda2_phb(struct device_node *np);
  extern void pnv_pci_init_npu_phb(struct device_node *np);
@@@ -232,6 -242,7 +232,7 @@@ extern void pnv_pci_reset_secondary_bus
  extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
  
  extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
+ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
  extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
  
diff --combined mm/huge_memory.c
index 08fc0ba2207e555a9c734524cd607dc8f53e644a,de3f43cde129f6da955b4477f97e19825a205712..aea8f7a42df97d7185f626d5bbc445c64f376eb1
@@@ -138,6 -138,9 +138,6 @@@ static struct khugepaged_scan khugepage
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  };
  
 -static DEFINE_SPINLOCK(split_queue_lock);
 -static LIST_HEAD(split_queue);
 -static unsigned long split_queue_len;
  static struct shrinker deferred_split_shrinker;
  
  static void set_recommended_min_free_kbytes(void)
@@@ -858,8 -861,7 +858,8 @@@ static bool set_huge_zero_page(pgtable_
                return false;
        entry = mk_pmd(zero_page, vma->vm_page_prot);
        entry = pmd_mkhuge(entry);
 -      pgtable_trans_huge_deposit(mm, pmd, pgtable);
 +      if (pgtable)
 +              pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
        atomic_long_inc(&mm->nr_ptes);
        return true;
@@@ -1037,15 -1039,13 +1037,15 @@@ int copy_huge_pmd(struct mm_struct *dst
        spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
        pmd_t pmd;
 -      pgtable_t pgtable;
 +      pgtable_t pgtable = NULL;
        int ret;
  
 -      ret = -ENOMEM;
 -      pgtable = pte_alloc_one(dst_mm, addr);
 -      if (unlikely(!pgtable))
 -              goto out;
 +      if (!vma_is_dax(vma)) {
 +              ret = -ENOMEM;
 +              pgtable = pte_alloc_one(dst_mm, addr);
 +              if (unlikely(!pgtable))
 +                      goto out;
 +      }
  
        dst_ptl = pmd_lock(dst_mm, dst_pmd);
        src_ptl = pmd_lockptr(src_mm, src_pmd);
                goto out_unlock;
        }
  
 -      if (pmd_trans_huge(pmd)) {
 +      if (!vma_is_dax(vma)) {
                /* thp accounting separate from pmd_devmap accounting */
                src_page = pmd_page(pmd);
                VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@@ -1560,8 -1560,7 +1560,8 @@@ int madvise_free_huge_pmd(struct mmu_ga
        struct mm_struct *mm = tlb->mm;
        int ret = 0;
  
 -      if (!pmd_trans_huge_lock(pmd, vma, &ptl))
 +      ptl = pmd_trans_huge_lock(pmd, vma);
 +      if (!ptl)
                goto out_unlocked;
  
        orig_pmd = *pmd;
@@@ -1628,8 -1627,7 +1628,8 @@@ int zap_huge_pmd(struct mmu_gather *tlb
        pmd_t orig_pmd;
        spinlock_t *ptl;
  
 -      if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
 +      ptl = __pmd_trans_huge_lock(pmd, vma);
 +      if (!ptl)
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
@@@ -1692,8 -1690,7 +1692,8 @@@ bool move_huge_pmd(struct vm_area_struc
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_sem prevents deadlock.
         */
 -      if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
 +      old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
 +      if (old_ptl) {
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@@ -1727,8 -1724,7 +1727,8 @@@ int change_huge_pmd(struct vm_area_stru
        spinlock_t *ptl;
        int ret = 0;
  
 -      if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
 +      ptl = __pmd_trans_huge_lock(pmd, vma);
 +      if (ptl) {
                pmd_t entry;
                bool preserve_write = prot_numa && pmd_write(*pmd);
                ret = 1;
   * Note that if it returns true, this routine returns without unlocking page
   * table lock. So callers must unlock it.
   */
 -bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
 -              spinlock_t **ptl)
 +spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
  {
 -      *ptl = pmd_lock(vma->vm_mm, pmd);
 +      spinlock_t *ptl;
 +      ptl = pmd_lock(vma->vm_mm, pmd);
        if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
 -              return true;
 -      spin_unlock(*ptl);
 -      return false;
 +              return ptl;
 +      spin_unlock(ptl);
 +      return NULL;
  }
  
  #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@@ -2072,7 -2068,7 +2072,7 @@@ static int __collapse_huge_page_isolate
        if (likely(writable)) {
                if (likely(referenced)) {
                        result = SCAN_SUCCEED;
 -                      trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
 +                      trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                                            referenced, writable, result);
                        return 1;
                }
  
  out:
        release_pte_pages(pte, _pte);
 -      trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
 +      trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                            referenced, writable, result);
        return 0;
  }
@@@ -2580,7 -2576,7 +2580,7 @@@ out_unmap
                collapse_huge_page(mm, address, hpage, vma, node);
        }
  out:
 -      trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
 +      trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
                                     none_or_zero, result);
        return ret;
  }
@@@ -2860,6 -2856,7 +2860,7 @@@ static void __split_huge_pmd_locked(str
        young = pmd_young(*pmd);
        dirty = pmd_dirty(*pmd);
  
+       pmdp_huge_split_prepare(vma, haddr, pmd);
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
  
@@@ -3358,11 -3355,9 +3359,11 @@@ int total_mapcount(struct page *page
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
        struct page *head = compound_head(page);
 +      struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
        struct anon_vma *anon_vma;
        int count, mapcount, ret;
        bool mlocked;
 +      unsigned long flags;
  
        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
        VM_BUG_ON_PAGE(!PageAnon(page), page);
                lru_add_drain();
  
        /* Prevent deferred_split_scan() touching ->_count */
 -      spin_lock(&split_queue_lock);
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        count = page_count(head);
        mapcount = total_mapcount(head);
        if (!mapcount && count == 1) {
                if (!list_empty(page_deferred_list(head))) {
 -                      split_queue_len--;
 +                      pgdata->split_queue_len--;
                        list_del(page_deferred_list(head));
                }
 -              spin_unlock(&split_queue_lock);
 +              spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                __split_huge_page(page, list);
                ret = 0;
        } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
 -              spin_unlock(&split_queue_lock);
 +              spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                pr_alert("total_mapcount: %u, page_count(): %u\n",
                                mapcount, count);
                if (PageTail(page))
                dump_page(page, "total_mapcount(head) > 0");
                BUG();
        } else {
 -              spin_unlock(&split_queue_lock);
 +              spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                unfreeze_page(anon_vma, head);
                ret = -EBUSY;
        }
  
  void free_transhuge_page(struct page *page)
  {
 +      struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
        unsigned long flags;
  
 -      spin_lock_irqsave(&split_queue_lock, flags);
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        if (!list_empty(page_deferred_list(page))) {
 -              split_queue_len--;
 +              pgdata->split_queue_len--;
                list_del(page_deferred_list(page));
        }
 -      spin_unlock_irqrestore(&split_queue_lock, flags);
 +      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
        free_compound_page(page);
  }
  
  void deferred_split_huge_page(struct page *page)
  {
 +      struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
        unsigned long flags;
  
        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  
 -      spin_lock_irqsave(&split_queue_lock, flags);
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        if (list_empty(page_deferred_list(page))) {
 -              list_add_tail(page_deferred_list(page), &split_queue);
 -              split_queue_len++;
 +              list_add_tail(page_deferred_list(page), &pgdata->split_queue);
 +              pgdata->split_queue_len++;
        }
 -      spin_unlock_irqrestore(&split_queue_lock, flags);
 +      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  }
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
                struct shrink_control *sc)
  {
 -      /*
 -       * Split a page from split_queue will free up at least one page,
 -       * at most HPAGE_PMD_NR - 1. We don't track exact number.
 -       * Let's use HPAGE_PMD_NR / 2 as ballpark.
 -       */
 -      return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
 +      struct pglist_data *pgdata = NODE_DATA(sc->nid);
 +      return ACCESS_ONCE(pgdata->split_queue_len);
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
  {
 +      struct pglist_data *pgdata = NODE_DATA(sc->nid);
        unsigned long flags;
        LIST_HEAD(list), *pos, *next;
        struct page *page;
        int split = 0;
  
 -      spin_lock_irqsave(&split_queue_lock, flags);
 -      list_splice_init(&split_queue, &list);
 -
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        /* Take pin on all head pages to avoid freeing them under us */
 -      list_for_each_safe(pos, next, &list) {
 +      list_for_each_safe(pos, next, &pgdata->split_queue) {
                page = list_entry((void *)pos, struct page, mapping);
                page = compound_head(page);
 -              /* race with put_compound_page() */
 -              if (!get_page_unless_zero(page)) {
 +              if (get_page_unless_zero(page)) {
 +                      list_move(page_deferred_list(page), &list);
 +              } else {
 +                      /* We lost race with put_compound_page() */
                        list_del_init(page_deferred_list(page));
 -                      split_queue_len--;
 +                      pgdata->split_queue_len--;
                }
 +              if (!--sc->nr_to_scan)
 +                      break;
        }
 -      spin_unlock_irqrestore(&split_queue_lock, flags);
 +      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  
        list_for_each_safe(pos, next, &list) {
                page = list_entry((void *)pos, struct page, mapping);
                put_page(page);
        }
  
 -      spin_lock_irqsave(&split_queue_lock, flags);
 -      list_splice_tail(&list, &split_queue);
 -      spin_unlock_irqrestore(&split_queue_lock, flags);
 +      spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 +      list_splice_tail(&list, &pgdata->split_queue);
 +      spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  
 -      return split * HPAGE_PMD_NR / 2;
 +      /*
 +       * Stop shrinker if we didn't split any page, but the queue is empty.
 +       * This can happen if pages were freed under us.
 +       */
 +      if (!split && list_empty(&pgdata->split_queue))
 +              return SHRINK_STOP;
 +      return split;
  }
  
  static struct shrinker deferred_split_shrinker = {
        .count_objects = deferred_split_count,
        .scan_objects = deferred_split_scan,
        .seeks = DEFAULT_SEEKS,
 +      .flags = SHRINKER_NUMA_AWARE,
  };
  
  #ifdef CONFIG_DEBUG_FS