From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 25 Feb 2016 10:52:58 +0000 (+1100)
Subject: Merge tag 'powerpc-4.5-4' into next
X-Git-Tag: v4.6-rc1~89^2~86
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/2527083cb831024d22a03f68f54f6a46ecf5bf6c?hp=-c

Merge tag 'powerpc-4.5-4' into next

Pull in our current fixes from 4.5; in particular, the bug addressed by
"Fix Multi hit ERAT" is causing folks some grief when testing next.
---

2527083cb831024d22a03f68f54f6a46ecf5bf6c
diff --combined arch/powerpc/Kconfig
index e4824fd04bb7,5ead6a31854b..9faa18c4f3f7
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@@ -108,6 -108,7 +108,6 @@@ config PP
  	select HAVE_ARCH_TRACEHOOK
  	select HAVE_MEMBLOCK
  	select HAVE_MEMBLOCK_NODE_MAP
 -	select HAVE_DMA_ATTRS
  	select HAVE_DMA_API_DEBUG
  	select HAVE_OPROFILE
  	select HAVE_DEBUG_KMEMLEAK
@@@ -157,7 -158,6 +157,7 @@@
  	select ARCH_HAS_DMA_SET_COHERENT_MASK
  	select ARCH_HAS_DEVMEM_IS_ALLOWED
  	select HAVE_ARCH_SECCOMP_FILTER
 +	select ARCH_HAS_UBSAN_SANITIZE_ALL
  
  config GENERIC_CSUM
  	def_bool CPU_LITTLE_ENDIAN
@@@ -557,7 -557,7 +557,7 @@@ choic
  
  config PPC_4K_PAGES
  	bool "4k page size"
- 	select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+ 	select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
  
  config PPC_16K_PAGES
  	bool "16k page size"
@@@ -566,7 -566,7 +566,7 @@@
  config PPC_64K_PAGES
  	bool "64k page size"
  	depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64)
- 	select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+ 	select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
  
  config PPC_256K_PAGES
  	bool "256k page size"
diff --combined arch/powerpc/kernel/eeh_driver.c
index 938742135ee0,52c1e273f8cd..650cfb31ea3d
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@@ -400,7 -400,7 +400,7 @@@ static void *eeh_rmv_device(void *data
  	 * support EEH. So we just care about PCI devices for
  	 * simplicity here.
  	 */
 -	if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
 +	if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
  		return NULL;
  
  	/*
@@@ -418,8 -418,7 +418,7 @@@
  		eeh_pcid_put(dev);
  		if (driver->err_handler &&
  		    driver->err_handler->error_detected &&
- 		    driver->err_handler->slot_reset &&
- 		    driver->err_handler->resume)
+ 		    driver->err_handler->slot_reset)
  			return NULL;
  	}
  
@@@ -564,6 -563,7 +563,7 @@@ static int eeh_reset_device(struct eeh_
  	 */
  	eeh_pe_state_mark(pe, EEH_PE_KEEP);
  	if (bus) {
+ 		eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
  		pci_lock_rescan_remove();
  		pcibios_remove_pci_devices(bus);
  		pci_unlock_rescan_remove();
@@@ -803,6 -803,7 +803,7 @@@ perm_error
  	 * the their PCI config any more.
  	 */
  	if (frozen_bus) {
+ 		eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
  		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
  
  		pci_lock_rescan_remove();
@@@ -886,6 -887,7 +887,7 @@@ static void eeh_handle_special_event(vo
  					continue;
  
  				/* Notify all devices to be down */
+ 				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
  				bus = eeh_pe_bus_get(phb_pe);
  				eeh_pe_dev_traverse(pe,
  					eeh_report_failure, NULL);
diff --combined arch/powerpc/platforms/powernv/eeh-powernv.c
index 3f1cb35d9cdf,87f47e55aab6..811917219bf1
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@@ -167,26 -167,42 +167,26 @@@ static int pnv_eeh_dbgfs_get(void *data
  	return 0;
  }
  
 -static int pnv_eeh_outb_dbgfs_set(void *data, u64 val)
 -{
 -	return pnv_eeh_dbgfs_set(data, 0xD10, val);
 -}
 -
 -static int pnv_eeh_outb_dbgfs_get(void *data, u64 *val)
 -{
 -	return pnv_eeh_dbgfs_get(data, 0xD10, val);
 -}
 -
 -static int pnv_eeh_inbA_dbgfs_set(void *data, u64 val)
 -{
 -	return pnv_eeh_dbgfs_set(data, 0xD90, val);
 -}
 -
 -static int pnv_eeh_inbA_dbgfs_get(void *data, u64 *val)
 -{
 -	return pnv_eeh_dbgfs_get(data, 0xD90, val);
 -}
 -
 -static int pnv_eeh_inbB_dbgfs_set(void *data, u64 val)
 -{
 -	return pnv_eeh_dbgfs_set(data, 0xE10, val);
 -}
 -
 -static int pnv_eeh_inbB_dbgfs_get(void *data, u64 *val)
 -{
 -	return pnv_eeh_dbgfs_get(data, 0xE10, val);
 -}
 +#define PNV_EEH_DBGFS_ENTRY(name, reg)				\
 +static int pnv_eeh_dbgfs_set_##name(void *data, u64 val)	\
 +{								\
 +	return pnv_eeh_dbgfs_set(data, reg, val);		\
 +}								\
 +								\
 +static int pnv_eeh_dbgfs_get_##name(void *data, u64 *val)	\
 +{								\
 +	return pnv_eeh_dbgfs_get(data, reg, val);		\
 +}								\
 +								\
 +DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_dbgfs_ops_##name,		\
 +			pnv_eeh_dbgfs_get_##name,		\
 +                        pnv_eeh_dbgfs_set_##name,		\
 +			"0x%llx\n")
 +
 +PNV_EEH_DBGFS_ENTRY(outb, 0xD10);
 +PNV_EEH_DBGFS_ENTRY(inbA, 0xD90);
 +PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
  
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_outb_dbgfs_ops, pnv_eeh_outb_dbgfs_get,
 -			pnv_eeh_outb_dbgfs_set, "0x%llx\n");
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbA_dbgfs_ops, pnv_eeh_inbA_dbgfs_get,
 -			pnv_eeh_inbA_dbgfs_set, "0x%llx\n");
 -DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbB_dbgfs_ops, pnv_eeh_inbB_dbgfs_get,
 -			pnv_eeh_inbB_dbgfs_set, "0x%llx\n");
  #endif /* CONFIG_DEBUG_FS */
  
  /**
@@@ -252,13 -268,13 +252,13 @@@ static int pnv_eeh_post_init(void
  
  		debugfs_create_file("err_injct_outbound", 0600,
  				    phb->dbgfs, hose,
 -				    &pnv_eeh_outb_dbgfs_ops);
 +				    &pnv_eeh_dbgfs_ops_outb);
  		debugfs_create_file("err_injct_inboundA", 0600,
  				    phb->dbgfs, hose,
 -				    &pnv_eeh_inbA_dbgfs_ops);
 +				    &pnv_eeh_dbgfs_ops_inbA);
  		debugfs_create_file("err_injct_inboundB", 0600,
  				    phb->dbgfs, hose,
 -				    &pnv_eeh_inbB_dbgfs_ops);
 +				    &pnv_eeh_dbgfs_ops_inbB);
  #endif /* CONFIG_DEBUG_FS */
  	}
  
@@@ -428,9 -444,12 +428,12 @@@ static void *pnv_eeh_probe(struct pci_d
  	 * PCI devices of the PE are expected to be removed prior
  	 * to PE reset.
  	 */
- 	if (!edev->pe->bus)
+ 	if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
  		edev->pe->bus = pci_find_bus(hose->global_number,
  					     pdn->busno);
+ 		if (edev->pe->bus)
+ 			edev->pe->state |= EEH_PE_PRI_BUS;
+ 	}
  
  	/*
  	 * Enable EEH explicitly so that we will do EEH check
diff --combined arch/powerpc/platforms/powernv/pci-ioda.c
index dc868586315d,f90dc04395bf..c5baaf3cc4e5
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@@ -872,6 -872,9 +872,6 @@@ static int pnv_pci_vf_resource_shift(st
  		if (!res->flags || !res->parent)
  			continue;
  
 -		if (!pnv_pci_is_mem_pref_64(res->flags))
 -			continue;
 -
  		/*
  		 * The actual IOV BAR range is determined by the start address
  		 * and the actual size for num_vfs VFs BAR.  This check is to
@@@ -900,6 -903,9 +900,6 @@@
  		if (!res->flags || !res->parent)
  			continue;
  
 -		if (!pnv_pci_is_mem_pref_64(res->flags))
 -			continue;
 -
  		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
  		res2 = *res;
  		res->start += size * offset;
@@@ -1190,36 -1196,29 +1190,36 @@@ static void pnv_pci_ioda_setup_PEs(void
  }
  
  #ifdef CONFIG_PCI_IOV
 -static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
 +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
  {
  	struct pci_bus        *bus;
  	struct pci_controller *hose;
  	struct pnv_phb        *phb;
  	struct pci_dn         *pdn;
  	int                    i, j;
 +	int                    m64_bars;
  
  	bus = pdev->bus;
  	hose = pci_bus_to_host(bus);
  	phb = hose->private_data;
  	pdn = pci_get_pdn(pdev);
  
 +	if (pdn->m64_single_mode)
 +		m64_bars = num_vfs;
 +	else
 +		m64_bars = 1;
 +
  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
 -		for (j = 0; j < M64_PER_IOV; j++) {
 -			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
 +		for (j = 0; j < m64_bars; j++) {
 +			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
  				continue;
  			opal_pci_phb_mmio_enable(phb->opal_id,
 -				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
 -			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
 -			pdn->m64_wins[i][j] = IODA_INVALID_M64;
 +				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
 +			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
 +			pdn->m64_map[j][i] = IODA_INVALID_M64;
  		}
  
 +	kfree(pdn->m64_map);
  	return 0;
  }
  
@@@ -1236,7 -1235,8 +1236,7 @@@ static int pnv_pci_vf_assign_m64(struc
  	int                    total_vfs;
  	resource_size_t        size, start;
  	int                    pe_num;
 -	int                    vf_groups;
 -	int                    vf_per_group;
 +	int                    m64_bars;
  
  	bus = pdev->bus;
  	hose = pci_bus_to_host(bus);
@@@ -1244,26 -1244,29 +1244,26 @@@
  	pdn = pci_get_pdn(pdev);
  	total_vfs = pci_sriov_get_totalvfs(pdev);
  
 -	/* Initialize the m64_wins to IODA_INVALID_M64 */
 -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
 -		for (j = 0; j < M64_PER_IOV; j++)
 -			pdn->m64_wins[i][j] = IODA_INVALID_M64;
 +	if (pdn->m64_single_mode)
 +		m64_bars = num_vfs;
 +	else
 +		m64_bars = 1;
 +
 +	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
 +	if (!pdn->m64_map)
 +		return -ENOMEM;
 +	/* Initialize the m64_map to IODA_INVALID_M64 */
 +	for (i = 0; i < m64_bars ; i++)
 +		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
 +			pdn->m64_map[i][j] = IODA_INVALID_M64;
  
 -	if (pdn->m64_per_iov == M64_PER_IOV) {
 -		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
 -		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
 -			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -	} else {
 -		vf_groups = 1;
 -		vf_per_group = 1;
 -	}
  
  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
  		res = &pdev->resource[i + PCI_IOV_RESOURCES];
  		if (!res->flags || !res->parent)
  			continue;
  
 -		if (!pnv_pci_is_mem_pref_64(res->flags))
 -			continue;
 -
 -		for (j = 0; j < vf_groups; j++) {
 +		for (j = 0; j < m64_bars; j++) {
  			do {
  				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
  						phb->ioda.m64_bar_idx + 1, 0);
@@@ -1272,11 -1275,12 +1272,11 @@@
  					goto m64_failed;
  			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
  
 -			pdn->m64_wins[i][j] = win;
 +			pdn->m64_map[j][i] = win;
  
 -			if (pdn->m64_per_iov == M64_PER_IOV) {
 +			if (pdn->m64_single_mode) {
  				size = pci_iov_resource_size(pdev,
  							PCI_IOV_RESOURCES + i);
 -				size = size * vf_per_group;
  				start = res->start + size * j;
  			} else {
  				size = resource_size(res);
@@@ -1284,16 -1288,16 +1284,16 @@@
  			}
  
  			/* Map the M64 here */
 -			if (pdn->m64_per_iov == M64_PER_IOV) {
 -				pe_num = pdn->offset + j;
 +			if (pdn->m64_single_mode) {
 +				pe_num = pdn->pe_num_map[j];
  				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
  						pe_num, OPAL_M64_WINDOW_TYPE,
 -						pdn->m64_wins[i][j], 0);
 +						pdn->m64_map[j][i], 0);
  			}
  
  			rc = opal_pci_set_phb_mem_window(phb->opal_id,
  						 OPAL_M64_WINDOW_TYPE,
 -						 pdn->m64_wins[i][j],
 +						 pdn->m64_map[j][i],
  						 start,
  						 0, /* unused */
  						 size);
@@@ -1305,12 -1309,12 +1305,12 @@@
  				goto m64_failed;
  			}
  
 -			if (pdn->m64_per_iov == M64_PER_IOV)
 +			if (pdn->m64_single_mode)
  				rc = opal_pci_phb_mmio_enable(phb->opal_id,
 -				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
 +				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
  			else
  				rc = opal_pci_phb_mmio_enable(phb->opal_id,
 -				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
 +				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
  
  			if (rc != OPAL_SUCCESS) {
  				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
@@@ -1322,7 -1326,7 +1322,7 @@@
  	return 0;
  
  m64_failed:
 -	pnv_pci_vf_release_m64(pdev);
 +	pnv_pci_vf_release_m64(pdev, num_vfs);
  	return -EBUSY;
  }
  
@@@ -1349,13 -1353,15 +1349,13 @@@ static void pnv_pci_ioda2_release_dma_p
  	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
  }
  
 -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
  {
  	struct pci_bus        *bus;
  	struct pci_controller *hose;
  	struct pnv_phb        *phb;
  	struct pnv_ioda_pe    *pe, *pe_n;
  	struct pci_dn         *pdn;
 -	u16                    vf_index;
 -	int64_t                rc;
  
  	bus = pdev->bus;
  	hose = pci_bus_to_host(bus);
@@@ -1365,6 -1371,35 +1365,6 @@@
  	if (!pdev->is_physfn)
  		return;
  
 -	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
 -		int   vf_group;
 -		int   vf_per_group;
 -		int   vf_index1;
 -
 -		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -
 -		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
 -			for (vf_index = vf_group * vf_per_group;
 -				vf_index < (vf_group + 1) * vf_per_group &&
 -				vf_index < num_vfs;
 -				vf_index++)
 -				for (vf_index1 = vf_group * vf_per_group;
 -					vf_index1 < (vf_group + 1) * vf_per_group &&
 -					vf_index1 < num_vfs;
 -					vf_index1++){
 -
 -					rc = opal_pci_set_peltv(phb->opal_id,
 -						pdn->offset + vf_index,
 -						pdn->offset + vf_index1,
 -						OPAL_REMOVE_PE_FROM_DOMAIN);
 -
 -					if (rc)
 -					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
 -						__func__,
 -						pdn->offset + vf_index1, rc);
 -				}
 -	}
 -
  	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
  		if (pe->parent_dev != pdev)
  			continue;
@@@ -1389,7 -1424,7 +1389,7 @@@ void pnv_pci_sriov_disable(struct pci_d
  	struct pnv_phb        *phb;
  	struct pci_dn         *pdn;
  	struct pci_sriov      *iov;
 -	u16 num_vfs;
 +	u16                    num_vfs, i;
  
  	bus = pdev->bus;
  	hose = pci_bus_to_host(bus);
@@@ -1399,25 -1434,18 +1399,25 @@@
  	num_vfs = pdn->num_vfs;
  
  	/* Release VF PEs */
 -	pnv_ioda_release_vf_PE(pdev, num_vfs);
 +	pnv_ioda_release_vf_PE(pdev);
  
  	if (phb->type == PNV_PHB_IODA2) {
 -		if (pdn->m64_per_iov == 1)
 -			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
 +		if (!pdn->m64_single_mode)
 +			pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
  
  		/* Release M64 windows */
 -		pnv_pci_vf_release_m64(pdev);
 +		pnv_pci_vf_release_m64(pdev, num_vfs);
  
  		/* Release PE numbers */
 -		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
 -		pdn->offset = 0;
 +		if (pdn->m64_single_mode) {
 +			for (i = 0; i < num_vfs; i++) {
 +				if (pdn->pe_num_map[i] != IODA_INVALID_PE)
 +					pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
 +			}
 +		} else
 +			bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
 +		/* Releasing pe_num_map */
 +		kfree(pdn->pe_num_map);
  	}
  }
  
@@@ -1432,6 -1460,7 +1432,6 @@@ static void pnv_ioda_setup_vf_PE(struc
  	int                    pe_num;
  	u16                    vf_index;
  	struct pci_dn         *pdn;
 -	int64_t                rc;
  
  	bus = pdev->bus;
  	hose = pci_bus_to_host(bus);
@@@ -1443,10 -1472,7 +1443,10 @@@
  
  	/* Reserve PE for each VF */
  	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
 -		pe_num = pdn->offset + vf_index;
 +		if (pdn->m64_single_mode)
 +			pe_num = pdn->pe_num_map[vf_index];
 +		else
 +			pe_num = *pdn->pe_num_map + vf_index;
  
  		pe = &phb->ioda.pe_array[pe_num];
  		pe->pe_number = pe_num;
@@@ -1479,6 -1505,37 +1479,6 @@@
  
  		pnv_pci_ioda2_setup_dma_pe(phb, pe);
  	}
 -
 -	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
 -		int   vf_group;
 -		int   vf_per_group;
 -		int   vf_index1;
 -
 -		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
 -
 -		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
 -			for (vf_index = vf_group * vf_per_group;
 -			     vf_index < (vf_group + 1) * vf_per_group &&
 -			     vf_index < num_vfs;
 -			     vf_index++) {
 -				for (vf_index1 = vf_group * vf_per_group;
 -				     vf_index1 < (vf_group + 1) * vf_per_group &&
 -				     vf_index1 < num_vfs;
 -				     vf_index1++) {
 -
 -					rc = opal_pci_set_peltv(phb->opal_id,
 -						pdn->offset + vf_index,
 -						pdn->offset + vf_index1,
 -						OPAL_ADD_PE_TO_DOMAIN);
 -
 -					if (rc)
 -					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
 -						__func__,
 -						pdn->offset + vf_index1, rc);
 -				}
 -			}
 -		}
 -	}
  }
  
  int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
@@@ -1488,7 -1545,6 +1488,7 @@@
  	struct pnv_phb        *phb;
  	struct pci_dn         *pdn;
  	int                    ret;
 +	u16                    i;
  
  	bus = pdev->bus;
  	hose = pci_bus_to_host(bus);
@@@ -1496,59 -1552,20 +1496,59 @@@
  	pdn = pci_get_pdn(pdev);
  
  	if (phb->type == PNV_PHB_IODA2) {
 +		if (!pdn->vfs_expanded) {
 +			dev_info(&pdev->dev, "don't support this SRIOV device"
 +				" with non 64bit-prefetchable IOV BAR\n");
 +			return -ENOSPC;
 +		}
 +
 +		/*
 +		 * When M64 BARs functions in Single PE mode, the number of VFs
 +		 * could be enabled must be less than the number of M64 BARs.
 +		 */
 +		if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
 +			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
 +			return -EBUSY;
 +		}
 +
 +		/* Allocating pe_num_map */
 +		if (pdn->m64_single_mode)
 +			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map) * num_vfs,
 +					GFP_KERNEL);
 +		else
 +			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
 +
 +		if (!pdn->pe_num_map)
 +			return -ENOMEM;
 +
 +		if (pdn->m64_single_mode)
 +			for (i = 0; i < num_vfs; i++)
 +				pdn->pe_num_map[i] = IODA_INVALID_PE;
 +
  		/* Calculate available PE for required VFs */
 -		mutex_lock(&phb->ioda.pe_alloc_mutex);
 -		pdn->offset = bitmap_find_next_zero_area(
 -			phb->ioda.pe_alloc, phb->ioda.total_pe,
 -			0, num_vfs, 0);
 -		if (pdn->offset >= phb->ioda.total_pe) {
 +		if (pdn->m64_single_mode) {
 +			for (i = 0; i < num_vfs; i++) {
 +				pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb);
 +				if (pdn->pe_num_map[i] == IODA_INVALID_PE) {
 +					ret = -EBUSY;
 +					goto m64_failed;
 +				}
 +			}
 +		} else {
 +			mutex_lock(&phb->ioda.pe_alloc_mutex);
 +			*pdn->pe_num_map = bitmap_find_next_zero_area(
 +				phb->ioda.pe_alloc, phb->ioda.total_pe,
 +				0, num_vfs, 0);
 +			if (*pdn->pe_num_map >= phb->ioda.total_pe) {
 +				mutex_unlock(&phb->ioda.pe_alloc_mutex);
 +				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
 +				kfree(pdn->pe_num_map);
 +				return -EBUSY;
 +			}
 +			bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
  			mutex_unlock(&phb->ioda.pe_alloc_mutex);
 -			dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
 -			pdn->offset = 0;
 -			return -EBUSY;
  		}
 -		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
  		pdn->num_vfs = num_vfs;
 -		mutex_unlock(&phb->ioda.pe_alloc_mutex);
  
  		/* Assign M64 window accordingly */
  		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
@@@ -1562,8 -1579,8 +1562,8 @@@
  		 * the IOV BAR according to the PE# allocated to the VFs.
  		 * Otherwise, the PE# for the VF will conflict with others.
  		 */
 -		if (pdn->m64_per_iov == 1) {
 -			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
 +		if (!pdn->m64_single_mode) {
 +			ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
  			if (ret)
  				goto m64_failed;
  		}
@@@ -1575,16 -1592,8 +1575,16 @@@
  	return 0;
  
  m64_failed:
 -	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
 -	pdn->offset = 0;
 +	if (pdn->m64_single_mode) {
 +		for (i = 0; i < num_vfs; i++) {
 +			if (pdn->pe_num_map[i] != IODA_INVALID_PE)
 +				pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
 +		}
 +	} else
 +		bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
 +
 +	/* Releasing pe_num_map */
 +	kfree(pdn->pe_num_map);
  
  	return ret;
  }
@@@ -1603,7 -1612,8 +1603,7 @@@ int pcibios_sriov_enable(struct pci_de
  	/* Allocate PCI data */
  	add_dev_pci_data(pdev);
  
 -	pnv_pci_sriov_enable(pdev, num_vfs);
 -	return 0;
 +	return pnv_pci_sriov_enable(pdev, num_vfs);
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -2841,58 -2851,45 +2841,58 @@@ static void pnv_pci_init_ioda_msis(stru
  #ifdef CONFIG_PCI_IOV
  static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
  {
 -	struct pci_controller *hose;
 -	struct pnv_phb *phb;
 +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 +	struct pnv_phb *phb = hose->private_data;
 +	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
  	struct resource *res;
  	int i;
 -	resource_size_t size;
 +	resource_size_t size, total_vf_bar_sz;
  	struct pci_dn *pdn;
  	int mul, total_vfs;
  
  	if (!pdev->is_physfn || pdev->is_added)
  		return;
  
 -	hose = pci_bus_to_host(pdev->bus);
 -	phb = hose->private_data;
 -
  	pdn = pci_get_pdn(pdev);
  	pdn->vfs_expanded = 0;
 +	pdn->m64_single_mode = false;
  
  	total_vfs = pci_sriov_get_totalvfs(pdev);
 -	pdn->m64_per_iov = 1;
  	mul = phb->ioda.total_pe;
 +	total_vf_bar_sz = 0;
  
  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
  		res = &pdev->resource[i + PCI_IOV_RESOURCES];
  		if (!res->flags || res->parent)
  			continue;
  		if (!pnv_pci_is_mem_pref_64(res->flags)) {
 -			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
 +			dev_warn(&pdev->dev, "Don't support SR-IOV with"
 +					" non M64 VF BAR%d: %pR. \n",
  				 i, res);
 -			continue;
 +			goto truncate_iov;
  		}
  
 -		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 +		total_vf_bar_sz += pci_iov_resource_size(pdev,
 +				i + PCI_IOV_RESOURCES);
  
 -		/* bigger than 64M */
 -		if (size > (1 << 26)) {
 -			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
 -				 i, res);
 -			pdn->m64_per_iov = M64_PER_IOV;
 +		/*
 +		 * If bigger than quarter of M64 segment size, just round up
 +		 * power of two.
 +		 *
 +		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
 +		 * with other devices, IOV BAR size is expanded to be
 +		 * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
 +		 * segment size , the expanded size would equal to half of the
 +		 * whole M64 space size, which will exhaust the M64 Space and
 +		 * limit the system flexibility.  This is a design decision to
 +		 * set the boundary to quarter of the M64 segment size.
 +		 */
 +		if (total_vf_bar_sz > gate) {
  			mul = roundup_pow_of_two(total_vfs);
 +			dev_info(&pdev->dev,
 +				"VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
 +				total_vf_bar_sz, gate, mul);
 +			pdn->m64_single_mode = true;
  			break;
  		}
  	}
@@@ -2901,31 -2898,20 +2901,31 @@@
  		res = &pdev->resource[i + PCI_IOV_RESOURCES];
  		if (!res->flags || res->parent)
  			continue;
 -		if (!pnv_pci_is_mem_pref_64(res->flags)) {
 -			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
 -				 i, res);
 -			continue;
 -		}
  
 -		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
  		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
 +		/*
 +		 * On PHB3, the minimum size alignment of M64 BAR in single
 +		 * mode is 32MB.
 +		 */
 +		if (pdn->m64_single_mode && (size < SZ_32M))
 +			goto truncate_iov;
 +		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
  		res->end = res->start + size * mul - 1;
  		dev_dbg(&pdev->dev, "                       %pR\n", res);
  		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
  			 i, res, mul);
  	}
  	pdn->vfs_expanded = mul;
 +
 +	return;
 +
 +truncate_iov:
 +	/* To save MMIO space, IOV BAR is truncated. */
 +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 +		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 +		res->flags = 0;
 +		res->end = res->start - 1;
 +	}
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -3139,35 -3125,18 +3139,35 @@@ static resource_size_t pnv_pci_window_a
  static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
  						      int resno)
  {
 +	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 +	struct pnv_phb *phb = hose->private_data;
  	struct pci_dn *pdn = pci_get_pdn(pdev);
 -	resource_size_t align, iov_align;
 -
 -	iov_align = resource_size(&pdev->resource[resno]);
 -	if (iov_align)
 -		return iov_align;
 +	resource_size_t align;
  
 +	/*
 +	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
 +	 * SR-IOV. While from hardware perspective, the range mapped by M64
 +	 * BAR should be size aligned.
 +	 *
 +	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
 +	 * powernv-specific hardware restriction is gone. But if just use the
 +	 * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
 +	 * in one segment of M64 #15, which introduces the PE conflict between
 +	 * PF and VF. Based on this, the minimum alignment of an IOV BAR is
 +	 * m64_segsize.
 +	 *
 +	 * This function returns the total IOV BAR size if M64 BAR is in
 +	 * Shared PE mode or just VF BAR size if not.
 +	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
 +	 * M64 segment size if IOV BAR size is less.
 +	 */
  	align = pci_iov_resource_size(pdev, resno);
 -	if (pdn->vfs_expanded)
 -		return pdn->vfs_expanded * align;
 +	if (!pdn->vfs_expanded)
 +		return align;
 +	if (pdn->m64_single_mode)
 +		return max(align, (resource_size_t)phb->ioda.m64_segsize);
  
 -	return align;
 +	return pdn->vfs_expanded * align;
  }
  #endif /* CONFIG_PCI_IOV */
  
@@@ -3211,6 -3180,7 +3211,7 @@@ static void pnv_pci_ioda_shutdown(struc
  
  static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
         .dma_dev_setup = pnv_pci_dma_dev_setup,
+        .dma_bus_setup = pnv_pci_dma_bus_setup,
  #ifdef CONFIG_PCI_MSI
         .setup_msi_irqs = pnv_setup_msi_irqs,
         .teardown_msi_irqs = pnv_teardown_msi_irqs,
diff --combined arch/powerpc/platforms/powernv/pci.c
index 8de0140332b2,b1ef84a6c9d1..73c8dc2a353f
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@@ -380,7 -380,10 +380,7 @@@ static void pnv_pci_config_check_eeh(st
  	 */
  	pe_no = pdn->pe_number;
  	if (pe_no == IODA_INVALID_PE) {
 -		if (phb->type == PNV_PHB_P5IOC2)
 -			pe_no = 0;
 -		else
 -			pe_no = phb->ioda.reserved_pe;
 +		pe_no = phb->ioda.reserved_pe;
  	}
  
  	/*
@@@ -596,6 -599,9 +596,9 @@@ int pnv_tce_build(struct iommu_table *t
  	u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
  	long i;
  
+ 	if (proto_tce & TCE_PCI_WRITE)
+ 		proto_tce |= TCE_PCI_READ;
+ 
  	for (i = 0; i < npages; i++) {
  		unsigned long newtce = proto_tce |
  			((rpn + i) << tbl->it_page_shift);
@@@ -617,6 -623,9 +620,9 @@@ int pnv_tce_xchg(struct iommu_table *tb
  
  	BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
  
+ 	if (newtce & TCE_PCI_WRITE)
+ 		newtce |= TCE_PCI_READ;
+ 
  	oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
  	*hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
  	*direction = iommu_tce_direction(oldtce);
@@@ -757,6 -766,26 +763,26 @@@ void pnv_pci_dma_dev_setup(struct pci_d
  		phb->dma_dev_setup(phb, pdev);
  }
  
+ void pnv_pci_dma_bus_setup(struct pci_bus *bus)
+ {
+ 	struct pci_controller *hose = bus->sysdata;
+ 	struct pnv_phb *phb = hose->private_data;
+ 	struct pnv_ioda_pe *pe;
+ 
+ 	list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ 		if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+ 			continue;
+ 
+ 		if (!pe->pbus)
+ 			continue;
+ 
+ 		if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+ 			pe->pbus = bus;
+ 			break;
+ 		}
+ 	}
+ }
+ 
  void pnv_pci_shutdown(void)
  {
  	struct pci_controller *hose;
@@@ -776,6 -805,7 +802,6 @@@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_I
  void __init pnv_pci_init(void)
  {
  	struct device_node *np;
 -	bool found_ioda = false;
  
  	pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
  
@@@ -783,11 -813,20 +809,11 @@@
  	if (!firmware_has_feature(FW_FEATURE_OPAL))
  		return;
  
 -	/* Look for IODA IO-Hubs. We don't support mixing IODA
 -	 * and p5ioc2 due to the need to change some global
 -	 * probing flags
 -	 */
 +	/* Look for IODA IO-Hubs. */
  	for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
  		pnv_pci_init_ioda_hub(np);
 -		found_ioda = true;
  	}
  
 -	/* Look for p5ioc2 IO-Hubs */
 -	if (!found_ioda)
 -		for_each_compatible_node(np, NULL, "ibm,p5ioc2")
 -			pnv_pci_init_p5ioc2_hub(np);
 -
  	/* Look for ioda2 built-in PHB3's */
  	for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
  		pnv_pci_init_ioda2_phb(np);
diff --combined arch/powerpc/platforms/powernv/pci.h
index 32cae3d8e011,00691a9b99af..3f814f382b2e
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@@ -4,14 -4,16 +4,14 @@@
  struct pci_dn;
  
  enum pnv_phb_type {
 -	PNV_PHB_P5IOC2	= 0,
 -	PNV_PHB_IODA1	= 1,
 -	PNV_PHB_IODA2	= 2,
 -	PNV_PHB_NPU	= 3,
 +	PNV_PHB_IODA1	= 0,
 +	PNV_PHB_IODA2	= 1,
 +	PNV_PHB_NPU	= 2,
  };
  
  /* Precise PHB model for error management */
  enum pnv_phb_model {
  	PNV_PHB_MODEL_UNKNOWN,
 -	PNV_PHB_MODEL_P5IOC2,
  	PNV_PHB_MODEL_P7IOC,
  	PNV_PHB_MODEL_PHB3,
  	PNV_PHB_MODEL_NPU,
@@@ -119,74 -121,81 +119,74 @@@ struct pnv_phb 
  	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
  	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
  
 -	union {
 -		struct {
 -			struct iommu_table iommu_table;
 -			struct iommu_table_group table_group;
 -		} p5ioc2;
 -
 -		struct {
 -			/* Global bridge info */
 -			unsigned int		total_pe;
 -			unsigned int		reserved_pe;
 -
 -			/* 32-bit MMIO window */
 -			unsigned int		m32_size;
 -			unsigned int		m32_segsize;
 -			unsigned int		m32_pci_base;
 -
 -			/* 64-bit MMIO window */
 -			unsigned int		m64_bar_idx;
 -			unsigned long		m64_size;
 -			unsigned long		m64_segsize;
 -			unsigned long		m64_base;
 -			unsigned long		m64_bar_alloc;
 -
 -			/* IO ports */
 -			unsigned int		io_size;
 -			unsigned int		io_segsize;
 -			unsigned int		io_pci_base;
 -
 -			/* PE allocation bitmap */
 -			unsigned long		*pe_alloc;
 -			/* PE allocation mutex */
 -			struct mutex		pe_alloc_mutex;
 -
 -			/* M32 & IO segment maps */
 -			unsigned int		*m32_segmap;
 -			unsigned int		*io_segmap;
 -			struct pnv_ioda_pe	*pe_array;
 -
 -			/* IRQ chip */
 -			int			irq_chip_init;
 -			struct irq_chip		irq_chip;
 -
 -			/* Sorted list of used PE's based
 -			 * on the sequence of creation
 -			 */
 -			struct list_head	pe_list;
 -			struct mutex            pe_list_mutex;
 -
 -			/* Reverse map of PEs, will have to extend if
 -			 * we are to support more than 256 PEs, indexed
 -			 * bus { bus, devfn }
 -			 */
 -			unsigned char		pe_rmap[0x10000];
 -
 -			/* 32-bit TCE tables allocation */
 -			unsigned long		tce32_count;
 -
 -			/* Total "weight" for the sake of DMA resources
 -			 * allocation
 -			 */
 -			unsigned int		dma_weight;
 -			unsigned int		dma_pe_count;
 -
 -			/* Sorted list of used PE's, sorted at
 -			 * boot for resource allocation purposes
 -			 */
 -			struct list_head	pe_dma_list;
 -
 -			/* TCE cache invalidate registers (physical and
 -			 * remapped)
 -			 */
 -			phys_addr_t		tce_inval_reg_phys;
 -			__be64 __iomem		*tce_inval_reg;
 -		} ioda;
 -	};
 +	struct {
 +		/* Global bridge info */
 +		unsigned int		total_pe;
 +		unsigned int		reserved_pe;
 +
 +		/* 32-bit MMIO window */
 +		unsigned int		m32_size;
 +		unsigned int		m32_segsize;
 +		unsigned int		m32_pci_base;
 +
 +		/* 64-bit MMIO window */
 +		unsigned int		m64_bar_idx;
 +		unsigned long		m64_size;
 +		unsigned long		m64_segsize;
 +		unsigned long		m64_base;
 +		unsigned long		m64_bar_alloc;
 +
 +		/* IO ports */
 +		unsigned int		io_size;
 +		unsigned int		io_segsize;
 +		unsigned int		io_pci_base;
 +
 +		/* PE allocation bitmap */
 +		unsigned long		*pe_alloc;
 +		/* PE allocation mutex */
 +		struct mutex		pe_alloc_mutex;
 +
 +		/* M32 & IO segment maps */
 +		unsigned int		*m32_segmap;
 +		unsigned int		*io_segmap;
 +		struct pnv_ioda_pe	*pe_array;
 +
 +		/* IRQ chip */
 +		int			irq_chip_init;
 +		struct irq_chip		irq_chip;
 +
 +		/* Sorted list of used PE's based
 +		 * on the sequence of creation
 +		 */
 +		struct list_head	pe_list;
 +		struct mutex            pe_list_mutex;
 +
 +		/* Reverse map of PEs, will have to extend if
 +		 * we are to support more than 256 PEs, indexed
 +		 * bus { bus, devfn }
 +		 */
 +		unsigned char		pe_rmap[0x10000];
 +
 +		/* 32-bit TCE tables allocation */
 +		unsigned long		tce32_count;
 +
 +		/* Total "weight" for the sake of DMA resources
 +		 * allocation
 +		 */
 +		unsigned int		dma_weight;
 +		unsigned int		dma_pe_count;
 +
 +		/* Sorted list of used PE's, sorted at
 +		 * boot for resource allocation purposes
 +		 */
 +		struct list_head	pe_dma_list;
 +
 +		/* TCE cache invalidate registers (physical and
 +		 * remapped)
 +		 */
 +		phys_addr_t		tce_inval_reg_phys;
 +		__be64 __iomem		*tce_inval_reg;
 +	} ioda;
  
  	/* PHB and hub status structure */
  	union {
@@@ -223,6 -232,7 +223,6 @@@ extern void pnv_pci_unlink_table_and_gr
  extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
  				      void *tce_mem, u64 tce_size,
  				      u64 dma_offset, unsigned page_shift);
 -extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
  extern void pnv_pci_init_ioda_hub(struct device_node *np);
  extern void pnv_pci_init_ioda2_phb(struct device_node *np);
  extern void pnv_pci_init_npu_phb(struct device_node *np);
@@@ -232,6 -242,7 +232,7 @@@ extern void pnv_pci_reset_secondary_bus
  extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
  
  extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
+ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
  extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
  
diff --combined mm/huge_memory.c
index 08fc0ba2207e,de3f43cde129..aea8f7a42df9
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@@ -138,6 -138,9 +138,6 @@@ static struct khugepaged_scan khugepage
  	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  };
  
 -static DEFINE_SPINLOCK(split_queue_lock);
 -static LIST_HEAD(split_queue);
 -static unsigned long split_queue_len;
  static struct shrinker deferred_split_shrinker;
  
  static void set_recommended_min_free_kbytes(void)
@@@ -858,8 -861,7 +858,8 @@@ static bool set_huge_zero_page(pgtable_
  		return false;
  	entry = mk_pmd(zero_page, vma->vm_page_prot);
  	entry = pmd_mkhuge(entry);
 -	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 +	if (pgtable)
 +		pgtable_trans_huge_deposit(mm, pmd, pgtable);
  	set_pmd_at(mm, haddr, pmd, entry);
  	atomic_long_inc(&mm->nr_ptes);
  	return true;
@@@ -1037,15 -1039,13 +1037,15 @@@ int copy_huge_pmd(struct mm_struct *dst
  	spinlock_t *dst_ptl, *src_ptl;
  	struct page *src_page;
  	pmd_t pmd;
 -	pgtable_t pgtable;
 +	pgtable_t pgtable = NULL;
  	int ret;
  
 -	ret = -ENOMEM;
 -	pgtable = pte_alloc_one(dst_mm, addr);
 -	if (unlikely(!pgtable))
 -		goto out;
 +	if (!vma_is_dax(vma)) {
 +		ret = -ENOMEM;
 +		pgtable = pte_alloc_one(dst_mm, addr);
 +		if (unlikely(!pgtable))
 +			goto out;
 +	}
  
  	dst_ptl = pmd_lock(dst_mm, dst_pmd);
  	src_ptl = pmd_lockptr(src_mm, src_pmd);
@@@ -1076,7 -1076,7 +1076,7 @@@
  		goto out_unlock;
  	}
  
 -	if (pmd_trans_huge(pmd)) {
 +	if (!vma_is_dax(vma)) {
  		/* thp accounting separate from pmd_devmap accounting */
  		src_page = pmd_page(pmd);
  		VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@@ -1560,8 -1560,7 +1560,8 @@@ int madvise_free_huge_pmd(struct mmu_ga
  	struct mm_struct *mm = tlb->mm;
  	int ret = 0;
  
 -	if (!pmd_trans_huge_lock(pmd, vma, &ptl))
 +	ptl = pmd_trans_huge_lock(pmd, vma);
 +	if (!ptl)
  		goto out_unlocked;
  
  	orig_pmd = *pmd;
@@@ -1628,8 -1627,7 +1628,8 @@@ int zap_huge_pmd(struct mmu_gather *tlb
  	pmd_t orig_pmd;
  	spinlock_t *ptl;
  
 -	if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
 +	ptl = __pmd_trans_huge_lock(pmd, vma);
 +	if (!ptl)
  		return 0;
  	/*
  	 * For architectures like ppc64 we look at deposited pgtable
@@@ -1692,8 -1690,7 +1692,8 @@@ bool move_huge_pmd(struct vm_area_struc
  	 * We don't have to worry about the ordering of src and dst
  	 * ptlocks because exclusive mmap_sem prevents deadlock.
  	 */
 -	if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
 +	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
 +	if (old_ptl) {
  		new_ptl = pmd_lockptr(mm, new_pmd);
  		if (new_ptl != old_ptl)
  			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@@ -1727,8 -1724,7 +1727,8 @@@ int change_huge_pmd(struct vm_area_stru
  	spinlock_t *ptl;
  	int ret = 0;
  
 -	if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
 +	ptl = __pmd_trans_huge_lock(pmd, vma);
 +	if (ptl) {
  		pmd_t entry;
  		bool preserve_write = prot_numa && pmd_write(*pmd);
  		ret = 1;
@@@ -1764,14 -1760,14 +1764,14 @@@
   * Note that if it returns a lock pointer, this routine returns without
   * unlocking the page table lock. So callers must unlock it.
   */
 -bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
 -		spinlock_t **ptl)
 +spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
  {
 -	*ptl = pmd_lock(vma->vm_mm, pmd);
 +	spinlock_t *ptl;
 +	ptl = pmd_lock(vma->vm_mm, pmd);
  	if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
 -		return true;
 -	spin_unlock(*ptl);
 -	return false;
 +		return ptl;
 +	spin_unlock(ptl);
 +	return NULL;
  }
  
  #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@@ -2072,7 -2068,7 +2072,7 @@@ static int __collapse_huge_page_isolate
  	if (likely(writable)) {
  		if (likely(referenced)) {
  			result = SCAN_SUCCEED;
 -			trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
 +			trace_mm_collapse_huge_page_isolate(page, none_or_zero,
  							    referenced, writable, result);
  			return 1;
  		}
@@@ -2082,7 -2078,7 +2082,7 @@@
  
  out:
  	release_pte_pages(pte, _pte);
 -	trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
 +	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
  					    referenced, writable, result);
  	return 0;
  }
@@@ -2580,7 -2576,7 +2580,7 @@@ out_unmap
  		collapse_huge_page(mm, address, hpage, vma, node);
  	}
  out:
 -	trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
 +	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
  				     none_or_zero, result);
  	return ret;
  }
@@@ -2860,6 -2856,7 +2860,7 @@@ static void __split_huge_pmd_locked(str
  	young = pmd_young(*pmd);
  	dirty = pmd_dirty(*pmd);
  
+ 	pmdp_huge_split_prepare(vma, haddr, pmd);
  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  	pmd_populate(mm, &_pmd, pgtable);
  
@@@ -3358,11 -3355,9 +3359,11 @@@ int total_mapcount(struct page *page
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
  	struct page *head = compound_head(page);
 +	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
  	struct anon_vma *anon_vma;
  	int count, mapcount, ret;
  	bool mlocked;
 +	unsigned long flags;
  
  	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
  	VM_BUG_ON_PAGE(!PageAnon(page), page);
@@@ -3402,19 -3397,19 +3403,19 @@@
  		lru_add_drain();
  
  	/* Prevent deferred_split_scan() touching ->_count */
 -	spin_lock(&split_queue_lock);
 +	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	count = page_count(head);
  	mapcount = total_mapcount(head);
  	if (!mapcount && count == 1) {
  		if (!list_empty(page_deferred_list(head))) {
 -			split_queue_len--;
 +			pgdata->split_queue_len--;
  			list_del(page_deferred_list(head));
  		}
 -		spin_unlock(&split_queue_lock);
 +		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  		__split_huge_page(page, list);
  		ret = 0;
  	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
 -		spin_unlock(&split_queue_lock);
 +		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  		pr_alert("total_mapcount: %u, page_count(): %u\n",
  				mapcount, count);
  		if (PageTail(page))
@@@ -3422,7 -3417,7 +3423,7 @@@
  		dump_page(page, "total_mapcount(head) > 0");
  		BUG();
  	} else {
 -		spin_unlock(&split_queue_lock);
 +		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  		unfreeze_page(anon_vma, head);
  		ret = -EBUSY;
  	}
@@@ -3437,65 -3432,64 +3438,65 @@@ out
  
  void free_transhuge_page(struct page *page)
  {
 +	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
  	unsigned long flags;
  
 -	spin_lock_irqsave(&split_queue_lock, flags);
 +	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	if (!list_empty(page_deferred_list(page))) {
 -		split_queue_len--;
 +		pgdata->split_queue_len--;
  		list_del(page_deferred_list(page));
  	}
 -	spin_unlock_irqrestore(&split_queue_lock, flags);
 +	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  	free_compound_page(page);
  }
  
  void deferred_split_huge_page(struct page *page)
  {
 +	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
  	unsigned long flags;
  
  	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  
 -	spin_lock_irqsave(&split_queue_lock, flags);
 +	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	if (list_empty(page_deferred_list(page))) {
 -		list_add_tail(page_deferred_list(page), &split_queue);
 -		split_queue_len++;
 +		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
 +		pgdata->split_queue_len++;
  	}
 -	spin_unlock_irqrestore(&split_queue_lock, flags);
 +	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  }
  
  static unsigned long deferred_split_count(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
 -	/*
 -	 * Split a page from split_queue will free up at least one page,
 -	 * at most HPAGE_PMD_NR - 1. We don't track exact number.
 -	 * Let's use HPAGE_PMD_NR / 2 as ballpark.
 -	 */
 -	return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
 +	struct pglist_data *pgdata = NODE_DATA(sc->nid);
 +	return ACCESS_ONCE(pgdata->split_queue_len);
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,
  		struct shrink_control *sc)
  {
 +	struct pglist_data *pgdata = NODE_DATA(sc->nid);
  	unsigned long flags;
  	LIST_HEAD(list), *pos, *next;
  	struct page *page;
  	int split = 0;
  
 -	spin_lock_irqsave(&split_queue_lock, flags);
 -	list_splice_init(&split_queue, &list);
 -
 +	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
  	/* Take pin on all head pages to avoid freeing them under us */
 -	list_for_each_safe(pos, next, &list) {
 +	list_for_each_safe(pos, next, &pgdata->split_queue) {
  		page = list_entry((void *)pos, struct page, mapping);
  		page = compound_head(page);
 -		/* race with put_compound_page() */
 -		if (!get_page_unless_zero(page)) {
 +		if (get_page_unless_zero(page)) {
 +			list_move(page_deferred_list(page), &list);
 +		} else {
 +			/* We lost race with put_compound_page() */
  			list_del_init(page_deferred_list(page));
 -			split_queue_len--;
 +			pgdata->split_queue_len--;
  		}
 +		if (!--sc->nr_to_scan)
 +			break;
  	}
 -	spin_unlock_irqrestore(&split_queue_lock, flags);
 +	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  
  	list_for_each_safe(pos, next, &list) {
  		page = list_entry((void *)pos, struct page, mapping);
@@@ -3507,24 -3501,17 +3508,24 @@@
  		put_page(page);
  	}
  
 -	spin_lock_irqsave(&split_queue_lock, flags);
 -	list_splice_tail(&list, &split_queue);
 -	spin_unlock_irqrestore(&split_queue_lock, flags);
 +	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 +	list_splice_tail(&list, &pgdata->split_queue);
 +	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
  
 -	return split * HPAGE_PMD_NR / 2;
 +	/*
 +	 * Stop shrinker if we didn't split any page, but the queue is empty.
 +	 * This can happen if pages were freed under us.
 +	 */
 +	if (!split && list_empty(&pgdata->split_queue))
 +		return SHRINK_STOP;
 +	return split;
  }
  
  static struct shrinker deferred_split_shrinker = {
  	.count_objects = deferred_split_count,
  	.scan_objects = deferred_split_scan,
  	.seeks = DEFAULT_SEEKS,
 +	.flags = SHRINKER_NUMA_AWARE,
  };
  
  #ifdef CONFIG_DEBUG_FS