#include <asm/div64.h>
#include "internal.h"
+/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
+static DEFINE_MUTEX(pcp_batch_high_lock);
+
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain;
+ unsigned long batch;
local_irq_save(flags);
- if (pcp->count >= pcp->batch)
- to_drain = pcp->batch;
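+ /* snapshot ->batch once; an updater may change it concurrently */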
+ batch = ACCESS_ONCE(pcp->batch);
+ if (pcp->count >= batch)
+ to_drain = batch;
else
to_drain = pcp->count;
if (to_drain > 0) {
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- free_pcppages_bulk(zone, pcp->batch, pcp);
- pcp->count -= pcp->batch;
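+ /* read ->batch once so the free and the counter update use one value */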
+ unsigned long batch = ACCESS_ONCE(pcp->batch);
+ free_pcppages_bulk(zone, batch, pcp);
+ pcp->count -= batch;
}
out:
#endif
}
+/*
+ * pcp->high and pcp->batch values are related and dependent on one another:
+ * ->batch must never be higher than ->high.
+ * The following function updates them in a safe manner without read side
+ * locking.
+ *
+ * Any new users of pcp->batch and pcp->high should ensure they can cope with
+ * those fields changing asynchronously (according to the above rule).
+ *
+ * pcp_batch_high_lock must be held when calling this function outside of
+ * boot time (or some other assurance that no concurrent updaters exist
+ * must be given).
+ */
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
+ unsigned long batch)
+{
+ /* start with a fail-safe value for batch */
+ pcp->batch = 1;
+ smp_wmb();
+
+ /* Update high, then batch, in order */
+ pcp->high = high;
+ smp_wmb();
+
+ pcp->batch = batch;
+}
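+
+/*
+ * Illustrative read side (a sketch, not part of this patch), mirroring
+ * free_hot_cold_page() above: lockless readers snapshot ->batch exactly
+ * once, so the transient value of 1 published by pageset_update() is
+ * merely conservative, never harmful:
+ *
+ *	if (pcp->count >= pcp->high) {
+ *		unsigned long batch = ACCESS_ONCE(pcp->batch);
+ *		free_pcppages_bulk(zone, batch, pcp);
+ *		pcp->count -= batch;
+ *	}
+ */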
+
/* a companion to setup_pagelist_highmark() */
static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
{
- struct per_cpu_pages *pcp = &p->pcp;
- pcp->high = 6 * batch;
- pcp->batch = max(1UL, 1 * batch);
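+ /* e.g. a zone_batchsize() of 31 yields high == 186, batch == 31 */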
+ pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
}
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
static void setup_pagelist_highmark(struct per_cpu_pageset *p,
unsigned long high)
{
- struct per_cpu_pages *pcp;
+ unsigned long batch = max(1UL, high / 4);
+ if ((high / 4) > (PAGE_SHIFT * 8))
+ batch = PAGE_SHIFT * 8;
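+ /* e.g. with 4K pages (PAGE_SHIFT == 12), batch is capped at 96 */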
- pcp = &p->pcp;
- pcp->high = high;
- pcp->batch = max(1UL, high/4);
- if ((high/4) > (PAGE_SHIFT * 8))
- pcp->batch = PAGE_SHIFT * 8;
+ pageset_update(&p->pcp, high, batch);
}
static void __meminit setup_zone_pageset(struct zone *zone)
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || (ret < 0))
return ret;
+
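+ /* serialize with other pcp updaters, e.g. zone_pcp_update() */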
+ mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) {
for_each_possible_cpu(cpu) {
unsigned long high;
high = zone->managed_pages / percpu_pagelist_fraction;
setup_pagelist_highmark(
per_cpu_ptr(zone->pageset, cpu), high);
}
}
+ mutex_unlock(&pcp_batch_high_lock);
return 0;
}
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
-static int __meminit __zone_pcp_update(void *data)
-{
- struct zone *zone = data;
- int cpu;
- unsigned long batch = zone_batchsize(zone), flags;
-
- for_each_possible_cpu(cpu) {
- struct per_cpu_pageset *pset;
- struct per_cpu_pages *pcp;
-
- pset = per_cpu_ptr(zone->pageset, cpu);
- pcp = &pset->pcp;
-
- local_irq_save(flags);
- if (pcp->count > 0)
- free_pcppages_bulk(zone, pcp->count, pcp);
- drain_zonestat(zone, pset);
- setup_pageset(pset, batch);
- local_irq_restore(flags);
- }
- return 0;
-}
-
+/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
+ */
void __meminit zone_pcp_update(struct zone *zone)
{
- stop_machine(__zone_pcp_update, zone, NULL);
+ unsigned cpu;
+ unsigned long batch;
+ mutex_lock(&pcp_batch_high_lock);
+ batch = zone_batchsize(zone);
+ for_each_possible_cpu(cpu)
+ pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch);
+ mutex_unlock(&pcp_batch_high_lock);
}
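
/*
 * Sketch of intended use (illustrative, not part of this patch): the
 * memory hotplug path calls this after changing a zone's managed page
 * count, e.g.
 *
 *	zone->managed_pages += onlined_pages;
 *	zone_pcp_update(zone);
 */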
#endif