unlock_page(page);
}
+/* Request for sync pageout. */
+enum pageout_io {
+ PAGEOUT_IO_ASYNC,
+ PAGEOUT_IO_SYNC,
+};
+
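For reference, the new sync_writeback argument is threaded through from shrink_inactive_list() later in this patch; a minimal sketch of the intended call pattern, mirroring the hunks below rather than adding new code:

	/* First pass: kick off writeback for the whole list without blocking. */
	nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);

	/* Second pass (lumpy direct reclaim only): wait for that IO to finish. */
	nr_freed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);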
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+ enum pageout_io sync_writeback)
{
/*
* If the page is dirty, only perform writeback if that write
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
+
+ /*
+ * Wait on writeback if requested to. This happens when
+ * we are direct reclaiming a large contiguous area and the
+ * first attempt to free a range of pages fails.
+ */
+ if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+ wait_on_page_writeback(page);
+
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct scan_control *sc)
+ struct scan_control *sc,
+ enum pageout_io sync_writeback)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
- if (PageWriteback(page))
- goto keep_locked;
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+ if (PageWriteback(page)) {
+ /*
+ * Synchronous reclaim is performed in two passes:
+ * first an asynchronous pass over the list to
+ * start parallel writeback, then a synchronous
+ * pass to wait for the IO to complete. Wait here
+ * for any page for which writeback has already
+ * started.
+ */
+ if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+ wait_on_page_writeback(page);
+ else
+ goto keep_locked;
+ }
referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
/*
* The page is mapped into the page tables of one or more
goto keep_locked;
/* Page is dirty, try to write it out here */
- switch(pageout(page, mapping)) {
+ switch (pageout(page, mapping, sync_writeback)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
(sc->order > PAGE_ALLOC_COSTLY_ORDER)?
ISOLATE_BOTH : ISOLATE_INACTIVE);
nr_active = clear_active_flags(&page_list);
+ __count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
__mod_zone_page_state(zone, NR_INACTIVE,
spin_unlock_irq(&zone->lru_lock);
nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc);
+ nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+ /*
+ * If we are direct reclaiming for contiguous pages and we do
+ * not reclaim everything in the list, try again and wait
+ * for IO to complete. This will stall high-order allocations
+ * but that should be acceptable to the caller.
+ */
+ if (nr_freed < nr_taken && !current_is_kswapd() &&
+ sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+ congestion_wait(WRITE, HZ/10);
+
+ /*
+ * The attempt at page out may have made some
+ * of the pages active, mark them inactive again.
+ */
+ nr_active = clear_active_flags(&page_list);
+ count_vm_events(PGDEACTIVATE, nr_active);
+
+ nr_freed += shrink_page_list(&page_list, sc,
+ PAGEOUT_IO_SYNC);
+ }
+
nr_reclaimed += nr_freed;
local_irq_disable();
if (current_is_kswapd()) {
long mapped_ratio;
long distress;
long swap_tendency;
+ long imbalance;
if (zone_is_near_oom(zone))
goto force_reclaim_mapped;
*/
swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+ /*
+ * If there's a huge imbalance between active and inactive
+ * (think active 100 times larger than inactive) we should
+ * become more permissive, or the system will take too much
+ * CPU before it starts swapping during memory pressure.
+ * Distress is about avoiding an early OOM; this is about
+ * keeping swappiness graceful even when it is set to low
+ * values.
+ *
+ * Avoid division by zero with nr_inactive+1; the maximum
+ * resulting value is vm_total_pages.
+ */
+ imbalance = zone_page_state(zone, NR_ACTIVE);
+ imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
+
+ /*
+ * Reduce the effect of imbalance if swappiness is low;
+ * this means that with a very low swappiness the imbalance
+ * must be much higher than 100 for this logic to make
+ * a difference.
+ *
+ * Max temporary value is vm_total_pages*100.
+ */
+ imbalance *= (vm_swappiness + 1);
+ imbalance /= 100;
+
+ /*
+ * If not much of the RAM is mapped, make the imbalance
+ * less relevant: refilling the inactive list with mapped
+ * pages is only a high priority when a high ratio of the
+ * pages is mapped.
+ *
+ * Max temporary value is vm_total_pages*100.
+ */
+ imbalance *= mapped_ratio;
+ imbalance /= 100;
+
+ /* apply imbalance feedback to swap_tendency */
+ swap_tendency += imbalance;
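To make the scaling concrete, here is a worked example with illustrative numbers (not taken from the patch):

	/*
	 * Illustrative numbers only: active = 200000, inactive = 1999,
	 * mapped_ratio = 50.
	 *
	 *   imbalance  = 200000 / (1999 + 1)           = 100
	 *   vm_swappiness = 60:  100 * 61 / 100 = 61;  61 * 50 / 100 = 30
	 *   vm_swappiness = 10:  100 * 11 / 100 = 11;  11 * 50 / 100 =  5
	 *
	 * A 100:1 active/inactive imbalance therefore adds 30 to
	 * swap_tendency at the default swappiness, but only 5 at
	 * swappiness 10.
	 */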
+
/*
* Now use this metric to decide whether to start moving mapped
* memory onto the inactive list.
unsigned long nr_to_scan;
unsigned long nr_reclaimed = 0;
- atomic_inc(&zone->reclaim_in_progress);
+ zone_set_flag(zone, ZONE_RECLAIM_LOCKED);
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
throttle_vm_writeout(sc->gfp_mask);
- atomic_dec(&zone->reclaim_in_progress);
+ zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
return nr_reclaimed;
}
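The zone flag helpers used above are introduced elsewhere in this series; a minimal sketch of what they are assumed to look like, i.e. a flags word in struct zone driven by the standard bitops (the helper names match the call sites, the flags field itself is an assumption here):

typedef enum {
	ZONE_ALL_UNRECLAIMABLE,		/* all pages pinned */
	ZONE_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
} zone_flags_t;

static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
{
	set_bit(flag, &zone->flags);	/* assumes an unsigned long flags member */
}

static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
{
	clear_bit(flag, &zone->flags);
}

static inline int zone_is_all_unreclaimable(const struct zone *zone)
{
	return test_bit(ZONE_ALL_UNRECLAIMABLE, &zone->flags);
}

static inline int zone_is_reclaim_locked(const struct zone *zone)
{
	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
}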
note_zone_scanning_priority(zone, priority);
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
sc->all_unreclaimable = 0;
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone_is_all_unreclaimable(zone) &&
+ priority != DEF_PRIORITY)
continue;
if (!zone_watermark_ok(zone, order, zone->pages_high,
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone_is_all_unreclaimable(zone) &&
+ priority != DEF_PRIORITY)
continue;
if (!zone_watermark_ok(zone, order, zone->pages_high,
temp_priority[i] = priority;
sc.nr_scanned = 0;
note_zone_scanning_priority(zone, priority);
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ /*
+ * We put equal pressure on every zone, unless one
+ * zone has way too many pages free already.
+ */
+ if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
+ end_zone, 0))
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
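For a sense of scale (illustrative numbers, and assuming the usual zone_watermark_ok() behaviour of comparing free pages against the mark plus the lowmem reserve):

	/*
	 * Example: a zone with pages_high = 96 is only exempted from
	 * shrink_zone() here once it has more than roughly
	 * 8 * 96 = 768 free pages above end_zone's lowmem reserve;
	 * every other zone keeps receiving equal reclaim pressure.
	 */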
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
+ if (zone_is_all_unreclaimable(zone))
continue;
if (nr_slab == 0 && zone->pages_scanned >=
(zone_page_state(zone, NR_ACTIVE)
+ zone_page_state(zone, NR_INACTIVE)) * 6)
- zone->all_unreclaimable = 1;
+ zone_set_flag(zone,
+ ZONE_ALL_UNRECLAIMABLE);
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+ if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
continue;
/* For pass = 0 we don't shrink the active list */
{
pg_data_t *pgdat;
cpumask_t mask;
+ int nid;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
- for_each_online_pgdat(pgdat) {
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ pgdat = NODE_DATA(nid);
mask = node_to_cpumask(pgdat->node_id);
if (any_online_cpu(mask) != NR_CPUS)
/* One of our CPUs online: restore mask */
int nid;
swap_setup();
- for_each_online_node(nid)
+ for_each_node_state(nid, N_HIGH_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
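The switch to for_each_node_state(nid, N_HIGH_MEMORY) means memoryless nodes no longer get a kswapd. Assuming the usual node_states[] infrastructure, the loop is equivalent to the following open-coded check:

	for_each_online_node(nid)
		if (node_state(nid, N_HIGH_MEMORY))	/* node actually has memory */
			kswapd_run(nid);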
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
- cpumask_t mask;
int node_id;
/*
* not have reclaimable pages and if we should not delay the allocation
* then do not scan.
*/
- if (!(gfp_mask & __GFP_WAIT) ||
- zone->all_unreclaimable ||
- atomic_read(&zone->reclaim_in_progress) > 0 ||
- (current->flags & PF_MEMALLOC))
+ if (!(gfp_mask & __GFP_WAIT) || zone_is_all_unreclaimable(zone) ||
+ zone_is_reclaim_locked(zone) || (current->flags & PF_MEMALLOC))
return 0;
/*
* as wide as possible.
*/
node_id = zone_to_nid(zone);
- mask = node_to_cpumask(node_id);
- if (!cpus_empty(mask) && node_id != numa_node_id())
+ if (node_state(node_id, N_CPU) && node_id != numa_node_id())
return 0;
return __zone_reclaim(zone, gfp_mask, order);
}