/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
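/*
 * For illustration: with 4KB pages PAGE_CACHE_SHIFT is 12, so
 * MIN_WRITEBACK_PAGES = 4096UL >> (12 - 10) = 1024 pages = 4MB. Larger page
 * sizes yield proportionally fewer pages per chunk, keeping the chunk at 4MB.
 */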

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct completion *done;	/* set if the caller waits */
};

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return test_bit(BDI_writeback_running, &bdi->state);
}
EXPORT_SYMBOL(writeback_in_progress);

static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (strcmp(sb->s_type->name, "bdev") == 0)
		return inode->i_mapping->backing_dev_info;

	return sb->s_bdi;
}

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_wb_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
	if (bdi->wb.task) {
		wake_up_process(bdi->wb.task);
	} else {
		/*
		 * The bdi thread isn't there, wake up the forker thread which
		 * will create and run it.
		 */
		wake_up_process(default_backing_dev_info.wb.task);
	}
}

static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	trace_writeback_queue(bdi, work);

	spin_lock_bh(&bdi->wb_lock);
	list_add_tail(&work->list, &bdi->work_list);
	if (!bdi->wb.task)
		trace_writeback_nothread(bdi, work);
	bdi_wakeup_flusher(bdi);
	spin_unlock_bh(&bdi->wb_lock);
}

static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
		      bool range_cyclic, enum wb_reason reason)
{
	struct wb_writeback_work *work;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kzalloc(sizeof(*work), GFP_ATOMIC);
	if (!work) {
		if (bdi->wb.task) {
			trace_writeback_nowork(bdi);
			wake_up_process(bdi->wb.task);
		}
		return;
	}

	work->sync_mode = WB_SYNC_NONE;
	work->nr_pages = nr_pages;
	work->range_cyclic = range_cyclic;
	work->reason = reason;

	bdi_queue_work(bdi, work);
}

/**
 * bdi_start_writeback - start writeback
 * @bdi: the backing device to write from
 * @nr_pages: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Description:
 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 * started by the time this function returns; we make no guarantees on
 * completion. The caller need not hold the sb s_umount semaphore.
 *
 */
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
			enum wb_reason reason)
{
	__bdi_start_writeback(bdi, nr_pages, true, reason);
}

/**
 * bdi_start_background_writeback - start background writeback
 * @bdi: the backing device to write from
 *
 * Description:
 * This makes sure WB_SYNC_NONE background writeback happens. When
 * this function returns, it is only guaranteed that for given BDI
 * some IO is happening if we are over background dirty threshold.
 * Caller need not hold sb s_umount semaphore.
 */
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
	/*
	 * We just wake up the flusher thread. It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	trace_writeback_wake_background(bdi);
	spin_lock_bh(&bdi->wb_lock);
	bdi_wakeup_flusher(bdi);
	spin_unlock_bh(&bdi->wb_lock);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_wb_list_del(struct inode *inode)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);

	spin_lock(&bdi->wb.list_lock);
	list_del_init(&inode->i_wb_list);
	spin_unlock(&bdi->wb.list_lock);
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = wb_inode(wb->b_dirty.next);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_wb_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	list_move(&inode->i_wb_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	inode->i_state &= ~I_SYNC;
	/* If the inode is clean and unused, put it into LRU now... */
	inode_add_lru(inode);
	/* Waiters must see I_SYNC cleared before being woken up */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in the distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole bdi writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}
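/*
 * For illustration: on 32-bit systems a jiffies comparison such as
 * time_after() can wrap once an inode has been sitting dirty for long enough
 * (on the order of weeks at HZ=1000), which is when the extra
 * time_before_eq() check above kicks in and filters out the bogus "future"
 * timestamps.
 */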

/*
 * Move expired (dirtied before work->older_than_this) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
			       struct list_head *dispatch_queue,
			       struct wb_writeback_work *work)
{
	LIST_HEAD(tmp);
	struct list_head *pos, *node;
	struct super_block *sb = NULL;
	struct inode *inode;
	int do_sb_sort = 0;
	int moved = 0;

	while (!list_empty(delaying_queue)) {
		inode = wb_inode(delaying_queue->prev);
		if (work->older_than_this &&
		    inode_dirtied_after(inode, *work->older_than_this))
			break;
		if (sb && sb != inode->i_sb)
			do_sb_sort = 1;
		sb = inode->i_sb;
		list_move(&inode->i_wb_list, &tmp);
		moved++;
	}

	/* just one sb in list, splice to dispatch_queue and we're done */
	if (!do_sb_sort) {
		list_splice(&tmp, dispatch_queue);
		goto out;
	}

	/* Move inodes from one superblock together */
	while (!list_empty(&tmp)) {
		sb = wb_inode(tmp.prev)->i_sb;
		list_for_each_prev_safe(pos, node, &tmp) {
			inode = wb_inode(pos);
			if (inode->i_sb == sb)
				list_move(&inode->i_wb_list, dispatch_queue);
		}
	}
out:
	return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
	int moved;
	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
	trace_writeback_queue_io(wb, work, moved);
}

static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
		trace_writeback_write_inode_start(inode, wbc);
		ret = inode->i_sb->s_op->write_inode(inode, wbc);
		trace_writeback_write_inode(inode, wbc);
		return ret;
	}
	return 0;
}

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */
static void __inode_wait_for_writeback(struct inode *inode)
	__releases(inode->i_lock)
	__acquires(inode->i_lock)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&inode->i_lock);
	}
}

/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 */
void inode_wait_for_writeback(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	__inode_wait_for_writeback(inode);
	spin_unlock(&inode->i_lock);
}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 */
static void inode_sleep_on_writeback(struct inode *inode)
	__releases(inode->i_lock)
{
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	int sleep;

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	sleep = inode->i_state & I_SYNC;
	spin_unlock(&inode->i_lock);
	if (sleep)
		schedule();
	finish_wait(wqh, &wait);
}

/*
 * Find the proper writeback list for the inode, depending on its current state
 * and a possible change of its state while we were doing writeback. Here we
 * handle things such as livelock prevention or fairness of writeback among
 * inodes. This function can be called only by the flusher thread - no one else
 * processes all inodes in the writeback lists, and requeueing inodes behind the
 * flusher thread's back can have unexpected consequences.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
			  struct writeback_control *wbc)
{
	if (inode->i_state & I_FREEING)
		return;

	/*
	 * Sync livelock prevention. Each inode is tagged and synced in one
	 * shot. If still dirty, it will be redirty_tail()'ed below.  Update
	 * the dirty time to prevent enqueue and sync it again.
	 */
	if ((inode->i_state & I_DIRTY) &&
	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
		inode->dirtied_when = jiffies;

	if (wbc->pages_skipped) {
		/*
		 * writeback is not making progress due to locked
		 * buffers. Skip this inode for now.
		 */
		redirty_tail(inode, wb);
		return;
	}

	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		/*
		 * We didn't write back all the pages.  nfs_writepages()
		 * sometimes bails out without doing anything.
		 */
		if (wbc->nr_to_write <= 0) {
			/* Slice used up. Queue for next turn. */
			requeue_io(inode, wb);
		} else {
			/*
			 * Writeback blocked by something other than
			 * congestion. Delay the inode for some time to
			 * avoid spinning on the CPU (100% iowait)
			 * retrying writeback of the dirty page/inode
			 * that cannot be performed immediately.
			 */
			redirty_tail(inode, wb);
		}
	} else if (inode->i_state & I_DIRTY) {
		/*
		 * Filesystems can dirty the inode during writeback operations,
		 * such as delayed allocation during submission or metadata
		 * updates after data IO completion.
		 */
		redirty_tail(inode, wb);
	} else {
		/* The inode is clean. Remove from writeback lists. */
		list_del_init(&inode->i_wb_list);
	}
}

/*
 * Write out an inode and its dirty pages. Do not update the writeback list
 * linkage. That is left to the caller. The caller is also responsible for
 * setting I_SYNC flag and calling inode_sync_complete() to clear it.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	long nr_to_write = wbc->nr_to_write;
	unsigned dirty;
	int ret;

	WARN_ON(!(inode->i_state & I_SYNC));

	trace_writeback_single_inode_start(inode, wbc, nr_to_write);

	ret = do_writepages(mapping, wbc);

	/*
	 * Make sure to wait on the data before writing out the metadata.
	 * This is important for filesystems that modify metadata on data
	 * I/O completion.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	/*
	 * Some filesystems may redirty the inode during the writeback
	 * due to delalloc, clear dirty metadata flags right before
	 * write_inode()
	 */
	spin_lock(&inode->i_lock);
	/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		inode->i_state &= ~I_DIRTY_PAGES;
	dirty = inode->i_state & I_DIRTY;
	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
	spin_unlock(&inode->i_lock);
	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wbc);
		if (ret == 0)
			ret = err;
	}
	trace_writeback_single_inode(inode, wbc, nr_to_write);
	return ret;
}

/*
 * Write out an inode's dirty pages. Either the caller has an active reference
 * on the inode or the inode has I_WILL_FREE set.
 *
 * This function is meant for writing back a single inode, e.g. on request
 * from the filesystem. The flusher thread instead uses __writeback_single_inode()
 * and does more thorough writeback list handling in writeback_sb_inodes().
 */
static int
writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
		       struct writeback_control *wbc)
{
	int ret = 0;

	spin_lock(&inode->i_lock);
	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		if (wbc->sync_mode != WB_SYNC_ALL)
			goto out;
		/*
		 * It's a data-integrity sync. We must wait. Since callers hold
		 * inode reference or inode has I_WILL_FREE set, it cannot go
		 * away under us.
		 */
		__inode_wait_for_writeback(inode);
	}
	WARN_ON(inode->i_state & I_SYNC);
	/*
	 * Skip inode if it is clean. We don't want to mess with writeback
	 * lists in this function since flusher thread may be doing for example
	 * sync in parallel and if we move the inode, it could get skipped. So
	 * here we make sure inode is on some writeback list and leave it there
	 * unless we have completely cleaned the inode.
	 */
	if (!(inode->i_state & I_DIRTY))
		goto out;
	inode->i_state |= I_SYNC;
	spin_unlock(&inode->i_lock);

	ret = __writeback_single_inode(inode, wbc);

	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	/*
	 * If inode is clean, remove it from writeback lists. Otherwise don't
	 * touch it. See comment above for explanation.
	 */
	if (!(inode->i_state & I_DIRTY))
		list_del_init(&inode->i_wb_list);
	spin_unlock(&wb->list_lock);
	inode_sync_complete(inode);
out:
	spin_unlock(&inode->i_lock);
	return ret;
}

static long writeback_chunk_size(struct backing_dev_info *bdi,
				 struct wb_writeback_work *work)
{
	long pages;

	/*
	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
	 * here avoids calling into writeback_inodes_wb() more than once.
	 *
	 * The intended call sequence for WB_SYNC_ALL writeback is:
	 *
	 *         wb_writeback()
	 *             writeback_sb_inodes()       <== called only once
	 *                 write_cache_pages()     <== called once for each inode
	 *                     (quickly) tag currently dirty pages
	 *                     (maybe slowly) sync all tagged pages
	 */
	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
		pages = LONG_MAX;
	else {
		pages = min(bdi->avg_write_bandwidth / 2,
			    global_dirty_limit / DIRTY_SCOPE);
		pages = min(pages, work->nr_pages);
		pages = round_down(pages + MIN_WRITEBACK_PAGES,
				   MIN_WRITEBACK_PAGES);
	}

	return pages;
}
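/*
 * For illustration: in the non-integrity branch above the chunk is capped at
 * half of the bdi's estimated write bandwidth (roughly half a second's worth
 * of IO) and at a fraction of the global dirty limit, and then
 * round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES) bumps the
 * result up to the next multiple of the 4MB minimum, so the caller always
 * gets at least one full chunk.
 */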

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 */
static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	struct writeback_control wbc = {
		.sync_mode = work->sync_mode,
		.tagged_writepages = work->tagged_writepages,
		.for_kupdate = work->for_kupdate,
		.for_background = work->for_background,
		.range_cyclic = work->range_cyclic,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};
	unsigned long start_time = jiffies;
	long write_chunk;
	long wrote = 0;		/* count both pages and inodes */

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);

		if (inode->i_sb != sb) {
			if (work->sb) {
				/*
				 * We only want to write back data for this
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode, wb);
				continue;
			}

			/*
			 * The inode belongs to a different superblock.
			 * Bounce back to the caller to unpin this and
			 * pin the next superblock.
			 */
			break;
		}

		/*
		 * Don't bother with new inodes or inodes being freed; the
		 * first kind does not need periodic writeout yet, and for the
		 * latter kind writeout is handled by the freer.
		 */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			redirty_tail(inode, wb);
			continue;
		}
		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
			/*
			 * If this inode is locked for writeback and we are not
			 * doing writeback-for-data-integrity, move it to
			 * b_more_io so that writeback can proceed with the
			 * other inodes on s_io.
			 *
			 * We'll have another go at writing back this inode
			 * when we completed a full scan of b_io.
			 */
			spin_unlock(&inode->i_lock);
			requeue_io(inode, wb);
			trace_writeback_sb_inodes_requeue(inode);
			continue;
		}
		spin_unlock(&wb->list_lock);

		/*
		 * We already requeued the inode if it had I_SYNC set and we
		 * are doing WB_SYNC_NONE writeback. So this catches only the
		 * WB_SYNC_ALL case.
		 */
		if (inode->i_state & I_SYNC) {
			/* Wait for I_SYNC. This function drops i_lock... */
			inode_sleep_on_writeback(inode);
			/* Inode may be gone, start again */
			spin_lock(&wb->list_lock);
			continue;
		}
		inode->i_state |= I_SYNC;
		spin_unlock(&inode->i_lock);

		write_chunk = writeback_chunk_size(wb->bdi, work);
		wbc.nr_to_write = write_chunk;
		wbc.pages_skipped = 0;

		/*
		 * We use I_SYNC to pin the inode in memory. While it is set
		 * evict_inode() will wait so the inode cannot be freed.
		 */
		__writeback_single_inode(inode, &wbc);

		work->nr_pages -= write_chunk - wbc.nr_to_write;
		wrote += write_chunk - wbc.nr_to_write;
		spin_lock(&wb->list_lock);
		spin_lock(&inode->i_lock);
		if (!(inode->i_state & I_DIRTY))
			wrote++;
		requeue_inode(inode, wb, &wbc);
		inode_sync_complete(inode);
		spin_unlock(&inode->i_lock);
		cond_resched_lock(&wb->list_lock);
		/*
		 * bail out to wb_writeback() often enough to check
		 * background threshold and other termination conditions.
		 */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	return wrote;
}

static long __writeback_inodes_wb(struct bdi_writeback *wb,
				  struct wb_writeback_work *work)
{
	unsigned long start_time = jiffies;
	long wrote = 0;

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);
		struct super_block *sb = inode->i_sb;

		if (!grab_super_passive(sb)) {
			/*
			 * grab_super_passive() may fail consistently due to
			 * s_umount being grabbed by someone else. Don't use
			 * requeue_io() to avoid busy retrying the inode/sb.
			 */
			redirty_tail(inode, wb);
			continue;
		}
		wrote += writeback_sb_inodes(sb, wb, work);
		drop_super(sb);

		/* refer to the same tests at the end of writeback_sb_inodes */
		if (wrote) {
			if (time_is_before_jiffies(start_time + HZ / 10UL))
				break;
			if (work->nr_pages <= 0)
				break;
		}
	}
	/* Leave any unwritten inodes on b_io */
	return wrote;
}

long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
			 enum wb_reason reason)
{
	struct wb_writeback_work work = {
		.nr_pages = nr_pages,
		.sync_mode = WB_SYNC_NONE,
		.range_cyclic = 1,
		.reason = reason,
	};

	spin_lock(&wb->list_lock);
	if (list_empty(&wb->b_io))
		queue_io(wb, &work);
	__writeback_inodes_wb(wb, &work);
	spin_unlock(&wb->list_lock);

	return nr_pages - work.nr_pages;
}

static bool over_bground_thresh(struct backing_dev_info *bdi)
{
	unsigned long background_thresh, dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	if (global_page_state(NR_FILE_DIRTY) +
	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
		return true;

	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
	    bdi_dirty_limit(bdi, background_thresh))
		return true;

	return false;
}

/*
 * Called under wb->list_lock. If there are multiple wbs per bdi,
 * only the flusher working on the first wb should do it.
 */
static void wb_update_bandwidth(struct bdi_writeback *wb,
				unsigned long start_time)
{
	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space. So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval. But if a writeback event
 * takes longer than one dirty_writeback_interval period, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	unsigned long wb_start = jiffies;
	long nr_pages = work->nr_pages;
	unsigned long oldest_jif;
	struct inode *inode;
	long progress;

	oldest_jif = jiffies;
	work->older_than_this = &oldest_jif;

	spin_lock(&wb->list_lock);
	for (;;) {
		/*
		 * Stop writeback when nr_pages has been consumed
		 */
		if (work->nr_pages <= 0)
			break;

		/*
		 * Background writeout and kupdate-style writeback may
		 * run forever. Stop them if there is other work to do
		 * so that e.g. sync can proceed. They'll be restarted
		 * after the other works are all done.
		 */
		if ((work->for_background || work->for_kupdate) &&
		    !list_empty(&wb->bdi->work_list))
			break;

		/*
		 * For background writeout, stop when we are below the
		 * background dirty threshold
		 */
		if (work->for_background && !over_bground_thresh(wb->bdi))
			break;

		/*
		 * Kupdate and background works are special and we want to
		 * include all inodes that need writing. Livelock avoidance is
		 * handled by these works yielding to any other work so we are
		 * safe.
		 */
		if (work->for_kupdate) {
			oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
		} else if (work->for_background)
			oldest_jif = jiffies;

		trace_writeback_start(wb->bdi, work);
		if (list_empty(&wb->b_io))
			queue_io(wb, work);
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);
		else
			progress = __writeback_inodes_wb(wb, work);
		trace_writeback_written(wb->bdi, work);

		wb_update_bandwidth(wb, wb_start);

		/*
		 * Did we write something? Try for more
		 *
		 * Dirty inodes are moved to b_io for writeback in batches.
		 * The completion of the current batch does not necessarily
		 * mean the overall work is done. So we keep looping as long
		 * as we made some progress on cleaning pages or inodes.
		 */
		if (progress)
			continue;
		/*
		 * No more inodes for IO, bail
		 */
		if (list_empty(&wb->b_more_io))
			break;
		/*
		 * Nothing written. Wait for some inode to
		 * become available for writeback. Otherwise
		 * we'll just busyloop.
		 */
		if (!list_empty(&wb->b_more_io)) {
			trace_writeback_wait(wb->bdi, work);
			inode = wb_inode(wb->b_more_io.prev);
			spin_lock(&inode->i_lock);
			spin_unlock(&wb->list_lock);
			/* This function drops i_lock... */
			inode_sleep_on_writeback(inode);
			spin_lock(&wb->list_lock);
		}
	}
	spin_unlock(&wb->list_lock);

	return nr_pages - work->nr_pages;
}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
static struct wb_writeback_work *
get_next_work_item(struct backing_dev_info *bdi)
{
	struct wb_writeback_work *work = NULL;

	spin_lock_bh(&bdi->wb_lock);
	if (!list_empty(&bdi->work_list)) {
		work = list_entry(bdi->work_list.next,
				  struct wb_writeback_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_bh(&bdi->wb_lock);
	return work;
}

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
	return global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) +
		get_nr_dirty_inodes();
}

static long wb_check_background_flush(struct bdi_writeback *wb)
{
	if (over_bground_thresh(wb->bdi)) {

		struct wb_writeback_work work = {
			.nr_pages = LONG_MAX,
			.sync_mode = WB_SYNC_NONE,
			.for_background = 1,
			.range_cyclic = 1,
			.reason = WB_REASON_BACKGROUND,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	/*
	 * When set to zero, disable periodic writeback
	 */
	if (!dirty_writeback_interval)
		return 0;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = get_nr_dirty_pages();

	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages = nr_pages,
			.sync_mode = WB_SYNC_NONE,
			.for_kupdate = 1,
			.range_cyclic = 1,
			.reason = WB_REASON_PERIODIC,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}
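/*
 * For illustration: dirty_writeback_interval is expressed in hundredths of a
 * second (default 500, i.e. 5 seconds), so msecs_to_jiffies(interval * 10)
 * above converts centiseconds to jiffies when computing the next expiry of
 * the periodic (kupdate-style) flush.
 */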

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(BDI_writeback_running, &wb->bdi->state);
	while ((work = get_next_work_item(bdi)) != NULL) {
		/*
		 * Override sync mode, in case we must wait for completion
		 * because this thread is exiting now.
		 */
		if (force_wait)
			work->sync_mode = WB_SYNC_ALL;

		trace_writeback_exec(bdi, work);

		wrote += wb_writeback(wb, work);

		/*
		 * Notify the caller of completion if this is a synchronous
		 * work item, otherwise just free it.
		 */
		if (work->done)
			complete(work->done);
		else
			kfree(work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);
	wrote += wb_check_background_flush(wb);
	clear_bit(BDI_writeback_running, &wb->bdi->state);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_thread(void *data)
{
	struct bdi_writeback *wb = data;
	struct backing_dev_info *bdi = wb->bdi;
	long pages_written;

	set_worker_desc("flush-%s", dev_name(bdi->dev));
	current->flags |= PF_SWAPWRITE;
	set_freezable();
	wb->last_active = jiffies;

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	trace_writeback_thread_start(bdi);

	while (!kthread_freezable_should_stop(NULL)) {
		/*
		 * Remove own delayed wake-up timer, since we are already awake
		 * and we'll take care of the periodic write-back.
		 */
		del_timer(&wb->wakeup_timer);

		pages_written = wb_do_writeback(wb, 0);

		trace_writeback_pages_written(pages_written);

		if (pages_written)
			wb->last_active = jiffies;

		set_current_state(TASK_INTERRUPTIBLE);
		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			continue;
		}

		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
		else {
			/*
			 * We have nothing to do, so we can go to sleep
			 * without any timeout and save power. When work is
			 * queued or something is dirtied, we will be woken up.
			 */
			schedule();
		}
	}

	/* Flush any work that raced with us exiting */
	if (!list_empty(&bdi->work_list))
		wb_do_writeback(wb, 1);

	trace_writeback_thread_stop(bdi);
	return 0;
}


/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
{
	struct backing_dev_info *bdi;

	if (!nr_pages) {
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (!bdi_has_dirty_io(bdi))
			continue;
		__bdi_start_writeback(bdi, nr_pages, false, reason);
	}
	rcu_read_unlock();
}

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}

/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;
	struct backing_dev_info *bdi = NULL;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		trace_writeback_dirty_inode_start(inode, flags);

		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode, flags);

		trace_writeback_dirty_inode(inode, flags);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode->i_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out_unlock_inode;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list.  Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (inode_unhashed(inode))
				goto out_unlock_inode;
		}
		if (inode->i_state & I_FREEING)
			goto out_unlock_inode;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			bool wakeup_bdi = false;
			bdi = inode_to_bdi(inode);

			if (bdi_cap_writeback_dirty(bdi)) {
				WARN(!test_bit(BDI_registered, &bdi->state),
				     "bdi-%s not registered\n", bdi->name);

				/*
				 * If this is the first dirty inode for this
				 * bdi, we have to wake-up the corresponding
				 * bdi thread to make sure background
				 * write-back happens later.
				 */
				if (!wb_has_dirty_io(&bdi->wb))
					wakeup_bdi = true;
			}

			spin_unlock(&inode->i_lock);
			spin_lock(&bdi->wb.list_lock);
			inode->dirtied_when = jiffies;
			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
			spin_unlock(&bdi->wb.list_lock);

			if (wakeup_bdi)
				bdi_wakeup_thread_delayed(bdi);
			return;
		}
	}
out_unlock_inode:
	spin_unlock(&inode->i_lock);

}
EXPORT_SYMBOL(__mark_inode_dirty);

static void wait_sb_inodes(struct super_block *sb)
{
	struct inode *inode, *old_inode = NULL;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	spin_lock(&inode_sb_list_lock);

	/*
	 * Data integrity sync. Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync
	 * call, but which had writeout started before we write it out.
	 * In which case, the inode may not be on the dirty list, but
	 * we still have to wait for that writeout.
	 */
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;

		spin_lock(&inode->i_lock);
		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
		    (mapping->nrpages == 0)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);

		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * inode_sb_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * inode_sb_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;

		filemap_fdatawait(mapping);

		cond_resched();

		spin_lock(&inode_sb_list_lock);
	}
	spin_unlock(&inode_sb_list_lock);
	iput(old_inode);
}

/**
 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
			    unsigned long nr,
			    enum wb_reason reason)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb = sb,
		.sync_mode = WB_SYNC_NONE,
		.tagged_writepages = 1,
		.done = &done,
		.nr_pages = nr,
		.reason = reason,
	};

	if (sb->s_bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));
	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);
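/*
 * Illustrative caller sketch (hypothetical): a filesystem that wants to push
 * out roughly 1024 dirty pages without waiting for the IO to complete might
 * do:
 *
 *	down_read(&sb->s_umount);
 *	writeback_inodes_sb_nr(sb, 1024, WB_REASON_FS_FREE_SPACE);
 *	up_read(&sb->s_umount);
 *
 * The s_umount read lock satisfies the WARN_ON() above; the call queues
 * WB_SYNC_NONE work and returns once the flusher has processed it, without
 * waiting for the submitted IO to finish.
 */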

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: the reason of writeback
 *
 * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
 */
int try_to_writeback_inodes_sb_nr(struct super_block *sb,
				  unsigned long nr,
				  enum wb_reason reason)
{
	if (writeback_in_progress(sb->s_bdi))
		return 1;

	if (!down_read_trylock(&sb->s_umount))
		return 0;

	writeback_inodes_sb_nr(sb, nr, reason);
	up_read(&sb->s_umount);
	return 1;
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Implemented via try_to_writeback_inodes_sb_nr().
 * Returns 1 if writeback was started, 0 if not.
 */
int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb = sb,
		.sync_mode = WB_SYNC_ALL,
		.nr_pages = LONG_MAX,
		.range_cyclic = 0,
		.done = &done,
		.reason = WB_REASON_SYNC,
	};

	/* Nothing to do? */
	if (sb->s_bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);

	wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	return writeback_single_inode(inode, wb, &wbc);
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
}
EXPORT_SYMBOL(sync_inode);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
	struct writeback_control wbc = {
		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
		.nr_to_write = 0,	/* metadata-only */
	};

	return sync_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);