Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* High-level sync()-related operations
	3	*/
	4
	5	#include <linux/kernel.h>
	6	#include <linux/file.h>
	7	#include <linux/fs.h>
	8	#include <linux/slab.h>
	9	#include <linux/export.h>
	10	#include <linux/namei.h>
	11	#include <linux/sched.h>
	12	#include <linux/writeback.h>
	13	#include <linux/syscalls.h>
	14	#include <linux/linkage.h>
	15	#include <linux/pagemap.h>
	16	#include <linux/quotaops.h>
	17	#include <linux/backing-dev.h>
	18	#include "internal.h"
	19
	/*
	 * Every flag bit accepted by sync_file_range(); a request carrying any
	 * other bit is rejected with -EINVAL.
	 */
	#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | \
			     SYNC_FILE_RANGE_WAIT_AFTER)
	22
	23	/*
	24	* Do the filesystem syncing work. For simple filesystems
	25	* writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
	26	* submit IO for these buffers via __sync_blockdev(). This also speeds up the
	27	* wait == 1 case since in that case write_inode() functions do
	28	* sync_dirty_buffer() and thus effectively write one block at a time.
	29	*/
	30	static int __sync_filesystem(struct super_block *sb, int wait)
	31	{
	32	if (wait)
	33	sync_inodes_sb(sb);
	34	else
	35	writeback_inodes_sb(sb, WB_REASON_SYNC);
	36
	37	if (sb->s_op->sync_fs)
	38	sb->s_op->sync_fs(sb, wait);
	39	return __sync_blockdev(sb->s_bdev, wait);
	40	}
	41
	42	/*
	43	* Write out and wait upon all dirty data associated with this
	44	* superblock. Filesystem data as well as the underlying block
	45	* device. Takes the superblock lock.
	46	*/
	47	int sync_filesystem(struct super_block *sb)
	48	{
	49	int ret;
	50
	51	/*
	52	* We need to be protected against the filesystem going from
	53	* r/o to r/w or vice versa.
	54	*/
	55	WARN_ON(!rwsem_is_locked(&sb->s_umount));
	56
	57	/*
	58	* No point in syncing out anything if the filesystem is read-only.
	59	*/
	60	if (sb->s_flags & MS_RDONLY)
	61	return 0;
	62
	63	ret = __sync_filesystem(sb, 0);
	64	if (ret < 0)
	65	return ret;
	66	return __sync_filesystem(sb, 1);
	67	}
	68	EXPORT_SYMBOL(sync_filesystem);
	69
	70	static void sync_inodes_one_sb(struct super_block sb, void arg)
	71	{
	72	if (!(sb->s_flags & MS_RDONLY))
	73	sync_inodes_sb(sb);
	74	}
	75
	76	static void sync_fs_one_sb(struct super_block sb, void arg)
	77	{
	78	if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs)
	79	sb->s_op->sync_fs(sb, (int )arg);
	80	}
	81
	82	static void fdatawrite_one_bdev(struct block_device bdev, void arg)
	83	{
	84	filemap_fdatawrite(bdev->bd_inode->i_mapping);
	85	}
	86
	87	static void fdatawait_one_bdev(struct block_device bdev, void arg)
	88	{
	89	/*
	90	* We keep the error status of individual mapping so that
	91	* applications can catch the writeback error using fsync(2).
	92	* See filemap_fdatawait_keep_errors() for details.
	93	*/
	94	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
	95	}
	96
	97	/*
	98	* Sync everything. We start by waking flusher threads so that most of
	99	* writeback runs on all devices in parallel. Then we sync all inodes reliably
	100	* which effectively also waits for all flusher threads to finish doing
	101	* writeback. At this point all data is on disk so metadata should be stable
	102	* and we tell filesystems to sync their metadata via ->sync_fs() calls.
	103	* Finally, we writeout all block devices because some filesystems (e.g. ext2)
	104	* just write metadata (such as inodes or bitmaps) to block device page cache
	105	* and do not sync it on their own in ->sync_fs().
	106	*/
	107	SYSCALL_DEFINE0(sync)
	108	{
	109	int nowait = 0, wait = 1;
	110
	111	wakeup_flusher_threads(0, WB_REASON_SYNC);
	112	iterate_supers(sync_inodes_one_sb, NULL);
	113	iterate_supers(sync_fs_one_sb, &nowait);
	114	iterate_supers(sync_fs_one_sb, &wait);
	115	iterate_bdevs(fdatawrite_one_bdev, NULL);
	116	iterate_bdevs(fdatawait_one_bdev, NULL);
	117	if (unlikely(laptop_mode))
	118	laptop_sync_completion();
	119	return 0;
	120	}
	121
	122	static void do_sync_work(struct work_struct *work)
	123	{
	124	int nowait = 0;
	125
	126	/*
	127	* Sync twice to reduce the possibility we skipped some inodes / pages
	128	* because they were temporarily locked
	129	*/
	130	iterate_supers(sync_inodes_one_sb, &nowait);
	131	iterate_supers(sync_fs_one_sb, &nowait);
	132	iterate_bdevs(fdatawrite_one_bdev, NULL);
	133	iterate_supers(sync_inodes_one_sb, &nowait);
	134	iterate_supers(sync_fs_one_sb, &nowait);
	135	iterate_bdevs(fdatawrite_one_bdev, NULL);
	136	printk("Emergency Sync complete\n");
	137	kfree(work);
	138	}
	139
	140	void emergency_sync(void)
	141	{
	142	struct work_struct *work;
	143
	144	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	145	if (work) {
	146	INIT_WORK(work, do_sync_work);
	147	schedule_work(work);
	148	}
	149	}
	150
	151	/*
	152	* sync a single super
	153	*/
	154	SYSCALL_DEFINE1(syncfs, int, fd)
	155	{
	156	struct fd f = fdget(fd);
	157	struct super_block *sb;
	158	int ret;
	159
	160	if (!f.file)
	161	return -EBADF;
	162	sb = f.file->f_path.dentry->d_sb;
	163
	164	down_read(&sb->s_umount);
	165	ret = sync_filesystem(sb);
	166	up_read(&sb->s_umount);
	167
	168	fdput(f);
	169	return ret;
	170	}
	171
	172	/**
	173	* vfs_fsync_range - helper to sync a range of data & metadata to disk
	174	* @file: file to sync
	175	* @start: offset in bytes of the beginning of data range to sync
	176	* @end: offset in bytes of the end of data range (inclusive)
	177	* @datasync: perform only datasync
	178	*
	179	* Write back data in range @start..@end and metadata for @file to disk. If
	180	* @datasync is set only metadata needed to access modified file data is
	181	* written.
	182	*/
	183	int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
	184	{
	185	struct inode *inode = file->f_mapping->host;
	186
	187	if (!file->f_op->fsync)
	188	return -EINVAL;
	189	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
	190	spin_lock(&inode->i_lock);
	191	inode->i_state &= ~I_DIRTY_TIME;
	192	spin_unlock(&inode->i_lock);
	193	mark_inode_dirty_sync(inode);
	194	}
	195	return file->f_op->fsync(file, start, end, datasync);
	196	}
	197	EXPORT_SYMBOL(vfs_fsync_range);
	198
	199	/**
	200	* vfs_fsync - perform a fsync or fdatasync on a file
	201	* @file: file to sync
	202	* @datasync: only perform a fdatasync operation
	203	*
	204	* Write back data and metadata for @file to disk. If @datasync is
	205	* set only metadata needed to access modified file data is written.
	206	*/
	207	int vfs_fsync(struct file *file, int datasync)
	208	{
	209	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
	210	}
	211	EXPORT_SYMBOL(vfs_fsync);
	212
	213	static int do_fsync(unsigned int fd, int datasync)
	214	{
	215	struct fd f = fdget(fd);
	216	int ret = -EBADF;
	217
	218	if (f.file) {
	219	ret = vfs_fsync(f.file, datasync);
	220	fdput(f);
	221	}
	222	return ret;
	223	}
	224
	225	SYSCALL_DEFINE1(fsync, unsigned int, fd)
	226	{
	227	return do_fsync(fd, 0);
	228	}
	229
	230	SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
	231	{
	232	return do_fsync(fd, 1);
	233	}
	234
	235	/*
	236	* sys_sync_file_range() permits finely controlled syncing over a segment of
	237	* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
	238	* zero then sys_sync_file_range() will operate from offset out to EOF.
	239	*
	240	* The flag bits are:
	241	*
	242	* SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
	243	* before performing the write.
	244	*
	245	* SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
	246	* range which are not presently under writeback. Note that this may block for
	247	* significant periods due to exhaustion of disk request structures.
	248	*
	249	* SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
	250	* after performing the write.
	251	*
	252	* Useful combinations of the flag bits are:
	253	*
	254	* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE: ensures that all pages
	255	* in the range which were dirty on entry to sys_sync_file_range() are placed
	256	* under writeout. This is a start-write-for-data-integrity operation.
	257	*
	258	* SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
	259	* are not presently under writeout. This is an asynchronous flush-to-disk
	260	* operation. Not suitable for data integrity operations.
	261	*
	262	* SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
	263	* completion of writeout of all pages in the range. This will be used after an
	264	* earlier SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE operation to wait
	265	* for that operation to complete and to return the result.
	266	*
	267	* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE\|SYNC_FILE_RANGE_WAIT_AFTER:
	268	* a traditional sync() operation. This is a write-for-data-integrity operation
	269	* which will ensure that all pages in the range which were dirty on entry to
	270	* sys_sync_file_range() are committed to disk.
	271	*
	272	*
	273	* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
	274	* I/O errors or ENOSPC conditions and will return those to the caller, after
	275	* clearing the EIO and ENOSPC flags in the address_space.
	276	*
	277	* It should be noted that none of these operations write out the file's
	278	* metadata. So unless the application is strictly performing overwrites of
	279	* already-instantiated disk blocks, there are no guarantees here that the data
	280	* will be available after a crash.
	281	*/
	282	SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
	283	unsigned int, flags)
	284	{
	285	int ret;
	286	struct fd f;
	287	struct address_space *mapping;
	288	loff_t endbyte; /* inclusive */
	289	umode_t i_mode;
	290
	291	ret = -EINVAL;
	292	if (flags & ~VALID_FLAGS)
	293	goto out;
	294
	295	endbyte = offset + nbytes;
	296
	297	if ((s64)offset < 0)
	298	goto out;
	299	if ((s64)endbyte < 0)
	300	goto out;
	301	if (endbyte < offset)
	302	goto out;
	303
	304	if (sizeof(pgoff_t) == 4) {
	305	if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
	306	/*
	307	* The range starts outside a 32 bit machine's
	308	* pagecache addressing capabilities. Let it "succeed"
	309	*/
	310	ret = 0;
	311	goto out;
	312	}
	313	if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
	314	/*
	315	* Out to EOF
	316	*/
	317	nbytes = 0;
	318	}
	319	}
	320
	321	if (nbytes == 0)
	322	endbyte = LLONG_MAX;
	323	else
	324	endbyte--; /* inclusive */
	325
	326	ret = -EBADF;
	327	f = fdget(fd);
	328	if (!f.file)
	329	goto out;
	330
	331	i_mode = file_inode(f.file)->i_mode;
	332	ret = -ESPIPE;
	333	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
	334	!S_ISLNK(i_mode))
	335	goto out_put;
	336
	337	mapping = f.file->f_mapping;
	338	if (!mapping) {
	339	ret = -EINVAL;
	340	goto out_put;
	341	}
	342
	343	ret = 0;
	344	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
	345	ret = filemap_fdatawait_range(mapping, offset, endbyte);
	346	if (ret < 0)
	347	goto out_put;
	348	}
	349
	350	if (flags & SYNC_FILE_RANGE_WRITE) {
	351	ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
	352	WB_SYNC_NONE);
	353	if (ret < 0)
	354	goto out_put;
	355	}
	356
	357	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
	358	ret = filemap_fdatawait_range(mapping, offset, endbyte);
	359
	360	out_put:
	361	fdput(f);
	362	out:
	363	return ret;
	364	}
	365
	366	/* It would be nice if people remember that not all the world's an i386
	367	when they introduce new system calls */
	368	SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
	369	loff_t, offset, loff_t, nbytes)
	370	{
	371	return sys_sync_file_range(fd, offset, nbytes, flags);
	372	}