Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	// SPDX-License-Identifier: GPL-2.0
	2	/*
	3	* High-level sync()-related operations
	4	*/
	5
	6	#include <linux/kernel.h>
	7	#include <linux/file.h>
	8	#include <linux/fs.h>
	9	#include <linux/slab.h>
	10	#include <linux/export.h>
	11	#include <linux/namei.h>
	12	#include <linux/sched.h>
	13	#include <linux/writeback.h>
	14	#include <linux/syscalls.h>
	15	#include <linux/linkage.h>
	16	#include <linux/pagemap.h>
	17	#include <linux/quotaops.h>
	18	#include <linux/backing-dev.h>
	19	#include "internal.h"
	20
	21	#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE\| \
	22	SYNC_FILE_RANGE_WAIT_AFTER)
	23
	24	/*
	25	* Do the filesystem syncing work. For simple filesystems
	26	* writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
	27	* submit IO for these buffers via __sync_blockdev(). This also speeds up the
	28	* wait == 1 case since in that case write_inode() functions do
	29	* sync_dirty_buffer() and thus effectively write one block at a time.
	30	*/
	31	static int __sync_filesystem(struct super_block *sb, int wait)
	32	{
	33	if (wait)
	34	sync_inodes_sb(sb);
	35	else
	36	writeback_inodes_sb(sb, WB_REASON_SYNC);
	37
	38	if (sb->s_op->sync_fs)
	39	sb->s_op->sync_fs(sb, wait);
	40	return __sync_blockdev(sb->s_bdev, wait);
	41	}
	42
	43	/*
	44	* Write out and wait upon all dirty data associated with this
	45	* superblock. Filesystem data as well as the underlying block
	46	* device. Takes the superblock lock.
	47	*/
	48	int sync_filesystem(struct super_block *sb)
	49	{
	50	int ret;
	51
	52	/*
	53	* We need to be protected against the filesystem going from
	54	* r/o to r/w or vice versa.
	55	*/
	56	WARN_ON(!rwsem_is_locked(&sb->s_umount));
	57
	58	/*
	59	* No point in syncing out anything if the filesystem is read-only.
	60	*/
	61	if (sb_rdonly(sb))
	62	return 0;
	63
	64	ret = __sync_filesystem(sb, 0);
	65	if (ret < 0)
	66	return ret;
	67	return __sync_filesystem(sb, 1);
	68	}
	69	EXPORT_SYMBOL(sync_filesystem);
	70
	71	static void sync_inodes_one_sb(struct super_block sb, void arg)
	72	{
	73	if (!sb_rdonly(sb))
	74	sync_inodes_sb(sb);
	75	}
	76
	77	static void sync_fs_one_sb(struct super_block sb, void arg)
	78	{
	79	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
	80	sb->s_op->sync_fs(sb, (int )arg);
	81	}
	82
	83	static void fdatawrite_one_bdev(struct block_device bdev, void arg)
	84	{
	85	filemap_fdatawrite(bdev->bd_inode->i_mapping);
	86	}
	87
	88	static void fdatawait_one_bdev(struct block_device bdev, void arg)
	89	{
	90	/*
	91	* We keep the error status of individual mapping so that
	92	* applications can catch the writeback error using fsync(2).
	93	* See filemap_fdatawait_keep_errors() for details.
	94	*/
	95	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
	96	}
	97
	98	/*
	99	* Sync everything. We start by waking flusher threads so that most of
	100	* writeback runs on all devices in parallel. Then we sync all inodes reliably
	101	* which effectively also waits for all flusher threads to finish doing
	102	* writeback. At this point all data is on disk so metadata should be stable
	103	* and we tell filesystems to sync their metadata via ->sync_fs() calls.
	104	* Finally, we writeout all block devices because some filesystems (e.g. ext2)
	105	* just write metadata (such as inodes or bitmaps) to block device page cache
	106	* and do not sync it on their own in ->sync_fs().
	107	*/
	108	void ksys_sync(void)
	109	{
	110	int nowait = 0, wait = 1;
	111
	112	wakeup_flusher_threads(WB_REASON_SYNC);
	113	iterate_supers(sync_inodes_one_sb, NULL);
	114	iterate_supers(sync_fs_one_sb, &nowait);
	115	iterate_supers(sync_fs_one_sb, &wait);
	116	iterate_bdevs(fdatawrite_one_bdev, NULL);
	117	iterate_bdevs(fdatawait_one_bdev, NULL);
	118	if (unlikely(laptop_mode))
	119	laptop_sync_completion();
	120	}
	121
	122	SYSCALL_DEFINE0(sync)
	123	{
	124	ksys_sync();
	125	return 0;
	126	}
	127
	128	static void do_sync_work(struct work_struct *work)
	129	{
	130	int nowait = 0;
	131
	132	/*
	133	* Sync twice to reduce the possibility we skipped some inodes / pages
	134	* because they were temporarily locked
	135	*/
	136	iterate_supers(sync_inodes_one_sb, &nowait);
	137	iterate_supers(sync_fs_one_sb, &nowait);
	138	iterate_bdevs(fdatawrite_one_bdev, NULL);
	139	iterate_supers(sync_inodes_one_sb, &nowait);
	140	iterate_supers(sync_fs_one_sb, &nowait);
	141	iterate_bdevs(fdatawrite_one_bdev, NULL);
	142	printk("Emergency Sync complete\n");
	143	kfree(work);
	144	}
	145
	146	void emergency_sync(void)
	147	{
	148	struct work_struct *work;
	149
	150	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	151	if (work) {
	152	INIT_WORK(work, do_sync_work);
	153	schedule_work(work);
	154	}
	155	}
	156
	157	/*
	158	* sync a single super
	159	*/
	160	SYSCALL_DEFINE1(syncfs, int, fd)
	161	{
	162	struct fd f = fdget(fd);
	163	struct super_block *sb;
	164	int ret;
	165
	166	if (!f.file)
	167	return -EBADF;
	168	sb = f.file->f_path.dentry->d_sb;
	169
	170	down_read(&sb->s_umount);
	171	ret = sync_filesystem(sb);
	172	up_read(&sb->s_umount);
	173
	174	fdput(f);
	175	return ret;
	176	}
	177
	178	/**
	179	* vfs_fsync_range - helper to sync a range of data & metadata to disk
	180	* @file: file to sync
	181	* @start: offset in bytes of the beginning of data range to sync
	182	* @end: offset in bytes of the end of data range (inclusive)
	183	* @datasync: perform only datasync
	184	*
	185	* Write back data in range @start..@end and metadata for @file to disk. If
	186	* @datasync is set only metadata needed to access modified file data is
	187	* written.
	188	*/
	189	int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
	190	{
	191	struct inode *inode = file->f_mapping->host;
	192
	193	if (!file->f_op->fsync)
	194	return -EINVAL;
	195	if (!datasync && (inode->i_state & I_DIRTY_TIME))
	196	mark_inode_dirty_sync(inode);
	197	return file->f_op->fsync(file, start, end, datasync);
	198	}
	199	EXPORT_SYMBOL(vfs_fsync_range);
	200
	201	/**
	202	* vfs_fsync - perform a fsync or fdatasync on a file
	203	* @file: file to sync
	204	* @datasync: only perform a fdatasync operation
	205	*
	206	* Write back data and metadata for @file to disk. If @datasync is
	207	* set only metadata needed to access modified file data is written.
	208	*/
	209	int vfs_fsync(struct file *file, int datasync)
	210	{
	211	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
	212	}
	213	EXPORT_SYMBOL(vfs_fsync);
	214
	215	static int do_fsync(unsigned int fd, int datasync)
	216	{
	217	struct fd f = fdget(fd);
	218	int ret = -EBADF;
	219
	220	if (f.file) {
	221	ret = vfs_fsync(f.file, datasync);
	222	fdput(f);
	223	}
	224	return ret;
	225	}
	226
	227	SYSCALL_DEFINE1(fsync, unsigned int, fd)
	228	{
	229	return do_fsync(fd, 0);
	230	}
	231
	232	SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
	233	{
	234	return do_fsync(fd, 1);
	235	}
	236
	237	/*
	238	* sys_sync_file_range() permits finely controlled syncing over a segment of
	239	* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
	240	* zero then sys_sync_file_range() will operate from offset out to EOF.
	241	*
	242	* The flag bits are:
	243	*
	244	* SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
	245	* before performing the write.
	246	*
	247	* SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
	248	* range which are not presently under writeback. Note that this may block for
	249	* significant periods due to exhaustion of disk request structures.
	250	*
	251	* SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
	252	* after performing the write.
	253	*
	254	* Useful combinations of the flag bits are:
	255	*
	256	* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE: ensures that all pages
	257	* in the range which were dirty on entry to sys_sync_file_range() are placed
	258	* under writeout. This is a start-write-for-data-integrity operation.
	259	*
	260	* SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
	261	* are not presently under writeout. This is an asynchronous flush-to-disk
	262	* operation. Not suitable for data integrity operations.
	263	*
	264	* SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
	265	* completion of writeout of all pages in the range. This will be used after an
	266	* earlier SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE operation to wait
	267	* for that operation to complete and to return the result.
	268	*
	269	* SYNC_FILE_RANGE_WAIT_BEFORE\|SYNC_FILE_RANGE_WRITE\|SYNC_FILE_RANGE_WAIT_AFTER:
	270	* a traditional sync() operation. This is a write-for-data-integrity operation
	271	* which will ensure that all pages in the range which were dirty on entry to
	272	* sys_sync_file_range() are committed to disk.
	273	*
	274	*
	275	* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
	276	* I/O errors or ENOSPC conditions and will return those to the caller, after
	277	* clearing the EIO and ENOSPC flags in the address_space.
	278	*
	279	* It should be noted that none of these operations write out the file's
	280	* metadata. So unless the application is strictly performing overwrites of
	281	* already-instantiated disk blocks, there are no guarantees here that the data
	282	* will be available after a crash.
	283	*/
	284	int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
	285	unsigned int flags)
	286	{
	287	int ret;
	288	struct fd f;
	289	struct address_space *mapping;
	290	loff_t endbyte; /* inclusive */
	291	umode_t i_mode;
	292
	293	ret = -EINVAL;
	294	if (flags & ~VALID_FLAGS)
	295	goto out;
	296
	297	endbyte = offset + nbytes;
	298
	299	if ((s64)offset < 0)
	300	goto out;
	301	if ((s64)endbyte < 0)
	302	goto out;
	303	if (endbyte < offset)
	304	goto out;
	305
	306	if (sizeof(pgoff_t) == 4) {
	307	if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
	308	/*
	309	* The range starts outside a 32 bit machine's
	310	* pagecache addressing capabilities. Let it "succeed"
	311	*/
	312	ret = 0;
	313	goto out;
	314	}
	315	if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
	316	/*
	317	* Out to EOF
	318	*/
	319	nbytes = 0;
	320	}
	321	}
	322
	323	if (nbytes == 0)
	324	endbyte = LLONG_MAX;
	325	else
	326	endbyte--; /* inclusive */
	327
	328	ret = -EBADF;
	329	f = fdget(fd);
	330	if (!f.file)
	331	goto out;
	332
	333	i_mode = file_inode(f.file)->i_mode;
	334	ret = -ESPIPE;
	335	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
	336	!S_ISLNK(i_mode))
	337	goto out_put;
	338
	339	mapping = f.file->f_mapping;
	340	ret = 0;
	341	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
	342	ret = file_fdatawait_range(f.file, offset, endbyte);
	343	if (ret < 0)
	344	goto out_put;
	345	}
	346
	347	if (flags & SYNC_FILE_RANGE_WRITE) {
	348	ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
	349	WB_SYNC_NONE);
	350	if (ret < 0)
	351	goto out_put;
	352	}
	353
	354	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
	355	ret = file_fdatawait_range(f.file, offset, endbyte);
	356
	357	out_put:
	358	fdput(f);
	359	out:
	360	return ret;
	361	}
	362
	363	SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
	364	unsigned int, flags)
	365	{
	366	return ksys_sync_file_range(fd, offset, nbytes, flags);
	367	}
	368
	369	/* It would be nice if people remember that not all the world's an i386
	370	when they introduce new system calls */
	371	SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
	372	loff_t, offset, loff_t, nbytes)
	373	{
	374	return ksys_sync_file_range(fd, offset, nbytes, flags);
	375	}