// SPDX-License-Identifier: GPL-2.0
/*
 * fs/mpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to preparing and submitting BIOs which contain
 * multiple pagecache pages.
 *
 * 15May2002	Andrew Morton
 *		Initial version
 * 27Jun2002	[email protected]
 *		use bio_add_page() to build bio's just the right size
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_folio().
 *
 * Why is this? If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard. See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_read_end_io(struct bio *bio)
{
	struct folio_iter fi;
	int err = blk_status_to_errno(bio->bi_status);

	bio_for_each_folio_all(fi, bio) {
		if (err)
			folio_set_error(fi.folio);
		else
			folio_mark_uptodate(fi.folio);
		folio_unlock(fi.folio);
	}

	bio_put(bio);
}

static void mpage_write_end_io(struct bio *bio)
{
	struct folio_iter fi;
	int err = blk_status_to_errno(bio->bi_status);

	bio_for_each_folio_all(fi, bio) {
		if (err) {
			folio_set_error(fi.folio);
			mapping_set_error(fi.folio->mapping, err);
		}
		folio_end_writeback(fi.folio);
	}

	bio_put(bio);
}

/*
 * Both submit helpers return NULL so that callers can submit and reset
 * their bio pointer in a single assignment.
 */
static struct bio *mpage_bio_submit_read(struct bio *bio)
{
	bio->bi_end_io = mpage_read_end_io;
	guard_bio_eod(bio);
	submit_bio(bio);
	return NULL;
}

static struct bio *mpage_bio_submit_write(struct bio *bio)
{
	bio->bi_end_io = mpage_write_end_io;
	guard_bio_eod(bio);
	submit_bio(bio);
	return NULL;
}

/*
 * Support function for mpage_readahead().  The filesystem-supplied get_block
 * might return an up-to-date buffer.  This is used to map that buffer into
 * the page, which allows read_folio to avoid triggering a duplicate call
 * to get_block.
 *
 * The idea is to avoid adding buffers to pages that don't already have
 * them.  So when the buffer is up to date and the page size == block size,
 * this marks the page up to date instead of adding new buffers.
 */
static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
		int page_block)
{
	struct inode *inode = folio->mapping->host;
	struct buffer_head *page_bh, *head;
	int block = 0;

	head = folio_buffers(folio);
	if (!head) {
		/*
		 * Don't make any buffers if there is only one buffer on
		 * the folio and the folio just needs to be set up to date.
		 */
		if (inode->i_blkbits == PAGE_SHIFT &&
		    buffer_uptodate(bh)) {
			folio_mark_uptodate(folio);
			return;
		}
		head = create_empty_buffers(folio, i_blocksize(inode), 0);
	}

	page_bh = head;
	do {
		if (block == page_block) {
			page_bh->b_state = bh->b_state;
			page_bh->b_bdev = bh->b_bdev;
			page_bh->b_blocknr = bh->b_blocknr;
			break;
		}
		page_bh = page_bh->b_this_page;
		block++;
	} while (page_bh != head);
}

struct mpage_readpage_args {
	struct bio *bio;
	struct folio *folio;
	unsigned int nr_pages;
	bool is_readahead;
	sector_t last_block_in_bio;
	struct buffer_head map_bh;		/* result of the last get_block() call */
	unsigned long first_logical_block;	/* file block that map_bh describes */
	get_block_t *get_block;
};

/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks, constructing the largest possible BIOs and submitting them for IO
 * when the blocks stop being contiguous on disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
	struct folio *folio = args->folio;
	struct inode *inode = folio->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *map_bh = &args->map_bh;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t first_block;
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	blk_opf_t opf = REQ_OP_READ;
	unsigned nblocks;
	unsigned relative_block;
	gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);

	/* This code assumes single-page folios (MAX_BUF_PER_PAGE, for example) */
	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

	if (args->is_readahead) {
		opf |= REQ_RAHEAD;
		gfp |= __GFP_NORETRY | __GFP_NOWARN;
	}

	if (folio_buffers(folio))
		goto confused;

	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
	last_block = block_in_file + args->nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_block() call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	if (buffer_mapped(map_bh) &&
	    block_in_file > args->first_logical_block &&
	    block_in_file < (args->first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - args->first_logical_block;
		unsigned last = nblocks - map_offset;

		first_block = map_bh->b_blocknr + map_offset;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_page)
				break;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_block() calls until we are done with this folio.
	 */
	map_bh->b_folio = folio;
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (args->get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			args->first_logical_block = block_in_file;
		}

		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		/*
		 * Some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_folio copies the data
		 * we just collected from get_block into the folio's buffers
		 * so read_folio doesn't have to repeat the get_block call.
		 */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_folio(folio, map_bh, page_block);
			goto confused;
		}

		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (!page_block)
			first_block = map_bh->b_blocknr;
		else if (first_block + page_block != map_bh->b_blocknr)
			goto confused;
		nblocks = map_bh->b_size >> blkbits;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	if (first_hole != blocks_per_page) {
		folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
		if (first_hole == 0) {
			folio_mark_uptodate(folio);
			folio_unlock(folio);
			goto out;
		}
	} else if (fully_mapped) {
		folio_set_mappedtodisk(folio);
	}

	/*
	 * This folio will go to BIO.  Do we need to send this BIO off first?
	 */
	if (args->bio && (args->last_block_in_bio != first_block - 1))
		args->bio = mpage_bio_submit_read(args->bio);

alloc_new:
	if (args->bio == NULL) {
		args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf,
				      gfp);
		if (args->bio == NULL)
			goto confused;
		args->bio->bi_iter.bi_sector = first_block << (blkbits - 9);
	}

	length = first_hole << blkbits;
	if (!bio_add_folio(args->bio, folio, length, 0)) {
		args->bio = mpage_bio_submit_read(args->bio);
		goto alloc_new;
	}

	relative_block = block_in_file - args->first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
	    (first_hole != blocks_per_page))
		args->bio = mpage_bio_submit_read(args->bio);
	else
		args->last_block_in_bio = first_block + blocks_per_page - 1;
out:
	return args->bio;

confused:
	if (args->bio)
		args->bio = mpage_bio_submit_read(args->bio);
	if (!folio_test_uptodate(folio))
		block_read_full_folio(folio, args->get_block);
	else
		folio_unlock(folio);
	goto out;
}

/**
 * mpage_readahead - start reads against pages
 * @rac: Describes which pages to read.
 * @get_block: The filesystem's block mapper function.
 *
 * This function walks the pages and the blocks within each page, building and
 * emitting large BIOs.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 * BH_Boundary explanation:
 *
 * There is a problem.  The mpage read code assembles several pages, gets all
 * their disk mappings, and then submits them all.  That's fine, but obtaining
 * the disk mappings may require I/O.  Reads of indirect blocks, for example.
 *
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 *
 *	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
 *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
 * So what we do is allow the filesystem's get_block() function to set
 * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
 * after this one will require I/O against a block which is probably close to
 * this one.  So you should push what I/O you have currently accumulated.
 *
 * This all causes the disk requests to be issued in the correct order.
 */
void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
	struct folio *folio;
	struct mpage_readpage_args args = {
		.get_block = get_block,
		.is_readahead = true,
	};

	while ((folio = readahead_folio(rac))) {
		prefetchw(&folio->flags);
		args.folio = folio;
		args.nr_pages = readahead_count(rac);
		args.bio = do_mpage_readpage(&args);
	}
	if (args.bio)
		mpage_bio_submit_read(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);
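
/*
 * Example (illustrative only, not part of this file): a filesystem that
 * already implements a get_block_t can use mpage_readahead() directly as
 * the body of its ->readahead address_space operation.  "myfs_readahead"
 * and "myfs_get_block" are hypothetical names; ext2's ext2_readahead()
 * follows the same pattern:
 *
 *	static void myfs_readahead(struct readahead_control *rac)
 *	{
 *		mpage_readahead(rac, myfs_get_block);
 *	}
 */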

/*
 * This isn't called much at all
 */
int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
	struct mpage_readpage_args args = {
		.folio = folio,
		.nr_pages = 1,
		.get_block = get_block,
	};

	args.bio = do_mpage_readpage(&args);
	if (args.bio)
		mpage_bio_submit_read(args.bio);
	return 0;
}
EXPORT_SYMBOL(mpage_read_folio);
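
/*
 * Example (illustrative only): the matching ->read_folio wiring for the
 * hypothetical filesystem above.  The ->read_folio operation receives a
 * struct file which mpage_read_folio() does not need, so a trivial
 * wrapper suffices:
 *
 *	static int myfs_read_folio(struct file *file, struct folio *folio)
 *	{
 *		return mpage_read_folio(folio, myfs_get_block);
 *	}
 */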

/*
 * Writing is not so simple.
 *
 * If the page has buffers then they will be used for obtaining the disk
 * mapping.  We only support pages which are fully mapped-and-dirty, with a
 * special case for pages which are unmapped at the end: end-of-file.
 *
 * If the page has no buffers (preferred) then the page is mapped here.
 *
 * If all blocks are found to be contiguous then the page can go into the
 * BIO.  Otherwise fall back to block_write_full_folio().
 *
 * FIXME: This code wants an estimate of how many pages are still to be
 * written, so it can intelligently allocate a suitably-sized BIO.  For now,
 * just allocate full-size (BIO_MAX_VECS) BIOs.
 */

struct mpage_data {
	struct bio *bio;
	sector_t last_block_in_bio;
	get_block_t *get_block;
};

/*
 * We have our BIO, so we can now mark the buffers clean.  Make
 * sure to only clean buffers which we know we'll be writing.
 */
static void clean_buffers(struct folio *folio, unsigned first_unmapped)
{
	unsigned buffer_counter = 0;
	struct buffer_head *bh, *head = folio_buffers(folio);

	if (!head)
		return;
	bh = head;

	do {
		if (buffer_counter++ == first_unmapped)
			break;
		clear_buffer_dirty(bh);
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * We cannot drop the bh if the page is not uptodate, or a concurrent
	 * read_folio would fail to serialize with the bh and it would read
	 * from disk before we reach the platter.
	 */
	if (buffer_heads_over_limit && folio_test_uptodate(folio))
		try_to_free_buffers(folio);
}

static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
		      void *data)
{
	struct mpage_data *mpd = data;
	struct bio *bio = mpd->bio;
	struct address_space *mapping = folio->mapping;
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	sector_t last_block;
	sector_t block_in_file;
	sector_t first_block;
	unsigned page_block;
	unsigned first_unmapped = blocks_per_page;
	struct block_device *bdev = NULL;
	int boundary = 0;
	sector_t boundary_block = 0;
	struct block_device *boundary_bdev = NULL;
	size_t length;
	struct buffer_head map_bh;
	loff_t i_size = i_size_read(inode);
	int ret = 0;
	struct buffer_head *head = folio_buffers(folio);

	if (head) {
		struct buffer_head *bh = head;

		/* If they're all mapped and dirty, do it */
		page_block = 0;
		do {
			BUG_ON(buffer_locked(bh));
			if (!buffer_mapped(bh)) {
				/*
				 * unmapped dirty buffers are created by
				 * block_dirty_folio -> mmapped data
				 */
				if (buffer_dirty(bh))
					goto confused;
				if (first_unmapped == blocks_per_page)
					first_unmapped = page_block;
				continue;
			}

			if (first_unmapped != blocks_per_page)
				goto confused;	/* hole -> non-hole */

			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
				goto confused;
			if (page_block) {
				if (bh->b_blocknr != first_block + page_block)
					goto confused;
			} else {
				first_block = bh->b_blocknr;
			}
			page_block++;
			boundary = buffer_boundary(bh);
			if (boundary) {
				boundary_block = bh->b_blocknr;
				boundary_bdev = bh->b_bdev;
			}
			bdev = bh->b_bdev;
		} while ((bh = bh->b_this_page) != head);

		if (first_unmapped)
			goto page_is_mapped;

		/*
		 * Page has buffers, but they are all unmapped. The page was
		 * created by pagein or read over a hole which was handled by
		 * block_read_full_folio().  If this address_space is also
		 * using mpage_readahead then this can rarely happen.
		 */
		goto confused;
	}

	/*
	 * The page has no buffers: map it to disk
	 */
	BUG_ON(!folio_test_uptodate(folio));
	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
	/*
	 * Whole page beyond EOF?  Skip allocating blocks to avoid leaking
	 * space.
	 */
	if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits)
		goto page_is_mapped;
	last_block = (i_size - 1) >> blkbits;
	map_bh.b_folio = folio;
	for (page_block = 0; page_block < blocks_per_page; ) {

		map_bh.b_state = 0;
		map_bh.b_size = 1 << blkbits;
		if (mpd->get_block(inode, block_in_file, &map_bh, 1))
			goto confused;
		if (!buffer_mapped(&map_bh))
			goto confused;
		if (buffer_new(&map_bh))
			clean_bdev_bh_alias(&map_bh);
		if (buffer_boundary(&map_bh)) {
			boundary_block = map_bh.b_blocknr;
			boundary_bdev = map_bh.b_bdev;
		}
		if (page_block) {
			if (map_bh.b_blocknr != first_block + page_block)
				goto confused;
		} else {
			first_block = map_bh.b_blocknr;
		}
		page_block++;
		boundary = buffer_boundary(&map_bh);
		bdev = map_bh.b_bdev;
		if (block_in_file == last_block)
			break;
		block_in_file++;
	}
	BUG_ON(page_block == 0);

	first_unmapped = page_block;

page_is_mapped:
	/* Don't bother writing beyond EOF, truncate will discard the folio */
	if (folio_pos(folio) >= i_size)
		goto confused;
	length = folio_size(folio);
	if (folio_pos(folio) + length > i_size) {
		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining memory
		 * is zeroed when mapped, and writes to that region are not
		 * written out to the file."
		 */
		length = i_size - folio_pos(folio);
		folio_zero_segment(folio, length, folio_size(folio));
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && mpd->last_block_in_bio != first_block - 1)
		bio = mpage_bio_submit_write(bio);

alloc_new:
	if (bio == NULL) {
		bio = bio_alloc(bdev, BIO_MAX_VECS,
				REQ_OP_WRITE | wbc_to_write_flags(wbc),
				GFP_NOFS);
		bio->bi_iter.bi_sector = first_block << (blkbits - 9);
		wbc_init_bio(wbc, bio);
	}

	/*
	 * Must try to add the page before marking the buffer clean or
	 * the confused fail path above (OOM) will be very confused when
	 * it finds all bh marked clean (i.e. it will not write anything)
	 */
	wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
	length = first_unmapped << blkbits;
	if (!bio_add_folio(bio, folio, length, 0)) {
		bio = mpage_bio_submit_write(bio);
		goto alloc_new;
	}

	clean_buffers(folio, first_unmapped);

	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);
	folio_unlock(folio);
	if (boundary || (first_unmapped != blocks_per_page)) {
		bio = mpage_bio_submit_write(bio);
		if (boundary_block) {
			write_boundary_block(boundary_bdev,
					boundary_block, 1 << blkbits);
		}
	} else {
		mpd->last_block_in_bio = first_block + blocks_per_page - 1;
	}
	goto out;

confused:
	if (bio)
		bio = mpage_bio_submit_write(bio);

	/*
	 * The caller has a ref on the inode, so *mapping is stable
	 */
	ret = block_write_full_folio(folio, wbc, mpd->get_block);
	mapping_set_error(mapping, ret);
out:
	mpd->bio = bio;
	return ret;
}

/**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */
int
mpage_writepages(struct address_space *mapping,
		struct writeback_control *wbc, get_block_t get_block)
{
	struct mpage_data mpd = {
		.get_block = get_block,
	};
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
	if (mpd.bio)
		mpage_bio_submit_write(mpd.bio);
	blk_finish_plug(&plug);
	return ret;
}
EXPORT_SYMBOL(mpage_writepages);
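
/*
 * Example (illustrative only): completing the hypothetical "myfs" wiring
 * from the read-side examples above, a filesystem can implement its
 * ->writepages operation as a one-line wrapper around mpage_writepages():
 *
 *	static int myfs_writepages(struct address_space *mapping,
 *				   struct writeback_control *wbc)
 *	{
 *		return mpage_writepages(mapping, wbc, myfs_get_block);
 *	}
 *
 * The block plug inside mpage_writepages() batches the per-folio BIO
 * submissions, so the caller needs no additional plugging.
 */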