Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	// SPDX-License-Identifier: GPL-2.0-only
	2	/*
	3	* linux/fs/buffer.c
	4	*
	5	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
	6	*/
	7
	8	/*
	9	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
	10	*
	11	* Removed a lot of unnecessary code and simplified things now that
	12	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
	13	*
	14	* Speed up hash, lru, and free list operations. Use gfp() for allocating
	15	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
	16	*
	17	* Added 32k buffer block sizes - these are required by older ARM systems. - RMK
	18	*
	19	* async buffer flushing, 1999 Andrea Arcangeli <[email protected]>
	20	*/
	21
	22	#include <linux/kernel.h>
	23	#include <linux/sched/signal.h>
	24	#include <linux/syscalls.h>
	25	#include <linux/fs.h>
	26	#include <linux/iomap.h>
	27	#include <linux/mm.h>
	28	#include <linux/percpu.h>
	29	#include <linux/slab.h>
	30	#include <linux/capability.h>
	31	#include <linux/blkdev.h>
	32	#include <linux/file.h>
	33	#include <linux/quotaops.h>
	34	#include <linux/highmem.h>
	35	#include <linux/export.h>
	36	#include <linux/backing-dev.h>
	37	#include <linux/writeback.h>
	38	#include <linux/hash.h>
	39	#include <linux/suspend.h>
	40	#include <linux/buffer_head.h>
	41	#include <linux/task_io_accounting_ops.h>
	42	#include <linux/bio.h>
	43	#include <linux/cpu.h>
	44	#include <linux/bitops.h>
	45	#include <linux/mpage.h>
	46	#include <linux/bit_spinlock.h>
	47	#include <linux/pagevec.h>
	48	#include <linux/sched/mm.h>
	49	#include <trace/events/block.h>
	50	#include <linux/fscrypt.h>
	51	#include <linux/fsverity.h>
	52	#include <linux/sched/isolation.h>
	53
	54	#include "internal.h"
	55
	56	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
	57	static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
	58	struct writeback_control *wbc);
	59
	60	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
	61
	62	inline void touch_buffer(struct buffer_head *bh)
	63	{
	64	trace_block_touch_buffer(bh);
	65	folio_mark_accessed(bh->b_folio);
	66	}
	67	EXPORT_SYMBOL(touch_buffer);
	68
	69	void __lock_buffer(struct buffer_head *bh)
	70	{
	71	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
	72	}
	73	EXPORT_SYMBOL(__lock_buffer);
	74
	75	void unlock_buffer(struct buffer_head *bh)
	76	{
	77	clear_bit_unlock(BH_Lock, &bh->b_state);
	78	smp_mb__after_atomic();
	79	wake_up_bit(&bh->b_state, BH_Lock);
	80	}
	81	EXPORT_SYMBOL(unlock_buffer);
	82
	83	/*
	84	* Returns if the folio has dirty or writeback buffers. If all the buffers
	85	* are unlocked and clean then the folio_test_dirty information is stale. If
	86	* any of the buffers are locked, it is assumed they are locked for IO.
	87	*/
	88	void buffer_check_dirty_writeback(struct folio *folio,
	89	bool dirty, bool writeback)
	90	{
	91	struct buffer_head head, bh;
	92	*dirty = false;
	93	*writeback = false;
	94
	95	BUG_ON(!folio_test_locked(folio));
	96
	97	head = folio_buffers(folio);
	98	if (!head)
	99	return;
	100
	101	if (folio_test_writeback(folio))
	102	*writeback = true;
	103
	104	bh = head;
	105	do {
	106	if (buffer_locked(bh))
	107	*writeback = true;
	108
	109	if (buffer_dirty(bh))
	110	*dirty = true;
	111
	112	bh = bh->b_this_page;
	113	} while (bh != head);
	114	}
	115
	116	/*
	117	* Block until a buffer comes unlocked. This doesn't stop it
	118	* from becoming locked again - you have to lock it yourself
	119	* if you want to preserve its state.
	120	*/
	121	void __wait_on_buffer(struct buffer_head * bh)
	122	{
	123	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
	124	}
	125	EXPORT_SYMBOL(__wait_on_buffer);
	126
	127	static void buffer_io_error(struct buffer_head bh, char msg)
	128	{
	129	if (!test_bit(BH_Quiet, &bh->b_state))
	130	printk_ratelimited(KERN_ERR
	131	"Buffer I/O error on dev %pg, logical block %llu%s\n",
	132	bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
	133	}
	134
	/*
	 * Read-completion helper: record the outcome in the uptodate flag, then
	 * unlock the buffer.  It does not touch the bh after unlocking it.
	 *
	 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
	 * that race is benign: it only uses the bh's address for hashing after the
	 * unlock, so it does not actually touch the bh itself.
	 */
	static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
	{
		if (!uptodate) {
			/* This happens, due to failed read-ahead attempts. */
			clear_buffer_uptodate(bh);
		} else {
			set_buffer_uptodate(bh);
		}

		unlock_buffer(bh);
	}
	153
	/*
	 * Default end-of-IO handler for synchronous reads: update the uptodate
	 * flag, unlock the buffer, then drop one reference with put_bh().
	 */
	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
	{
		/* Sets/clears uptodate and unlocks the buffer. */
		__end_buffer_read_notouch(bh, uptodate);

		put_bh(bh);
	}
	EXPORT_SYMBOL(end_buffer_read_sync);
	164
	/*
	 * End-of-IO handler for synchronous writes.
	 *
	 * On failure the error is logged, recorded with
	 * mark_buffer_write_io_error() and the uptodate flag is cleared; on
	 * success the buffer is marked uptodate.  Either way the buffer is then
	 * unlocked and one reference is dropped.
	 */
	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
	{
		if (!uptodate) {
			buffer_io_error(bh, ", lost sync page write");
			mark_buffer_write_io_error(bh);
			clear_buffer_uptodate(bh);
		} else {
			set_buffer_uptodate(bh);
		}

		unlock_buffer(bh);
		put_bh(bh);
	}
	EXPORT_SYMBOL(end_buffer_write_sync);
	178
	179	/*
	180	* Various filesystems appear to want __find_get_block to be non-blocking.
	181	* But it's the page lock which protects the buffers. To get around this,
	182	* we get exclusion from try_to_free_buffers with the blockdev mapping's
	183	* private_lock.
	184	*
	185	* Hack idea: for the blockdev mapping, private_lock contention
	186	* may be quite high. This code could TryLock the page, and if that
	187	* succeeds, there is no need to take private_lock.
	188	*/
	189	static struct buffer_head *
	190	__find_get_block_slow(struct block_device *bdev, sector_t block)
	191	{
	192	struct inode *bd_inode = bdev->bd_inode;
	193	struct address_space *bd_mapping = bd_inode->i_mapping;
	194	struct buffer_head *ret = NULL;
	195	pgoff_t index;
	196	struct buffer_head *bh;
	197	struct buffer_head *head;
	198	struct folio *folio;
	199	int all_mapped = 1;
	200	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
	201
	202	index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
	203	folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
	204	if (IS_ERR(folio))
	205	goto out;
	206
	207	spin_lock(&bd_mapping->private_lock);
	208	head = folio_buffers(folio);
	209	if (!head)
	210	goto out_unlock;
	211	bh = head;
	212	do {
	213	if (!buffer_mapped(bh))
	214	all_mapped = 0;
	215	else if (bh->b_blocknr == block) {
	216	ret = bh;
	217	get_bh(bh);
	218	goto out_unlock;
	219	}
	220	bh = bh->b_this_page;
	221	} while (bh != head);
	222
	223	/* we might be here because some of the buffers on this page are
	224	* not mapped. This is due to various races between
	225	* file io on the block device and getblk. It gets dealt with
	226	* elsewhere, don't buffer_error if we had some unmapped buffers
	227	*/
	228	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
	229	if (all_mapped && __ratelimit(&last_warned)) {
	230	printk("__find_get_block_slow() failed. block=%llu, "
	231	"b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
	232	"device %pg blocksize: %d\n",
	233	(unsigned long long)block,
	234	(unsigned long long)bh->b_blocknr,
	235	bh->b_state, bh->b_size, bdev,
	236	1 << bd_inode->i_blkbits);
	237	}
	238	out_unlock:
	239	spin_unlock(&bd_mapping->private_lock);
	240	folio_put(folio);
	241	out:
	242	return ret;
	243	}
	244
	245	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
	246	{
	247	unsigned long flags;
	248	struct buffer_head *first;
	249	struct buffer_head *tmp;
	250	struct folio *folio;
	251	int folio_uptodate = 1;
	252
	253	BUG_ON(!buffer_async_read(bh));
	254
	255	folio = bh->b_folio;
	256	if (uptodate) {
	257	set_buffer_uptodate(bh);
	258	} else {
	259	clear_buffer_uptodate(bh);
	260	buffer_io_error(bh, ", async page read");
	261	folio_set_error(folio);
	262	}
	263
	264	/*
	265	* Be _very_ careful from here on. Bad things can happen if
	266	* two buffer heads end IO at almost the same time and both
	267	* decide that the page is now completely done.
	268	*/
	269	first = folio_buffers(folio);
	270	spin_lock_irqsave(&first->b_uptodate_lock, flags);
	271	clear_buffer_async_read(bh);
	272	unlock_buffer(bh);
	273	tmp = bh;
	274	do {
	275	if (!buffer_uptodate(tmp))
	276	folio_uptodate = 0;
	277	if (buffer_async_read(tmp)) {
	278	BUG_ON(!buffer_locked(tmp));
	279	goto still_busy;
	280	}
	281	tmp = tmp->b_this_page;
	282	} while (tmp != bh);
	283	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	284
	285	folio_end_read(folio, folio_uptodate);
	286	return;
	287
	288	still_busy:
	289	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	290	return;
	291	}
	292
	293	struct postprocess_bh_ctx {
	294	struct work_struct work;
	295	struct buffer_head *bh;
	296	};
	297
	298	static void verify_bh(struct work_struct *work)
	299	{
	300	struct postprocess_bh_ctx *ctx =
	301	container_of(work, struct postprocess_bh_ctx, work);
	302	struct buffer_head *bh = ctx->bh;
	303	bool valid;
	304
	305	valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
	306	end_buffer_async_read(bh, valid);
	307	kfree(ctx);
	308	}
	309
	310	static bool need_fsverity(struct buffer_head *bh)
	311	{
	312	struct folio *folio = bh->b_folio;
	313	struct inode *inode = folio->mapping->host;
	314
	315	return fsverity_active(inode) &&
	316	/* needed by ext4 */
	317	folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
	318	}
	319
	320	static void decrypt_bh(struct work_struct *work)
	321	{
	322	struct postprocess_bh_ctx *ctx =
	323	container_of(work, struct postprocess_bh_ctx, work);
	324	struct buffer_head *bh = ctx->bh;
	325	int err;
	326
	327	err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
	328	bh_offset(bh));
	329	if (err == 0 && need_fsverity(bh)) {
	330	/*
	331	* We use different work queues for decryption and for verity
	332	* because verity may require reading metadata pages that need
	333	* decryption, and we shouldn't recurse to the same workqueue.
	334	*/
	335	INIT_WORK(&ctx->work, verify_bh);
	336	fsverity_enqueue_verify_work(&ctx->work);
	337	return;
	338	}
	339	end_buffer_async_read(bh, err == 0);
	340	kfree(ctx);
	341	}
	342
	343	/*
	344	* I/O completion handler for block_read_full_folio() - pages
	345	* which come unlocked at the end of I/O.
	346	*/
	347	static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
	348	{
	349	struct inode *inode = bh->b_folio->mapping->host;
	350	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
	351	bool verify = need_fsverity(bh);
	352
	353	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
	354	if (uptodate && (decrypt \|\| verify)) {
	355	struct postprocess_bh_ctx *ctx =
	356	kmalloc(sizeof(*ctx), GFP_ATOMIC);
	357
	358	if (ctx) {
	359	ctx->bh = bh;
	360	if (decrypt) {
	361	INIT_WORK(&ctx->work, decrypt_bh);
	362	fscrypt_enqueue_decrypt_work(&ctx->work);
	363	} else {
	364	INIT_WORK(&ctx->work, verify_bh);
	365	fsverity_enqueue_verify_work(&ctx->work);
	366	}
	367	return;
	368	}
	369	uptodate = 0;
	370	}
	371	end_buffer_async_read(bh, uptodate);
	372	}
	373
	374	/*
	375	* Completion handler for block_write_full_page() - pages which are unlocked
	376	* during I/O, and which have PageWriteback cleared upon I/O completion.
	377	*/
	378	void end_buffer_async_write(struct buffer_head *bh, int uptodate)
	379	{
	380	unsigned long flags;
	381	struct buffer_head *first;
	382	struct buffer_head *tmp;
	383	struct folio *folio;
	384
	385	BUG_ON(!buffer_async_write(bh));
	386
	387	folio = bh->b_folio;
	388	if (uptodate) {
	389	set_buffer_uptodate(bh);
	390	} else {
	391	buffer_io_error(bh, ", lost async page write");
	392	mark_buffer_write_io_error(bh);
	393	clear_buffer_uptodate(bh);
	394	folio_set_error(folio);
	395	}
	396
	397	first = folio_buffers(folio);
	398	spin_lock_irqsave(&first->b_uptodate_lock, flags);
	399
	400	clear_buffer_async_write(bh);
	401	unlock_buffer(bh);
	402	tmp = bh->b_this_page;
	403	while (tmp != bh) {
	404	if (buffer_async_write(tmp)) {
	405	BUG_ON(!buffer_locked(tmp));
	406	goto still_busy;
	407	}
	408	tmp = tmp->b_this_page;
	409	}
	410	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	411	folio_end_writeback(folio);
	412	return;
	413
	414	still_busy:
	415	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	416	return;
	417	}
	418	EXPORT_SYMBOL(end_buffer_async_write);
	419
	420	/*
	421	* If a page's buffers are under async readin (end_buffer_async_read
	422	* completion) then there is a possibility that another thread of
	423	* control could lock one of the buffers after it has completed
	424	* but while some of the other buffers have not completed. This
	425	* locked buffer would confuse end_buffer_async_read() into not unlocking
	426	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
	427	* that this buffer is not under async I/O.
	428	*
	429	* The page comes unlocked when it has no locked buffer_async buffers
	430	* left.
	431	*
	432	* PageLocked prevents anyone from starting new async read I/O on any of
	433	* the buffers.
	434	*
	435	* PageWriteback is used to prevent simultaneous writeout of the same
	436	* page.
	437	*
	438	* PageLocked prevents anyone from starting writeback of a page which is
	439	* under read I/O (PageWriteback is only ever set against a locked page).
	440	*/
	441	static void mark_buffer_async_read(struct buffer_head *bh)
	442	{
	443	bh->b_end_io = end_buffer_async_read_io;
	444	set_buffer_async_read(bh);
	445	}
	446
	447	static void mark_buffer_async_write_endio(struct buffer_head *bh,
	448	bh_end_io_t *handler)
	449	{
	450	bh->b_end_io = handler;
	451	set_buffer_async_write(bh);
	452	}
	453
	454	void mark_buffer_async_write(struct buffer_head *bh)
	455	{
	456	mark_buffer_async_write_endio(bh, end_buffer_async_write);
	457	}
	458	EXPORT_SYMBOL(mark_buffer_async_write);
	459
	460
	461	/*
	462	* fs/buffer.c contains helper functions for buffer-backed address space's
	463	* fsync functions. A common requirement for buffer-based filesystems is
	464	* that certain data from the backing blockdev needs to be written out for
	465	* a successful fsync(). For example, ext2 indirect blocks need to be
	466	* written back and waited upon before fsync() returns.
	467	*
	468	* The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
	469	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
	470	* management of a list of dependent buffers at ->i_mapping->private_list.
	471	*
	472	* Locking is a little subtle: try_to_free_buffers() will remove buffers
	473	* from their controlling inode's queue when they are being freed. But
	474	* try_to_free_buffers() will be operating against the blockdev mapping
	475	* at the time, not against the S_ISREG file which depends on those buffers.
	476	* So the locking for private_list is via the private_lock in the address_space
	477	* which backs the buffers. Which is different from the address_space
	478	* against which the buffers are listed. So for a particular address_space,
	479	* mapping->private_lock does not protect mapping->private_list! In fact,
	480	* mapping->private_list will always be protected by the backing blockdev's
	481	* ->private_lock.
	482	*
	483	* Which introduces a requirement: all buffers on an address_space's
	484	* ->private_list must be from the same address_space: the blockdev's.
	485	*
	486	* address_spaces which do not place buffers at ->private_list via these
	487	* utility functions are free to use private_lock and private_list for
	488	* whatever they want. The only requirement is that list_empty(private_list)
	489	* be true at clear_inode() time.
	490	*
	491	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
	492	* filesystems should do that. invalidate_inode_buffers() should just go
	493	* BUG_ON(!list_empty).
	494	*
	495	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
	496	* take an address_space, not an inode. And it should be called
	497	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
	498	* queued up.
	499	*
	500	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
	501	* list if it is already on a list. Because if the buffer is on a list,
	502	* it must already be on the right one. If not, the filesystem is being
	503	* silly. This will save a ton of locking. But first we have to ensure
	504	* that buffers are taken off the old inode's list when they are freed
	505	* (presumably in truncate). That requires careful auditing of all
	506	* filesystems (do it inside bforget()). It could also be done by bringing
	507	* b_inode back.
	508	*/
	509
	510	/*
	511	* The buffer's backing address_space's private_lock must be held
	512	*/
	513	static void __remove_assoc_queue(struct buffer_head *bh)
	514	{
	515	list_del_init(&bh->b_assoc_buffers);
	516	WARN_ON(!bh->b_assoc_map);
	517	bh->b_assoc_map = NULL;
	518	}
	519
	520	int inode_has_buffers(struct inode *inode)
	521	{
	522	return !list_empty(&inode->i_data.private_list);
	523	}
	524
	525	/*
	526	* osync is designed to support O_SYNC io. It waits synchronously for
	527	* all already-submitted IO to complete, but does not queue any new
	528	* writes to the disk.
	529	*
	530	* To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
	531	* as you dirty the buffers, and then use osync_inode_buffers to wait for
	532	* completion. Any other dirty buffers which are not yet queued for
	533	* write will not be flushed to disk by the osync.
	534	*/
	535	static int osync_buffers_list(spinlock_t lock, struct list_head list)
	536	{
	537	struct buffer_head *bh;
	538	struct list_head *p;
	539	int err = 0;
	540
	541	spin_lock(lock);
	542	repeat:
	543	list_for_each_prev(p, list) {
	544	bh = BH_ENTRY(p);
	545	if (buffer_locked(bh)) {
	546	get_bh(bh);
	547	spin_unlock(lock);
	548	wait_on_buffer(bh);
	549	if (!buffer_uptodate(bh))
	550	err = -EIO;
	551	brelse(bh);
	552	spin_lock(lock);
	553	goto repeat;
	554	}
	555	}
	556	spin_unlock(lock);
	557	return err;
	558	}
	559
	560	/**
	561	* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
	562	* @mapping: the mapping which wants those buffers written
	563	*
	564	* Starts I/O against the buffers at mapping->private_list, and waits upon
	565	* that I/O.
	566	*
	567	* Basically, this is a convenience function for fsync().
	568	* @mapping is a file or directory which needs those buffers to be written for
	569	* a successful fsync().
	570	*/
	571	int sync_mapping_buffers(struct address_space *mapping)
	572	{
	573	struct address_space *buffer_mapping = mapping->private_data;
	574
	575	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
	576	return 0;
	577
	578	return fsync_buffers_list(&buffer_mapping->private_lock,
	579	&mapping->private_list);
	580	}
	581	EXPORT_SYMBOL(sync_mapping_buffers);
	582
	583	/**
	584	* generic_buffers_fsync_noflush - generic buffer fsync implementation
	585	* for simple filesystems with no inode lock
	586	*
	587	* @file: file to synchronize
	588	* @start: start offset in bytes
	589	* @end: end offset in bytes (inclusive)
	590	* @datasync: only synchronize essential metadata if true
	591	*
	592	* This is a generic implementation of the fsync method for simple
	593	* filesystems which track all non-inode metadata in the buffers list
	594	* hanging off the address_space structure.
	595	*/
	596	int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
	597	bool datasync)
	598	{
	599	struct inode *inode = file->f_mapping->host;
	600	int err;
	601	int ret;
	602
	603	err = file_write_and_wait_range(file, start, end);
	604	if (err)
	605	return err;
	606
	607	ret = sync_mapping_buffers(inode->i_mapping);
	608	if (!(inode->i_state & I_DIRTY_ALL))
	609	goto out;
	610	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
	611	goto out;
	612
	613	err = sync_inode_metadata(inode, 1);
	614	if (ret == 0)
	615	ret = err;
	616
	617	out:
	618	/* check and advance again to catch errors after syncing out buffers */
	619	err = file_check_and_advance_wb_err(file);
	620	if (ret == 0)
	621	ret = err;
	622	return ret;
	623	}
	624	EXPORT_SYMBOL(generic_buffers_fsync_noflush);
	625
	626	/**
	627	* generic_buffers_fsync - generic buffer fsync implementation
	628	* for simple filesystems with no inode lock
	629	*
	630	* @file: file to synchronize
	631	* @start: start offset in bytes
	632	* @end: end offset in bytes (inclusive)
	633	* @datasync: only synchronize essential metadata if true
	634	*
	635	* This is a generic implementation of the fsync method for simple
	636	* filesystems which track all non-inode metadata in the buffers list
	637	* hanging off the address_space structure. This also makes sure that
	638	* a device cache flush operation is called at the end.
	639	*/
	640	int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
	641	bool datasync)
	642	{
	643	struct inode *inode = file->f_mapping->host;
	644	int ret;
	645
	646	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
	647	if (!ret)
	648	ret = blkdev_issue_flush(inode->i_sb->s_bdev);
	649	return ret;
	650	}
	651	EXPORT_SYMBOL(generic_buffers_fsync);
	652
	653	/*
	654	* Called when we've recently written block `bblock', and it is known that
	655	* `bblock' was for a buffer_boundary() buffer. This means that the block at
	656	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
	657	* dirty, schedule it for IO. So that indirects merge nicely with their data.
	658	*/
	659	void write_boundary_block(struct block_device *bdev,
	660	sector_t bblock, unsigned blocksize)
	661	{
	662	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
	663	if (bh) {
	664	if (buffer_dirty(bh))
	665	write_dirty_buffer(bh, 0);
	666	put_bh(bh);
	667	}
	668	}
	669
	670	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
	671	{
	672	struct address_space *mapping = inode->i_mapping;
	673	struct address_space *buffer_mapping = bh->b_folio->mapping;
	674
	675	mark_buffer_dirty(bh);
	676	if (!mapping->private_data) {
	677	mapping->private_data = buffer_mapping;
	678	} else {
	679	BUG_ON(mapping->private_data != buffer_mapping);
	680	}
	681	if (!bh->b_assoc_map) {
	682	spin_lock(&buffer_mapping->private_lock);
	683	list_move_tail(&bh->b_assoc_buffers,
	684	&mapping->private_list);
	685	bh->b_assoc_map = mapping;
	686	spin_unlock(&buffer_mapping->private_lock);
	687	}
	688	}
	689	EXPORT_SYMBOL(mark_buffer_dirty_inode);
	690
	691	/*
	692	* Add a page to the dirty page list.
	693	*
	694	* It is a sad fact of life that this function is called from several places
	695	* deeply under spinlocking. It may not sleep.
	696	*
	697	* If the page has buffers, the uptodate buffers are set dirty, to preserve
	698	* dirty-state coherency between the page and the buffers. It the page does
	699	* not have buffers then when they are later attached they will all be set
	700	* dirty.
	701	*
	702	* The buffers are dirtied before the page is dirtied. There's a small race
	703	* window in which a writepage caller may see the page cleanness but not the
	704	* buffer dirtiness. That's fine. If this code were to set the page dirty
	705	* before the buffers, a concurrent writepage caller could clear the page dirty
	706	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
	707	* page on the dirty page list.
	708	*
	709	* We use private_lock to lock against try_to_free_buffers while using the
	710	* page's buffer list. Also use this to protect against clean buffers being
	711	* added to the page after it was set dirty.
	712	*
	713	* FIXME: may need to call ->reservepage here as well. That's rather up to the
	714	* address_space though.
	715	*/
	716	bool block_dirty_folio(struct address_space mapping, struct folio folio)
	717	{
	718	struct buffer_head *head;
	719	bool newly_dirty;
	720
	721	spin_lock(&mapping->private_lock);
	722	head = folio_buffers(folio);
	723	if (head) {
	724	struct buffer_head *bh = head;
	725
	726	do {
	727	set_buffer_dirty(bh);
	728	bh = bh->b_this_page;
	729	} while (bh != head);
	730	}
	731	/*
	732	* Lock out page's memcg migration to keep PageDirty
	733	* synchronized with per-memcg dirty page counters.
	734	*/
	735	folio_memcg_lock(folio);
	736	newly_dirty = !folio_test_set_dirty(folio);
	737	spin_unlock(&mapping->private_lock);
	738
	739	if (newly_dirty)
	740	__folio_mark_dirty(folio, mapping, 1);
	741
	742	folio_memcg_unlock(folio);
	743
	744	if (newly_dirty)
	745	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	746
	747	return newly_dirty;
	748	}
	749	EXPORT_SYMBOL(block_dirty_folio);
	750
	751	/*
	752	* Write out and wait upon a list of buffers.
	753	*
	754	* We have conflicting pressures: we want to make sure that all
	755	* initially dirty buffers get waited on, but that any subsequently
	756	* dirtied buffers don't. After all, we don't want fsync to last
	757	* forever if somebody is actively writing to the file.
	758	*
	759	* Do this in two main stages: first we copy dirty buffers to a
	760	* temporary inode list, queueing the writes as we go. Then we clean
	761	* up, waiting for those writes to complete.
	762	*
	763	* During this second stage, any subsequent updates to the file may end
	764	* up refiling the buffer on the original inode's dirty list again, so
	765	* there is a chance we will end up with a buffer queued for write but
	766	* not yet completed on that list. So, as a final cleanup we go through
	767	* the osync code to catch these locked, dirty buffers without requeuing
	768	* any newly dirty buffers for write.
	769	*/
	770	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
	771	{
	772	struct buffer_head *bh;
	773	struct list_head tmp;
	774	struct address_space *mapping;
	775	int err = 0, err2;
	776	struct blk_plug plug;
	777
	778	INIT_LIST_HEAD(&tmp);
	779	blk_start_plug(&plug);
	780
	781	spin_lock(lock);
	782	while (!list_empty(list)) {
	783	bh = BH_ENTRY(list->next);
	784	mapping = bh->b_assoc_map;
	785	__remove_assoc_queue(bh);
	786	/* Avoid race with mark_buffer_dirty_inode() which does
	787	* a lockless check and we rely on seeing the dirty bit */
	788	smp_mb();
	789	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
	790	list_add(&bh->b_assoc_buffers, &tmp);
	791	bh->b_assoc_map = mapping;
	792	if (buffer_dirty(bh)) {
	793	get_bh(bh);
	794	spin_unlock(lock);
	795	/*
	796	* Ensure any pending I/O completes so that
	797	* write_dirty_buffer() actually writes the
	798	* current contents - it is a noop if I/O is
	799	* still in flight on potentially older
	800	* contents.
	801	*/
	802	write_dirty_buffer(bh, REQ_SYNC);
	803
	804	/*
	805	* Kick off IO for the previous mapping. Note
	806	* that we will not run the very last mapping,
	807	* wait_on_buffer() will do that for us
	808	* through sync_buffer().
	809	*/
	810	brelse(bh);
	811	spin_lock(lock);
	812	}
	813	}
	814	}
	815
	816	spin_unlock(lock);
	817	blk_finish_plug(&plug);
	818	spin_lock(lock);
	819
	820	while (!list_empty(&tmp)) {
	821	bh = BH_ENTRY(tmp.prev);
	822	get_bh(bh);
	823	mapping = bh->b_assoc_map;
	824	__remove_assoc_queue(bh);
	825	/* Avoid race with mark_buffer_dirty_inode() which does
	826	* a lockless check and we rely on seeing the dirty bit */
	827	smp_mb();
	828	if (buffer_dirty(bh)) {
	829	list_add(&bh->b_assoc_buffers,
	830	&mapping->private_list);
	831	bh->b_assoc_map = mapping;
	832	}
	833	spin_unlock(lock);
	834	wait_on_buffer(bh);
	835	if (!buffer_uptodate(bh))
	836	err = -EIO;
	837	brelse(bh);
	838	spin_lock(lock);
	839	}
	840
	841	spin_unlock(lock);
	842	err2 = osync_buffers_list(lock, list);
	843	if (err)
	844	return err;
	845	else
	846	return err2;
	847	}
	848
	849	/*
	850	* Invalidate any and all dirty buffers on a given inode. We are
	851	* probably unmounting the fs, but that doesn't mean we have already
	852	* done a sync(). Just drop the buffers from the inode list.
	853	*
	854	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
	855	* assumes that all the buffers are against the blockdev. Not true
	856	* for reiserfs.
	857	*/
	858	void invalidate_inode_buffers(struct inode *inode)
	859	{
	860	if (inode_has_buffers(inode)) {
	861	struct address_space *mapping = &inode->i_data;
	862	struct list_head *list = &mapping->private_list;
	863	struct address_space *buffer_mapping = mapping->private_data;
	864
	865	spin_lock(&buffer_mapping->private_lock);
	866	while (!list_empty(list))
	867	__remove_assoc_queue(BH_ENTRY(list->next));
	868	spin_unlock(&buffer_mapping->private_lock);
	869	}
	870	}
	871	EXPORT_SYMBOL(invalidate_inode_buffers);
	872
	873	/*
	874	* Remove any clean buffers from the inode's buffer list. This is called
	875	* when we're trying to free the inode itself. Those buffers can pin it.
	876	*
	877	* Returns true if all buffers were removed.
	878	*/
	879	int remove_inode_buffers(struct inode *inode)
	880	{
	881	int ret = 1;
	882
	883	if (inode_has_buffers(inode)) {
	884	struct address_space *mapping = &inode->i_data;
	885	struct list_head *list = &mapping->private_list;
	886	struct address_space *buffer_mapping = mapping->private_data;
	887
	888	spin_lock(&buffer_mapping->private_lock);
	889	while (!list_empty(list)) {
	890	struct buffer_head *bh = BH_ENTRY(list->next);
	891	if (buffer_dirty(bh)) {
	892	ret = 0;
	893	break;
	894	}
	895	__remove_assoc_queue(bh);
	896	}
	897	spin_unlock(&buffer_mapping->private_lock);
	898	}
	899	return ret;
	900	}
	901
	902	/*
	903	* Create the appropriate buffers when given a folio for data area and
	904	* the size of each buffer.. Use the bh->b_this_page linked list to
	905	* follow the buffers created. Return NULL if unable to create more
	906	* buffers.
	907	*
	908	* The retry flag is used to differentiate async IO (paging, swapping)
	909	* which may not fail from ordinary buffer allocations.
	910	*/
	911	struct buffer_head folio_alloc_buffers(struct folio folio, unsigned long size,
	912	gfp_t gfp)
	913	{
	914	struct buffer_head bh, head;
	915	long offset;
	916	struct mem_cgroup memcg, old_memcg;
	917
	918	/* The folio lock pins the memcg */
	919	memcg = folio_memcg(folio);
	920	old_memcg = set_active_memcg(memcg);
	921
	922	head = NULL;
	923	offset = folio_size(folio);
	924	while ((offset -= size) >= 0) {
	925	bh = alloc_buffer_head(gfp);
	926	if (!bh)
	927	goto no_grow;
	928
	929	bh->b_this_page = head;
	930	bh->b_blocknr = -1;
	931	head = bh;
	932
	933	bh->b_size = size;
	934
	935	/* Link the buffer to its folio */
	936	folio_set_bh(bh, folio, offset);
	937	}
	938	out:
	939	set_active_memcg(old_memcg);
	940	return head;
	941	/*
	942	* In case anything failed, we just free everything we got.
	943	*/
	944	no_grow:
	945	if (head) {
	946	do {
	947	bh = head;
	948	head = head->b_this_page;
	949	free_buffer_head(bh);
	950	} while (head);
	951	}
	952
	953	goto out;
	954	}
	955	EXPORT_SYMBOL_GPL(folio_alloc_buffers);
	956
	957	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
	958	bool retry)
	959	{
	960	gfp_t gfp = GFP_NOFS \| __GFP_ACCOUNT;
	961	if (retry)
	962	gfp \|= __GFP_NOFAIL;
	963
	964	return folio_alloc_buffers(page_folio(page), size, gfp);
	965	}
	966	EXPORT_SYMBOL_GPL(alloc_page_buffers);
	967
	968	static inline void link_dev_buffers(struct folio *folio,
	969	struct buffer_head *head)
	970	{
	971	struct buffer_head bh, tail;
	972
	973	bh = head;
	974	do {
	975	tail = bh;
	976	bh = bh->b_this_page;
	977	} while (bh);
	978	tail->b_this_page = head;
	979	folio_attach_private(folio, head);
	980	}
	981
	982	static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
	983	{
	984	sector_t retval = ~((sector_t)0);
	985	loff_t sz = bdev_nr_bytes(bdev);
	986
	987	if (sz) {
	988	unsigned int sizebits = blksize_bits(size);
	989	retval = (sz >> sizebits);
	990	}
	991	return retval;
	992	}
	993
	994	/*
	995	* Initialise the state of a blockdev folio's buffers.
	996	*/
	997	static sector_t folio_init_buffers(struct folio *folio,
	998	struct block_device *bdev, unsigned size)
	999	{
	1000	struct buffer_head *head = folio_buffers(folio);
	1001	struct buffer_head *bh = head;
	1002	bool uptodate = folio_test_uptodate(folio);
	1003	sector_t block = div_u64(folio_pos(folio), size);
	1004	sector_t end_block = blkdev_max_block(bdev, size);
	1005
	1006	do {
	1007	if (!buffer_mapped(bh)) {
	1008	bh->b_end_io = NULL;
	1009	bh->b_private = NULL;
	1010	bh->b_bdev = bdev;
	1011	bh->b_blocknr = block;
	1012	if (uptodate)
	1013	set_buffer_uptodate(bh);
	1014	if (block < end_block)
	1015	set_buffer_mapped(bh);
	1016	}
	1017	block++;
	1018	bh = bh->b_this_page;
	1019	} while (bh != head);
	1020
	1021	/*
	1022	* Caller needs to validate requested block against end of device.
	1023	*/
	1024	return end_block;
	1025	}
	1026
	1027	/*
	1028	* Create the page-cache folio that contains the requested block.
	1029	*
	1030	* This is used purely for blockdev mappings.
	1031	*
	1032	* Returns false if we have a 'permanent' failure. Returns true if
	1033	* we succeeded, or the caller should retry.
	1034	*/
	1035	static bool grow_dev_folio(struct block_device *bdev, sector_t block,
	1036	pgoff_t index, unsigned size, gfp_t gfp)
	1037	{
	1038	struct inode *inode = bdev->bd_inode;
	1039	struct folio *folio;
	1040	struct buffer_head *bh;
	1041	sector_t end_block = 0;
	1042
	1043	folio = __filemap_get_folio(inode->i_mapping, index,
	1044	FGP_LOCK \| FGP_ACCESSED \| FGP_CREAT, gfp);
	1045	if (IS_ERR(folio))
	1046	return false;
	1047
	1048	bh = folio_buffers(folio);
	1049	if (bh) {
	1050	if (bh->b_size == size) {
	1051	end_block = folio_init_buffers(folio, bdev, size);
	1052	goto unlock;
	1053	}
	1054
	1055	/* Caller should retry if this call fails */
	1056	end_block = ~0ULL;
	1057	if (!try_to_free_buffers(folio))
	1058	goto unlock;
	1059	}
	1060
	1061	bh = folio_alloc_buffers(folio, size, gfp \| __GFP_ACCOUNT);
	1062	if (!bh)
	1063	goto unlock;
	1064
	1065	/*
	1066	* Link the folio to the buffers and initialise them. Take the
	1067	* lock to be atomic wrt __find_get_block(), which does not
	1068	* run under the folio lock.
	1069	*/
	1070	spin_lock(&inode->i_mapping->private_lock);
	1071	link_dev_buffers(folio, bh);
	1072	end_block = folio_init_buffers(folio, bdev, size);
	1073	spin_unlock(&inode->i_mapping->private_lock);
	1074	unlock:
	1075	folio_unlock(folio);
	1076	folio_put(folio);
	1077	return block < end_block;
	1078	}
	1079
	1080	/*
	1081	* Create buffers for the specified block device block's folio. If
	1082	* that folio was dirty, the buffers are set dirty also. Returns false
	1083	* if we've hit a permanent error.
	1084	*/
	1085	static bool grow_buffers(struct block_device *bdev, sector_t block,
	1086	unsigned size, gfp_t gfp)
	1087	{
	1088	loff_t pos;
	1089
	1090	/*
	1091	* Check for a block which lies outside our maximum possible
	1092	* pagecache index.
	1093	*/
	1094	if (check_mul_overflow(block, (sector_t)size, &pos) \|\| pos > MAX_LFS_FILESIZE) {
	1095	printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
	1096	__func__, (unsigned long long)block,
	1097	bdev);
	1098	return false;
	1099	}
	1100
	1101	/* Create a folio with the proper size buffers */
	1102	return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
	1103	}
	1104
	1105	static struct buffer_head *
	1106	__getblk_slow(struct block_device *bdev, sector_t block,
	1107	unsigned size, gfp_t gfp)
	1108	{
	1109	/* Size must be multiple of hard sectorsize */
	1110	if (unlikely(size & (bdev_logical_block_size(bdev)-1) \|\|
	1111	(size < 512 \|\| size > PAGE_SIZE))) {
	1112	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
	1113	size);
	1114	printk(KERN_ERR "logical block size: %d\n",
	1115	bdev_logical_block_size(bdev));
	1116
	1117	dump_stack();
	1118	return NULL;
	1119	}
	1120
	1121	for (;;) {
	1122	struct buffer_head *bh;
	1123
	1124	bh = __find_get_block(bdev, block, size);
	1125	if (bh)
	1126	return bh;
	1127
	1128	if (!grow_buffers(bdev, block, size, gfp))
	1129	return NULL;
	1130	}
	1131	}
	1132
	1133	/*
	1134	* The relationship between dirty buffers and dirty pages:
	1135	*
	1136	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
	1137	* the page is tagged dirty in the page cache.
	1138	*
	1139	* At all times, the dirtiness of the buffers represents the dirtiness of
	1140	* subsections of the page. If the page has buffers, the page dirty bit is
	1141	* merely a hint about the true dirty state.
	1142	*
	1143	* When a page is set dirty in its entirety, all its buffers are marked dirty
	1144	* (if the page has buffers).
	1145	*
	1146	* When a buffer is marked dirty, its page is dirtied, but the page's other
	1147	* buffers are not.
	1148	*
	1149	* Also. When blockdev buffers are explicitly read with bread(), they
	1150	* individually become uptodate. But their backing page remains not
	1151	* uptodate - even if all of its buffers are uptodate. A subsequent
	1152	* block_read_full_folio() against that folio will discover all the uptodate
	1153	* buffers, will set the folio uptodate and will perform no I/O.
	1154	*/
	1155
	1156	/**
	1157	* mark_buffer_dirty - mark a buffer_head as needing writeout
	1158	* @bh: the buffer_head to mark dirty
	1159	*
	1160	* mark_buffer_dirty() will set the dirty bit against the buffer, then set
	1161	* its backing page dirty, then tag the page as dirty in the page cache
	1162	* and then attach the address_space's inode to its superblock's dirty
	1163	* inode list.
	1164	*
	1165	* mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
	1166	* i_pages lock and mapping->host->i_lock.
	1167	*/
	1168	void mark_buffer_dirty(struct buffer_head *bh)
	1169	{
	1170	WARN_ON_ONCE(!buffer_uptodate(bh));
	1171
	1172	trace_block_dirty_buffer(bh);
	1173
	1174	/*
	1175	* Very carefully optimize the it-is-already-dirty case.
	1176	*
	1177	* Don't let the final "is it dirty" escape to before we
	1178	* perhaps modified the buffer.
	1179	*/
	1180	if (buffer_dirty(bh)) {
	1181	smp_mb();
	1182	if (buffer_dirty(bh))
	1183	return;
	1184	}
	1185
	1186	if (!test_set_buffer_dirty(bh)) {
	1187	struct folio *folio = bh->b_folio;
	1188	struct address_space *mapping = NULL;
	1189
	1190	folio_memcg_lock(folio);
	1191	if (!folio_test_set_dirty(folio)) {
	1192	mapping = folio->mapping;
	1193	if (mapping)
	1194	__folio_mark_dirty(folio, mapping, 0);
	1195	}
	1196	folio_memcg_unlock(folio);
	1197	if (mapping)
	1198	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	1199	}
	1200	}
	1201	EXPORT_SYMBOL(mark_buffer_dirty);
	1202
	1203	void mark_buffer_write_io_error(struct buffer_head *bh)
	1204	{
	1205	set_buffer_write_io_error(bh);
	1206	/* FIXME: do we need to set this in both places? */
	1207	if (bh->b_folio && bh->b_folio->mapping)
	1208	mapping_set_error(bh->b_folio->mapping, -EIO);
	1209	if (bh->b_assoc_map) {
	1210	mapping_set_error(bh->b_assoc_map, -EIO);
	1211	errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
	1212	}
	1213	}
	1214	EXPORT_SYMBOL(mark_buffer_write_io_error);
	1215
	1216	/*
	1217	* Decrement a buffer_head's reference count. If all buffers against a page
	1218	* have zero reference count, are clean and unlocked, and if the page is clean
	1219	* and unlocked then try_to_free_buffers() may strip the buffers from the page
	1220	* in preparation for freeing it (sometimes, rarely, buffers are removed from
	1221	* a page but it ends up not being freed, and buffers may later be reattached).
	1222	*/
	1223	void __brelse(struct buffer_head * buf)
	1224	{
	1225	if (atomic_read(&buf->b_count)) {
	1226	put_bh(buf);
	1227	return;
	1228	}
	1229	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
	1230	}
	1231	EXPORT_SYMBOL(__brelse);
	1232
	1233	/*
	1234	* bforget() is like brelse(), except it discards any
	1235	* potentially dirty data.
	1236	*/
	1237	void __bforget(struct buffer_head *bh)
	1238	{
	1239	clear_buffer_dirty(bh);
	1240	if (bh->b_assoc_map) {
	1241	struct address_space *buffer_mapping = bh->b_folio->mapping;
	1242
	1243	spin_lock(&buffer_mapping->private_lock);
	1244	list_del_init(&bh->b_assoc_buffers);
	1245	bh->b_assoc_map = NULL;
	1246	spin_unlock(&buffer_mapping->private_lock);
	1247	}
	1248	__brelse(bh);
	1249	}
	1250	EXPORT_SYMBOL(__bforget);
	1251
	1252	static struct buffer_head __bread_slow(struct buffer_head bh)
	1253	{
	1254	lock_buffer(bh);
	1255	if (buffer_uptodate(bh)) {
	1256	unlock_buffer(bh);
	1257	return bh;
	1258	} else {
	1259	get_bh(bh);
	1260	bh->b_end_io = end_buffer_read_sync;
	1261	submit_bh(REQ_OP_READ, bh);
	1262	wait_on_buffer(bh);
	1263	if (buffer_uptodate(bh))
	1264	return bh;
	1265	}
	1266	brelse(bh);
	1267	return NULL;
	1268	}
	1269
	1270	/*
	1271	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
	1272	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
	1273	* refcount elevated by one when they're in an LRU. A buffer can only appear
	1274	* once in a particular CPU's LRU. A single buffer can be present in multiple
	1275	* CPU's LRUs at the same time.
	1276	*
	1277	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
	1278	* sb_find_get_block().
	1279	*
	1280	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
	1281	* a local interrupt disable for that.
	1282	*/
	1283
	1284	#define BH_LRU_SIZE 16
	1285
	1286	struct bh_lru {
	1287	struct buffer_head *bhs[BH_LRU_SIZE];
	1288	};
	1289
	1290	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
	1291
	1292	#ifdef CONFIG_SMP
	1293	#define bh_lru_lock() local_irq_disable()
	1294	#define bh_lru_unlock() local_irq_enable()
	1295	#else
	1296	#define bh_lru_lock() preempt_disable()
	1297	#define bh_lru_unlock() preempt_enable()
	1298	#endif
	1299
	/*
	 * Assert that interrupts are enabled.  Compiles to nothing when the
	 * irqs_disabled macro is not defined.
	 */
	static inline void check_irqs_on(void)
	{
	#ifdef irqs_disabled
		BUG_ON(irqs_disabled());
	#endif
	}
	1306
	1307	/*
	1308	* Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
	1309	* inserted at the front, and the buffer_head at the back if any is evicted.
	1310	* Or, if already in the LRU it is moved to the front.
	1311	*/
	1312	static void bh_lru_install(struct buffer_head *bh)
	1313	{
	1314	struct buffer_head *evictee = bh;
	1315	struct bh_lru *b;
	1316	int i;
	1317
	1318	check_irqs_on();
	1319	bh_lru_lock();
	1320
	1321	/*
	1322	* the refcount of buffer_head in bh_lru prevents dropping the
	1323	* attached page(i.e., try_to_free_buffers) so it could cause
	1324	* failing page migration.
	1325	* Skip putting upcoming bh into bh_lru until migration is done.
	1326	*/
	1327	if (lru_cache_disabled() \|\| cpu_is_isolated(smp_processor_id())) {
	1328	bh_lru_unlock();
	1329	return;
	1330	}
	1331
	1332	b = this_cpu_ptr(&bh_lrus);
	1333	for (i = 0; i < BH_LRU_SIZE; i++) {
	1334	swap(evictee, b->bhs[i]);
	1335	if (evictee == bh) {
	1336	bh_lru_unlock();
	1337	return;
	1338	}
	1339	}
	1340
	1341	get_bh(bh);
	1342	bh_lru_unlock();
	1343	brelse(evictee);
	1344	}
	1345
	1346	/*
	1347	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
	1348	*/
	1349	static struct buffer_head *
	1350	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
	1351	{
	1352	struct buffer_head *ret = NULL;
	1353	unsigned int i;
	1354
	1355	check_irqs_on();
	1356	bh_lru_lock();
	1357	if (cpu_is_isolated(smp_processor_id())) {
	1358	bh_lru_unlock();
	1359	return NULL;
	1360	}
	1361	for (i = 0; i < BH_LRU_SIZE; i++) {
	1362	struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
	1363
	1364	if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
	1365	bh->b_size == size) {
	1366	if (i) {
	1367	while (i) {
	1368	__this_cpu_write(bh_lrus.bhs[i],
	1369	__this_cpu_read(bh_lrus.bhs[i - 1]));
	1370	i--;
	1371	}
	1372	__this_cpu_write(bh_lrus.bhs[0], bh);
	1373	}
	1374	get_bh(bh);
	1375	ret = bh;
	1376	break;
	1377	}
	1378	}
	1379	bh_lru_unlock();
	1380	return ret;
	1381	}
	1382
	1383	/*
	1384	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
	1385	* it in the LRU and mark it as accessed. If it is not present then return
	1386	* NULL
	1387	*/
	1388	struct buffer_head *
	1389	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
	1390	{
	1391	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
	1392
	1393	if (bh == NULL) {
	1394	/* __find_get_block_slow will mark the page accessed */
	1395	bh = __find_get_block_slow(bdev, block);
	1396	if (bh)
	1397	bh_lru_install(bh);
	1398	} else
	1399	touch_buffer(bh);
	1400
	1401	return bh;
	1402	}
	1403	EXPORT_SYMBOL(__find_get_block);
	1404
	1405	/**
	1406	* bdev_getblk - Get a buffer_head in a block device's buffer cache.
	1407	* @bdev: The block device.
	1408	* @block: The block number.
	1409	* @size: The size of buffer_heads for this @bdev.
	1410	* @gfp: The memory allocation flags to use.
	1411	*
	1412	* Return: The buffer head, or NULL if memory could not be allocated.
	1413	*/
	1414	struct buffer_head bdev_getblk(struct block_device bdev, sector_t block,
	1415	unsigned size, gfp_t gfp)
	1416	{
	1417	struct buffer_head *bh = __find_get_block(bdev, block, size);
	1418
	1419	might_alloc(gfp);
	1420	if (bh)
	1421	return bh;
	1422
	1423	return __getblk_slow(bdev, block, size, gfp);
	1424	}
	1425	EXPORT_SYMBOL(bdev_getblk);
	1426
	1427	/*
	1428	* Do async read-ahead on a buffer..
	1429	*/
	1430	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
	1431	{
	1432	struct buffer_head *bh = bdev_getblk(bdev, block, size,
	1433	GFP_NOWAIT \| __GFP_MOVABLE);
	1434
	1435	if (likely(bh)) {
	1436	bh_readahead(bh, REQ_RAHEAD);
	1437	brelse(bh);
	1438	}
	1439	}
	1440	EXPORT_SYMBOL(__breadahead);
	1441
	1442	/**
	1443	* __bread_gfp() - reads a specified block and returns the bh
	1444	* @bdev: the block_device to read from
	1445	* @block: number of block
	1446	* @size: size (in bytes) to read
	1447	* @gfp: page allocation flag
	1448	*
	1449	* Reads a specified block, and returns buffer head that contains it.
	1450	* The page cache can be allocated from non-movable area
	1451	* not to prevent page migration if you set gfp to zero.
	1452	* It returns NULL if the block was unreadable.
	1453	*/
	1454	struct buffer_head *
	1455	__bread_gfp(struct block_device *bdev, sector_t block,
	1456	unsigned size, gfp_t gfp)
	1457	{
	1458	struct buffer_head *bh;
	1459
	1460	gfp \|= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
	1461
	1462	/*
	1463	* Prefer looping in the allocator rather than here, at least that
	1464	* code knows what it's doing.
	1465	*/
	1466	gfp \|= __GFP_NOFAIL;
	1467
	1468	bh = bdev_getblk(bdev, block, size, gfp);
	1469
	1470	if (likely(bh) && !buffer_uptodate(bh))
	1471	bh = __bread_slow(bh);
	1472	return bh;
	1473	}
	1474	EXPORT_SYMBOL(__bread_gfp);
	1475
	1476	static void __invalidate_bh_lrus(struct bh_lru *b)
	1477	{
	1478	int i;
	1479
	1480	for (i = 0; i < BH_LRU_SIZE; i++) {
	1481	brelse(b->bhs[i]);
	1482	b->bhs[i] = NULL;
	1483	}
	1484	}
	1485	/*
	1486	* invalidate_bh_lrus() is called rarely - but not only at unmount.
	1487	* This doesn't race because it runs in each cpu either in irq
	1488	* or with preempt disabled.
	1489	*/
	1490	static void invalidate_bh_lru(void *arg)
	1491	{
	1492	struct bh_lru *b = &get_cpu_var(bh_lrus);
	1493
	1494	__invalidate_bh_lrus(b);
	1495	put_cpu_var(bh_lrus);
	1496	}
	1497
	1498	bool has_bh_in_lru(int cpu, void *dummy)
	1499	{
	1500	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
	1501	int i;
	1502
	1503	for (i = 0; i < BH_LRU_SIZE; i++) {
	1504	if (b->bhs[i])
	1505	return true;
	1506	}
	1507
	1508	return false;
	1509	}
	1510
	1511	void invalidate_bh_lrus(void)
	1512	{
	1513	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
	1514	}
	1515	EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
	1516
	1517	/*
	1518	* It's called from workqueue context so we need a bh_lru_lock to close
	1519	* the race with preemption/irq.
	1520	*/
	1521	void invalidate_bh_lrus_cpu(void)
	1522	{
	1523	struct bh_lru *b;
	1524
	1525	bh_lru_lock();
	1526	b = this_cpu_ptr(&bh_lrus);
	1527	__invalidate_bh_lrus(b);
	1528	bh_lru_unlock();
	1529	}
	1530
	1531	void folio_set_bh(struct buffer_head bh, struct folio folio,
	1532	unsigned long offset)
	1533	{
	1534	bh->b_folio = folio;
	1535	BUG_ON(offset >= folio_size(folio));
	1536	if (folio_test_highmem(folio))
	1537	/*
	1538	* This catches illegal uses and preserves the offset:
	1539	*/
	1540	bh->b_data = (char *)(0 + offset);
	1541	else
	1542	bh->b_data = folio_address(folio) + offset;
	1543	}
	1544	EXPORT_SYMBOL(folio_set_bh);
	1545
	1546	/*
	1547	* Called when truncating a buffer on a page completely.
	1548	*/
	1549
	1550	/* Bits that are cleared during an invalidate */
	1551	#define BUFFER_FLAGS_DISCARD \
	1552	(1 << BH_Mapped \| 1 << BH_New \| 1 << BH_Req \| \
	1553	1 << BH_Delay \| 1 << BH_Unwritten)
	1554
	1555	static void discard_buffer(struct buffer_head * bh)
	1556	{
	1557	unsigned long b_state;
	1558
	1559	lock_buffer(bh);
	1560	clear_buffer_dirty(bh);
	1561	bh->b_bdev = NULL;
	1562	b_state = READ_ONCE(bh->b_state);
	1563	do {
	1564	} while (!try_cmpxchg(&bh->b_state, &b_state,
	1565	b_state & ~BUFFER_FLAGS_DISCARD));
	1566	unlock_buffer(bh);
	1567	}
	1568
	1569	/**
	1570	* block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
	1571	* @folio: The folio which is affected.
	1572	* @offset: start of the range to invalidate
	1573	* @length: length of the range to invalidate
	1574	*
	1575	* block_invalidate_folio() is called when all or part of the folio has been
	1576	* invalidated by a truncate operation.
	1577	*
	1578	* block_invalidate_folio() does not have to release all buffers, but it must
	1579	* ensure that no dirty buffer is left outside @offset and that no I/O
	1580	* is underway against any of the blocks which are outside the truncation
	1581	* point. Because the caller is about to free (and possibly reuse) those
	1582	* blocks on-disk.
	1583	*/
	1584	void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
	1585	{
	1586	struct buffer_head head, bh, *next;
	1587	size_t curr_off = 0;
	1588	size_t stop = length + offset;
	1589
	1590	BUG_ON(!folio_test_locked(folio));
	1591
	1592	/*
	1593	* Check for overflow
	1594	*/
	1595	BUG_ON(stop > folio_size(folio) \|\| stop < length);
	1596
	1597	head = folio_buffers(folio);
	1598	if (!head)
	1599	return;
	1600
	1601	bh = head;
	1602	do {
	1603	size_t next_off = curr_off + bh->b_size;
	1604	next = bh->b_this_page;
	1605
	1606	/*
	1607	* Are we still fully in range ?
	1608	*/
	1609	if (next_off > stop)
	1610	goto out;
	1611
	1612	/*
	1613	* is this block fully invalidated?
	1614	*/
	1615	if (offset <= curr_off)
	1616	discard_buffer(bh);
	1617	curr_off = next_off;
	1618	bh = next;
	1619	} while (bh != head);
	1620
	1621	/*
	1622	* We release buffers only if the entire folio is being invalidated.
	1623	* The get_block cached value has been unconditionally invalidated,
	1624	* so real IO is not possible anymore.
	1625	*/
	1626	if (length == folio_size(folio))
	1627	filemap_release_folio(folio, 0);
	1628	out:
	1629	return;
	1630	}
	1631	EXPORT_SYMBOL(block_invalidate_folio);
	1632
	1633	/*
	1634	* We attach and possibly dirty the buffers atomically wrt
	1635	* block_dirty_folio() via private_lock. try_to_free_buffers
	1636	* is already excluded via the folio lock.
	1637	*/
	1638	struct buffer_head create_empty_buffers(struct folio folio,
	1639	unsigned long blocksize, unsigned long b_state)
	1640	{
	1641	struct buffer_head bh, head, *tail;
	1642	gfp_t gfp = GFP_NOFS \| __GFP_ACCOUNT \| __GFP_NOFAIL;
	1643
	1644	head = folio_alloc_buffers(folio, blocksize, gfp);
	1645	bh = head;
	1646	do {
	1647	bh->b_state \|= b_state;
	1648	tail = bh;
	1649	bh = bh->b_this_page;
	1650	} while (bh);
	1651	tail->b_this_page = head;
	1652
	1653	spin_lock(&folio->mapping->private_lock);
	1654	if (folio_test_uptodate(folio) \|\| folio_test_dirty(folio)) {
	1655	bh = head;
	1656	do {
	1657	if (folio_test_dirty(folio))
	1658	set_buffer_dirty(bh);
	1659	if (folio_test_uptodate(folio))
	1660	set_buffer_uptodate(bh);
	1661	bh = bh->b_this_page;
	1662	} while (bh != head);
	1663	}
	1664	folio_attach_private(folio, head);
	1665	spin_unlock(&folio->mapping->private_lock);
	1666
	1667	return head;
	1668	}
	1669	EXPORT_SYMBOL(create_empty_buffers);
	1670
	1671	/**
	1672	* clean_bdev_aliases: clean a range of buffers in block device
	1673	* @bdev: Block device to clean buffers in
	1674	* @block: Start of a range of blocks to clean
	1675	* @len: Number of blocks to clean
	1676	*
	1677	* We are taking a range of blocks for data and we don't want writeback of any
	1678	* buffer-cache aliases starting from return from this function and until the
	1679	* moment when something will explicitly mark the buffer dirty (hopefully that
	1680	* will not happen until we will free that block ;-) We don't even need to mark
	1681	* it not-uptodate - nobody can expect anything from a newly allocated buffer
	1682	* anyway. We used to use unmap_buffer() for such invalidation, but that was
	1683	* wrong. We definitely don't want to mark the alias unmapped, for example - it
	1684	* would confuse anyone who might pick it with bread() afterwards...
	1685	*
	1686	* Also.. Note that bforget() doesn't lock the buffer. So there can be
	1687	* writeout I/O going on against recently-freed buffers. We don't wait on that
	1688	* I/O in bforget() - it's more efficient to wait on the I/O only if we really
	1689	* need to. That happens here.
	1690	*/
	1691	void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
	1692	{
	1693	struct inode *bd_inode = bdev->bd_inode;
	1694	struct address_space *bd_mapping = bd_inode->i_mapping;
	1695	struct folio_batch fbatch;
	1696	pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
	1697	pgoff_t end;
	1698	int i, count;
	1699	struct buffer_head *bh;
	1700	struct buffer_head *head;
	1701
	1702	end = ((loff_t)(block + len - 1) << bd_inode->i_blkbits) / PAGE_SIZE;
	1703	folio_batch_init(&fbatch);
	1704	while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
	1705	count = folio_batch_count(&fbatch);
	1706	for (i = 0; i < count; i++) {
	1707	struct folio *folio = fbatch.folios[i];
	1708
	1709	if (!folio_buffers(folio))
	1710	continue;
	1711	/*
	1712	* We use folio lock instead of bd_mapping->private_lock
	1713	* to pin buffers here since we can afford to sleep and
	1714	* it scales better than a global spinlock lock.
	1715	*/
	1716	folio_lock(folio);
	1717	/* Recheck when the folio is locked which pins bhs */
	1718	head = folio_buffers(folio);
	1719	if (!head)
	1720	goto unlock_page;
	1721	bh = head;
	1722	do {
	1723	if (!buffer_mapped(bh) \|\| (bh->b_blocknr < block))
	1724	goto next;
	1725	if (bh->b_blocknr >= block + len)
	1726	break;
	1727	clear_buffer_dirty(bh);
	1728	wait_on_buffer(bh);
	1729	clear_buffer_req(bh);
	1730	next:
	1731	bh = bh->b_this_page;
	1732	} while (bh != head);
	1733	unlock_page:
	1734	folio_unlock(folio);
	1735	}
	1736	folio_batch_release(&fbatch);
	1737	cond_resched();
	1738	/* End of range already reached? */
	1739	if (index > end \|\| !index)
	1740	break;
	1741	}
	1742	}
	1743	EXPORT_SYMBOL(clean_bdev_aliases);
	1744
	1745	static struct buffer_head folio_create_buffers(struct folio folio,
	1746	struct inode *inode,
	1747	unsigned int b_state)
	1748	{
	1749	struct buffer_head *bh;
	1750
	1751	BUG_ON(!folio_test_locked(folio));
	1752
	1753	bh = folio_buffers(folio);
	1754	if (!bh)
	1755	bh = create_empty_buffers(folio,
	1756	1 << READ_ONCE(inode->i_blkbits), b_state);
	1757	return bh;
	1758	}
	1759
	1760	/*
	1761	* NOTE! All mapped/uptodate combinations are valid:
	1762	*
	1763	* Mapped Uptodate Meaning
	1764	*
	1765	* No No "unknown" - must do get_block()
	1766	* No Yes "hole" - zero-filled
	1767	* Yes No "allocated" - allocated on disk, not read in
	1768	* Yes Yes "valid" - allocated and up-to-date in memory.
	1769	*
	1770	* "Dirty" is valid only with the last case (mapped+uptodate).
	1771	*/
	1772
	1773	/*
	1774	* While block_write_full_page is writing back the dirty buffers under
	1775	* the page lock, whoever dirtied the buffers may decide to clean them
	1776	* again at any time. We handle that by only looking at the buffer
	1777	* state inside lock_buffer().
	1778	*
	1779	* If block_write_full_page() is called for regular writeback
	1780	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
	1781	* locked buffer. This only can happen if someone has written the buffer
	1782	* directly, with submit_bh(). At the address_space level PageWriteback
	1783	* prevents this contention from occurring.
	1784	*
	1785	* If block_write_full_page() is called with wbc->sync_mode ==
	1786	* WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
	1787	* causes the writes to be flagged as synchronous writes.
	1788	*/
	1789	int __block_write_full_folio(struct inode inode, struct folio folio,
	1790	get_block_t get_block, struct writeback_control wbc,
	1791	bh_end_io_t *handler)
	1792	{
	1793	int err;
	1794	sector_t block;
	1795	sector_t last_block;
	1796	struct buffer_head bh, head;
	1797	size_t blocksize;
	1798	int nr_underway = 0;
	1799	blk_opf_t write_flags = wbc_to_write_flags(wbc);
	1800
	1801	head = folio_create_buffers(folio, inode,
	1802	(1 << BH_Dirty) \| (1 << BH_Uptodate));
	1803
	1804	/*
	1805	* Be very careful. We have no exclusion from block_dirty_folio
	1806	* here, and the (potentially unmapped) buffers may become dirty at
	1807	* any time. If a buffer becomes dirty here after we've inspected it
	1808	* then we just miss that fact, and the folio stays dirty.
	1809	*
	1810	* Buffers outside i_size may be dirtied by block_dirty_folio;
	1811	* handle that here by just cleaning them.
	1812	*/
	1813
	1814	bh = head;
	1815	blocksize = bh->b_size;
	1816
	1817	block = div_u64(folio_pos(folio), blocksize);
	1818	last_block = div_u64(i_size_read(inode) - 1, blocksize);
	1819
	1820	/*
	1821	* Get all the dirty buffers mapped to disk addresses and
	1822	* handle any aliases from the underlying blockdev's mapping.
	1823	*/
	1824	do {
	1825	if (block > last_block) {
	1826	/*
	1827	* mapped buffers outside i_size will occur, because
	1828	* this folio can be outside i_size when there is a
	1829	* truncate in progress.
	1830	*/
	1831	/*
	1832	* The buffer was zeroed by block_write_full_page()
	1833	*/
	1834	clear_buffer_dirty(bh);
	1835	set_buffer_uptodate(bh);
	1836	} else if ((!buffer_mapped(bh) \|\| buffer_delay(bh)) &&
	1837	buffer_dirty(bh)) {
	1838	WARN_ON(bh->b_size != blocksize);
	1839	err = get_block(inode, block, bh, 1);
	1840	if (err)
	1841	goto recover;
	1842	clear_buffer_delay(bh);
	1843	if (buffer_new(bh)) {
	1844	/* blockdev mappings never come here */
	1845	clear_buffer_new(bh);
	1846	clean_bdev_bh_alias(bh);
	1847	}
	1848	}
	1849	bh = bh->b_this_page;
	1850	block++;
	1851	} while (bh != head);
	1852
	1853	do {
	1854	if (!buffer_mapped(bh))
	1855	continue;
	1856	/*
	1857	* If it's a fully non-blocking write attempt and we cannot
	1858	* lock the buffer then redirty the folio. Note that this can
	1859	* potentially cause a busy-wait loop from writeback threads
	1860	* and kswapd activity, but those code paths have their own
	1861	* higher-level throttling.
	1862	*/
	1863	if (wbc->sync_mode != WB_SYNC_NONE) {
	1864	lock_buffer(bh);
	1865	} else if (!trylock_buffer(bh)) {
	1866	folio_redirty_for_writepage(wbc, folio);
	1867	continue;
	1868	}
	1869	if (test_clear_buffer_dirty(bh)) {
	1870	mark_buffer_async_write_endio(bh, handler);
	1871	} else {
	1872	unlock_buffer(bh);
	1873	}
	1874	} while ((bh = bh->b_this_page) != head);
	1875
	1876	/*
	1877	* The folio and its buffers are protected by the writeback flag,
	1878	* so we can drop the bh refcounts early.
	1879	*/
	1880	BUG_ON(folio_test_writeback(folio));
	1881	folio_start_writeback(folio);
	1882
	1883	do {
	1884	struct buffer_head *next = bh->b_this_page;
	1885	if (buffer_async_write(bh)) {
	1886	submit_bh_wbc(REQ_OP_WRITE \| write_flags, bh, wbc);
	1887	nr_underway++;
	1888	}
	1889	bh = next;
	1890	} while (bh != head);
	1891	folio_unlock(folio);
	1892
	1893	err = 0;
	1894	done:
	1895	if (nr_underway == 0) {
	1896	/*
	1897	* The folio was marked dirty, but the buffers were
	1898	* clean. Someone wrote them back by hand with
	1899	* write_dirty_buffer/submit_bh. A rare case.
	1900	*/
	1901	folio_end_writeback(folio);
	1902
	1903	/*
	1904	* The folio and buffer_heads can be released at any time from
	1905	* here on.
	1906	*/
	1907	}
	1908	return err;
	1909
	1910	recover:
	1911	/*
	1912	* ENOSPC, or some other error. We may already have added some
	1913	* blocks to the file, so we need to write these out to avoid
	1914	* exposing stale data.
	1915	* The folio is currently locked and not marked for writeback
	1916	*/
	1917	bh = head;
	1918	/* Recovery: lock and submit the mapped buffers */
	1919	do {
	1920	if (buffer_mapped(bh) && buffer_dirty(bh) &&
	1921	!buffer_delay(bh)) {
	1922	lock_buffer(bh);
	1923	mark_buffer_async_write_endio(bh, handler);
	1924	} else {
	1925	/*
	1926	* The buffer may have been set dirty during
	1927	* attachment to a dirty folio.
	1928	*/
	1929	clear_buffer_dirty(bh);
	1930	}
	1931	} while ((bh = bh->b_this_page) != head);
	1932	folio_set_error(folio);
	1933	BUG_ON(folio_test_writeback(folio));
	1934	mapping_set_error(folio->mapping, err);
	1935	folio_start_writeback(folio);
	1936	do {
	1937	struct buffer_head *next = bh->b_this_page;
	1938	if (buffer_async_write(bh)) {
	1939	clear_buffer_dirty(bh);
	1940	submit_bh_wbc(REQ_OP_WRITE \| write_flags, bh, wbc);
	1941	nr_underway++;
	1942	}
	1943	bh = next;
	1944	} while (bh != head);
	1945	folio_unlock(folio);
	1946	goto done;
	1947	}
	1948	EXPORT_SYMBOL(__block_write_full_folio);
	1949
	1950	/*
	1951	* If a folio has any new buffers, zero them out here, and mark them uptodate
	1952	* and dirty so they'll be written out (in order to prevent uninitialised
	1953	* block data from leaking). And clear the new bit.
	1954	*/
	1955	void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
	1956	{
	1957	size_t block_start, block_end;
	1958	struct buffer_head head, bh;
	1959
	1960	BUG_ON(!folio_test_locked(folio));
	1961	head = folio_buffers(folio);
	1962	if (!head)
	1963	return;
	1964
	1965	bh = head;
	1966	block_start = 0;
	1967	do {
	1968	block_end = block_start + bh->b_size;
	1969
	1970	if (buffer_new(bh)) {
	1971	if (block_end > from && block_start < to) {
	1972	if (!folio_test_uptodate(folio)) {
	1973	size_t start, xend;
	1974
	1975	start = max(from, block_start);
	1976	xend = min(to, block_end);
	1977
	1978	folio_zero_segment(folio, start, xend);
	1979	set_buffer_uptodate(bh);
	1980	}
	1981
	1982	clear_buffer_new(bh);
	1983	mark_buffer_dirty(bh);
	1984	}
	1985	}
	1986
	1987	block_start = block_end;
	1988	bh = bh->b_this_page;
	1989	} while (bh != head);
	1990	}
	1991	EXPORT_SYMBOL(folio_zero_new_buffers);
	1992
	1993	static int
	1994	iomap_to_bh(struct inode inode, sector_t block, struct buffer_head bh,
	1995	const struct iomap *iomap)
	1996	{
	1997	loff_t offset = (loff_t)block << inode->i_blkbits;
	1998
	1999	bh->b_bdev = iomap->bdev;
	2000
	2001	/*
	2002	* Block points to offset in file we need to map, iomap contains
	2003	* the offset at which the map starts. If the map ends before the
	2004	* current block, then do not map the buffer and let the caller
	2005	* handle it.
	2006	*/
	2007	if (offset >= iomap->offset + iomap->length)
	2008	return -EIO;
	2009
	2010	switch (iomap->type) {
	2011	case IOMAP_HOLE:
	2012	/*
	2013	* If the buffer is not up to date or beyond the current EOF,
	2014	* we need to mark it as new to ensure sub-block zeroing is
	2015	* executed if necessary.
	2016	*/
	2017	if (!buffer_uptodate(bh) \|\|
	2018	(offset >= i_size_read(inode)))
	2019	set_buffer_new(bh);
	2020	return 0;
	2021	case IOMAP_DELALLOC:
	2022	if (!buffer_uptodate(bh) \|\|
	2023	(offset >= i_size_read(inode)))
	2024	set_buffer_new(bh);
	2025	set_buffer_uptodate(bh);
	2026	set_buffer_mapped(bh);
	2027	set_buffer_delay(bh);
	2028	return 0;
	2029	case IOMAP_UNWRITTEN:
	2030	/*
	2031	* For unwritten regions, we always need to ensure that regions
	2032	* in the block we are not writing to are zeroed. Mark the
	2033	* buffer as new to ensure this.
	2034	*/
	2035	set_buffer_new(bh);
	2036	set_buffer_unwritten(bh);
	2037	fallthrough;
	2038	case IOMAP_MAPPED:
	2039	if ((iomap->flags & IOMAP_F_NEW) \|\|
	2040	offset >= i_size_read(inode)) {
	2041	/*
	2042	* This can happen if truncating the block device races
	2043	* with the check in the caller as i_size updates on
	2044	* block devices aren't synchronized by i_rwsem for
	2045	* block devices.
	2046	*/
	2047	if (S_ISBLK(inode->i_mode))
	2048	return -EIO;
	2049	set_buffer_new(bh);
	2050	}
	2051	bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
	2052	inode->i_blkbits;
	2053	set_buffer_mapped(bh);
	2054	return 0;
	2055	default:
	2056	WARN_ON_ONCE(1);
	2057	return -EIO;
	2058	}
	2059	}
	2060
	2061	int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
	2062	get_block_t get_block, const struct iomap iomap)
	2063	{
	2064	size_t from = offset_in_folio(folio, pos);
	2065	size_t to = from + len;
	2066	struct inode *inode = folio->mapping->host;
	2067	size_t block_start, block_end;
	2068	sector_t block;
	2069	int err = 0;
	2070	size_t blocksize;
	2071	struct buffer_head bh, head, wait[2], *wait_bh=wait;
	2072
	2073	BUG_ON(!folio_test_locked(folio));
	2074	BUG_ON(to > folio_size(folio));
	2075	BUG_ON(from > to);
	2076
	2077	head = folio_create_buffers(folio, inode, 0);
	2078	blocksize = head->b_size;
	2079	block = div_u64(folio_pos(folio), blocksize);
	2080
	2081	for (bh = head, block_start = 0; bh != head \|\| !block_start;
	2082	block++, block_start=block_end, bh = bh->b_this_page) {
	2083	block_end = block_start + blocksize;
	2084	if (block_end <= from \|\| block_start >= to) {
	2085	if (folio_test_uptodate(folio)) {
	2086	if (!buffer_uptodate(bh))
	2087	set_buffer_uptodate(bh);
	2088	}
	2089	continue;
	2090	}
	2091	if (buffer_new(bh))
	2092	clear_buffer_new(bh);
	2093	if (!buffer_mapped(bh)) {
	2094	WARN_ON(bh->b_size != blocksize);
	2095	if (get_block)
	2096	err = get_block(inode, block, bh, 1);
	2097	else
	2098	err = iomap_to_bh(inode, block, bh, iomap);
	2099	if (err)
	2100	break;
	2101
	2102	if (buffer_new(bh)) {
	2103	clean_bdev_bh_alias(bh);
	2104	if (folio_test_uptodate(folio)) {
	2105	clear_buffer_new(bh);
	2106	set_buffer_uptodate(bh);
	2107	mark_buffer_dirty(bh);
	2108	continue;
	2109	}
	2110	if (block_end > to \|\| block_start < from)
	2111	folio_zero_segments(folio,
	2112	to, block_end,
	2113	block_start, from);
	2114	continue;
	2115	}
	2116	}
	2117	if (folio_test_uptodate(folio)) {
	2118	if (!buffer_uptodate(bh))
	2119	set_buffer_uptodate(bh);
	2120	continue;
	2121	}
	2122	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
	2123	!buffer_unwritten(bh) &&
	2124	(block_start < from \|\| block_end > to)) {
	2125	bh_read_nowait(bh, 0);
	2126	*wait_bh++=bh;
	2127	}
	2128	}
	2129	/*
	2130	* If we issued read requests - let them complete.
	2131	*/
	2132	while(wait_bh > wait) {
	2133	wait_on_buffer(*--wait_bh);
	2134	if (!buffer_uptodate(*wait_bh))
	2135	err = -EIO;
	2136	}
	2137	if (unlikely(err))
	2138	folio_zero_new_buffers(folio, from, to);
	2139	return err;
	2140	}
	2141
	2142	int __block_write_begin(struct page *page, loff_t pos, unsigned len,
	2143	get_block_t *get_block)
	2144	{
	2145	return __block_write_begin_int(page_folio(page), pos, len, get_block,
	2146	NULL);
	2147	}
	2148	EXPORT_SYMBOL(__block_write_begin);
	2149
	2150	static void __block_commit_write(struct folio *folio, size_t from, size_t to)
	2151	{
	2152	size_t block_start, block_end;
	2153	bool partial = false;
	2154	unsigned blocksize;
	2155	struct buffer_head bh, head;
	2156
	2157	bh = head = folio_buffers(folio);
	2158	blocksize = bh->b_size;
	2159
	2160	block_start = 0;
	2161	do {
	2162	block_end = block_start + blocksize;
	2163	if (block_end <= from \|\| block_start >= to) {
	2164	if (!buffer_uptodate(bh))
	2165	partial = true;
	2166	} else {
	2167	set_buffer_uptodate(bh);
	2168	mark_buffer_dirty(bh);
	2169	}
	2170	if (buffer_new(bh))
	2171	clear_buffer_new(bh);
	2172
	2173	block_start = block_end;
	2174	bh = bh->b_this_page;
	2175	} while (bh != head);
	2176
	2177	/*
	2178	* If this is a partial write which happened to make all buffers
	2179	* uptodate then we can optimize away a bogus read_folio() for
	2180	* the next read(). Here we 'discover' whether the folio went
	2181	* uptodate as a result of this (potentially partial) write.
	2182	*/
	2183	if (!partial)
	2184	folio_mark_uptodate(folio);
	2185	}
	2186
	2187	/*
	2188	* block_write_begin takes care of the basic task of block allocation and
	2189	* bringing partial write blocks uptodate first.
	2190	*
	2191	* The filesystem needs to handle block truncation upon failure.
	2192	*/
	2193	int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
	2194	struct page *pagep, get_block_t get_block)
	2195	{
	2196	pgoff_t index = pos >> PAGE_SHIFT;
	2197	struct page *page;
	2198	int status;
	2199
	2200	page = grab_cache_page_write_begin(mapping, index);
	2201	if (!page)
	2202	return -ENOMEM;
	2203
	2204	status = __block_write_begin(page, pos, len, get_block);
	2205	if (unlikely(status)) {
	2206	unlock_page(page);
	2207	put_page(page);
	2208	page = NULL;
	2209	}
	2210
	2211	*pagep = page;
	2212	return status;
	2213	}
	2214	EXPORT_SYMBOL(block_write_begin);
	2215
	2216	int block_write_end(struct file file, struct address_space mapping,
	2217	loff_t pos, unsigned len, unsigned copied,
	2218	struct page page, void fsdata)
	2219	{
	2220	struct folio *folio = page_folio(page);
	2221	size_t start = pos - folio_pos(folio);
	2222
	2223	if (unlikely(copied < len)) {
	2224	/*
	2225	* The buffers that were written will now be uptodate, so
	2226	* we don't have to worry about a read_folio reading them
	2227	* and overwriting a partial write. However if we have
	2228	* encountered a short write and only partially written
	2229	* into a buffer, it will not be marked uptodate, so a
	2230	* read_folio might come in and destroy our partial write.
	2231	*
	2232	* Do the simplest thing, and just treat any short write to a
	2233	* non uptodate folio as a zero-length write, and force the
	2234	* caller to redo the whole thing.
	2235	*/
	2236	if (!folio_test_uptodate(folio))
	2237	copied = 0;
	2238
	2239	folio_zero_new_buffers(folio, start+copied, start+len);
	2240	}
	2241	flush_dcache_folio(folio);
	2242
	2243	/* This could be a short (even 0-length) commit */
	2244	__block_commit_write(folio, start, start + copied);
	2245
	2246	return copied;
	2247	}
	2248	EXPORT_SYMBOL(block_write_end);
	2249
	2250	int generic_write_end(struct file file, struct address_space mapping,
	2251	loff_t pos, unsigned len, unsigned copied,
	2252	struct page page, void fsdata)
	2253	{
	2254	struct inode *inode = mapping->host;
	2255	loff_t old_size = inode->i_size;
	2256	bool i_size_changed = false;
	2257
	2258	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
	2259
	2260	/*
	2261	* No need to use i_size_read() here, the i_size cannot change under us
	2262	* because we hold i_rwsem.
	2263	*
	2264	* But it's important to update i_size while still holding page lock:
	2265	* page writeout could otherwise come in and zero beyond i_size.
	2266	*/
	2267	if (pos + copied > inode->i_size) {
	2268	i_size_write(inode, pos + copied);
	2269	i_size_changed = true;
	2270	}
	2271
	2272	unlock_page(page);
	2273	put_page(page);
	2274
	2275	if (old_size < pos)
	2276	pagecache_isize_extended(inode, old_size, pos);
	2277	/*
	2278	* Don't mark the inode dirty under page lock. First, it unnecessarily
	2279	* makes the holding time of page lock longer. Second, it forces lock
	2280	* ordering of page lock and transaction start for journaling
	2281	* filesystems.
	2282	*/
	2283	if (i_size_changed)
	2284	mark_inode_dirty(inode);
	2285	return copied;
	2286	}
	2287	EXPORT_SYMBOL(generic_write_end);
	2288
	2289	/*
	2290	* block_is_partially_uptodate checks whether buffers within a folio are
	2291	* uptodate or not.
	2292	*
	2293	* Returns true if all buffers which correspond to the specified part
	2294	* of the folio are uptodate.
	2295	*/
	2296	bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
	2297	{
	2298	unsigned block_start, block_end, blocksize;
	2299	unsigned to;
	2300	struct buffer_head bh, head;
	2301	bool ret = true;
	2302
	2303	head = folio_buffers(folio);
	2304	if (!head)
	2305	return false;
	2306	blocksize = head->b_size;
	2307	to = min_t(unsigned, folio_size(folio) - from, count);
	2308	to = from + to;
	2309	if (from < blocksize && to > folio_size(folio) - blocksize)
	2310	return false;
	2311
	2312	bh = head;
	2313	block_start = 0;
	2314	do {
	2315	block_end = block_start + blocksize;
	2316	if (block_end > from && block_start < to) {
	2317	if (!buffer_uptodate(bh)) {
	2318	ret = false;
	2319	break;
	2320	}
	2321	if (block_end >= to)
	2322	break;
	2323	}
	2324	block_start = block_end;
	2325	bh = bh->b_this_page;
	2326	} while (bh != head);
	2327
	2328	return ret;
	2329	}
	2330	EXPORT_SYMBOL(block_is_partially_uptodate);
	2331
	2332	/*
	2333	* Generic "read_folio" function for block devices that have the normal
	2334	* get_block functionality. This is most of the block device filesystems.
	2335	* Reads the folio asynchronously --- the unlock_buffer() and
	2336	* set/clear_buffer_uptodate() functions propagate buffer state into the
	2337	* folio once IO has completed.
	2338	*/
	2339	int block_read_full_folio(struct folio folio, get_block_t get_block)
	2340	{
	2341	struct inode *inode = folio->mapping->host;
	2342	sector_t iblock, lblock;
	2343	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
	2344	size_t blocksize;
	2345	int nr, i;
	2346	int fully_mapped = 1;
	2347	bool page_error = false;
	2348	loff_t limit = i_size_read(inode);
	2349
	2350	/* This is needed for ext4. */
	2351	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
	2352	limit = inode->i_sb->s_maxbytes;
	2353
	2354	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
	2355
	2356	head = folio_create_buffers(folio, inode, 0);
	2357	blocksize = head->b_size;
	2358
	2359	iblock = div_u64(folio_pos(folio), blocksize);
	2360	lblock = div_u64(limit + blocksize - 1, blocksize);
	2361	bh = head;
	2362	nr = 0;
	2363	i = 0;
	2364
	2365	do {
	2366	if (buffer_uptodate(bh))
	2367	continue;
	2368
	2369	if (!buffer_mapped(bh)) {
	2370	int err = 0;
	2371
	2372	fully_mapped = 0;
	2373	if (iblock < lblock) {
	2374	WARN_ON(bh->b_size != blocksize);
	2375	err = get_block(inode, iblock, bh, 0);
	2376	if (err) {
	2377	folio_set_error(folio);
	2378	page_error = true;
	2379	}
	2380	}
	2381	if (!buffer_mapped(bh)) {
	2382	folio_zero_range(folio, i * blocksize,
	2383	blocksize);
	2384	if (!err)
	2385	set_buffer_uptodate(bh);
	2386	continue;
	2387	}
	2388	/*
	2389	* get_block() might have updated the buffer
	2390	* synchronously
	2391	*/
	2392	if (buffer_uptodate(bh))
	2393	continue;
	2394	}
	2395	arr[nr++] = bh;
	2396	} while (i++, iblock++, (bh = bh->b_this_page) != head);
	2397
	2398	if (fully_mapped)
	2399	folio_set_mappedtodisk(folio);
	2400
	2401	if (!nr) {
	2402	/*
	2403	* All buffers are uptodate or get_block() returned an
	2404	* error when trying to map them - we can finish the read.
	2405	*/
	2406	folio_end_read(folio, !page_error);
	2407	return 0;
	2408	}
	2409
	2410	/* Stage two: lock the buffers */
	2411	for (i = 0; i < nr; i++) {
	2412	bh = arr[i];
	2413	lock_buffer(bh);
	2414	mark_buffer_async_read(bh);
	2415	}
	2416
	2417	/*
	2418	* Stage 3: start the IO. Check for uptodateness
	2419	* inside the buffer lock in case another process reading
	2420	* the underlying blockdev brought it uptodate (the sct fix).
	2421	*/
	2422	for (i = 0; i < nr; i++) {
	2423	bh = arr[i];
	2424	if (buffer_uptodate(bh))
	2425	end_buffer_async_read(bh, 1);
	2426	else
	2427	submit_bh(REQ_OP_READ, bh);
	2428	}
	2429	return 0;
	2430	}
	2431	EXPORT_SYMBOL(block_read_full_folio);
	2432
	2433	/* utility function for filesystems that need to do work on expanding
	2434	* truncates. Uses filesystem pagecache writes to allow the filesystem to
	2435	* deal with the hole.
	2436	*/
	2437	int generic_cont_expand_simple(struct inode *inode, loff_t size)
	2438	{
	2439	struct address_space *mapping = inode->i_mapping;
	2440	const struct address_space_operations *aops = mapping->a_ops;
	2441	struct page *page;
	2442	void *fsdata = NULL;
	2443	int err;
	2444
	2445	err = inode_newsize_ok(inode, size);
	2446	if (err)
	2447	goto out;
	2448
	2449	err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
	2450	if (err)
	2451	goto out;
	2452
	2453	err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
	2454	BUG_ON(err > 0);
	2455
	2456	out:
	2457	return err;
	2458	}
	2459	EXPORT_SYMBOL(generic_cont_expand_simple);
	2460
	2461	static int cont_expand_zero(struct file file, struct address_space mapping,
	2462	loff_t pos, loff_t *bytes)
	2463	{
	2464	struct inode *inode = mapping->host;
	2465	const struct address_space_operations *aops = mapping->a_ops;
	2466	unsigned int blocksize = i_blocksize(inode);
	2467	struct page *page;
	2468	void *fsdata = NULL;
	2469	pgoff_t index, curidx;
	2470	loff_t curpos;
	2471	unsigned zerofrom, offset, len;
	2472	int err = 0;
	2473
	2474	index = pos >> PAGE_SHIFT;
	2475	offset = pos & ~PAGE_MASK;
	2476
	2477	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
	2478	zerofrom = curpos & ~PAGE_MASK;
	2479	if (zerofrom & (blocksize-1)) {
	2480	*bytes \|= (blocksize-1);
	2481	(*bytes)++;
	2482	}
	2483	len = PAGE_SIZE - zerofrom;
	2484
	2485	err = aops->write_begin(file, mapping, curpos, len,
	2486	&page, &fsdata);
	2487	if (err)
	2488	goto out;
	2489	zero_user(page, zerofrom, len);
	2490	err = aops->write_end(file, mapping, curpos, len, len,
	2491	page, fsdata);
	2492	if (err < 0)
	2493	goto out;
	2494	BUG_ON(err != len);
	2495	err = 0;
	2496
	2497	balance_dirty_pages_ratelimited(mapping);
	2498
	2499	if (fatal_signal_pending(current)) {
	2500	err = -EINTR;
	2501	goto out;
	2502	}
	2503	}
	2504
	2505	/* page covers the boundary, find the boundary offset */
	2506	if (index == curidx) {
	2507	zerofrom = curpos & ~PAGE_MASK;
	2508	/* if we will expand the thing last block will be filled */
	2509	if (offset <= zerofrom) {
	2510	goto out;
	2511	}
	2512	if (zerofrom & (blocksize-1)) {
	2513	*bytes \|= (blocksize-1);
	2514	(*bytes)++;
	2515	}
	2516	len = offset - zerofrom;
	2517
	2518	err = aops->write_begin(file, mapping, curpos, len,
	2519	&page, &fsdata);
	2520	if (err)
	2521	goto out;
	2522	zero_user(page, zerofrom, len);
	2523	err = aops->write_end(file, mapping, curpos, len, len,
	2524	page, fsdata);
	2525	if (err < 0)
	2526	goto out;
	2527	BUG_ON(err != len);
	2528	err = 0;
	2529	}
	2530	out:
	2531	return err;
	2532	}
	2533
	2534	/*
	2535	* For moronic filesystems that do not allow holes in file.
	2536	* We may have to extend the file.
	2537	*/
	2538	int cont_write_begin(struct file file, struct address_space mapping,
	2539	loff_t pos, unsigned len,
	2540	struct page pagep, void fsdata,
	2541	get_block_t get_block, loff_t bytes)
	2542	{
	2543	struct inode *inode = mapping->host;
	2544	unsigned int blocksize = i_blocksize(inode);
	2545	unsigned int zerofrom;
	2546	int err;
	2547
	2548	err = cont_expand_zero(file, mapping, pos, bytes);
	2549	if (err)
	2550	return err;
	2551
	2552	zerofrom = *bytes & ~PAGE_MASK;
	2553	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
	2554	*bytes \|= (blocksize-1);
	2555	(*bytes)++;
	2556	}
	2557
	2558	return block_write_begin(mapping, pos, len, pagep, get_block);
	2559	}
	2560	EXPORT_SYMBOL(cont_write_begin);
	2561
	2562	void block_commit_write(struct page *page, unsigned from, unsigned to)
	2563	{
	2564	struct folio *folio = page_folio(page);
	2565	__block_commit_write(folio, from, to);
	2566	}
	2567	EXPORT_SYMBOL(block_commit_write);
	2568
	2569	/*
	2570	* block_page_mkwrite() is not allowed to change the file size as it gets
	2571	* called from a page fault handler when a page is first dirtied. Hence we must
	2572	* be careful to check for EOF conditions here. We set the page up correctly
	2573	* for a written page which means we get ENOSPC checking when writing into
	2574	* holes and correct delalloc and unwritten extent mapping on filesystems that
	2575	* support these features.
	2576	*
	2577	* We are not allowed to take the i_mutex here so we have to play games to
	2578	* protect against truncate races as the page could now be beyond EOF. Because
	2579	* truncate writes the inode size before removing pages, once we have the
	2580	* page lock we can determine safely if the page is beyond EOF. If it is not
	2581	* beyond EOF, then the page is guaranteed safe against truncation until we
	2582	* unlock the page.
	2583	*
	2584	* Direct callers of this function should protect against filesystem freezing
	2585	* using sb_start_pagefault() - sb_end_pagefault() functions.
	2586	*/
	2587	int block_page_mkwrite(struct vm_area_struct vma, struct vm_fault vmf,
	2588	get_block_t get_block)
	2589	{
	2590	struct folio *folio = page_folio(vmf->page);
	2591	struct inode *inode = file_inode(vma->vm_file);
	2592	unsigned long end;
	2593	loff_t size;
	2594	int ret;
	2595
	2596	folio_lock(folio);
	2597	size = i_size_read(inode);
	2598	if ((folio->mapping != inode->i_mapping) \|\|
	2599	(folio_pos(folio) >= size)) {
	2600	/* We overload EFAULT to mean page got truncated */
	2601	ret = -EFAULT;
	2602	goto out_unlock;
	2603	}
	2604
	2605	end = folio_size(folio);
	2606	/* folio is wholly or partially inside EOF */
	2607	if (folio_pos(folio) + end > size)
	2608	end = size - folio_pos(folio);
	2609
	2610	ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
	2611	if (unlikely(ret))
	2612	goto out_unlock;
	2613
	2614	__block_commit_write(folio, 0, end);
	2615
	2616	folio_mark_dirty(folio);
	2617	folio_wait_stable(folio);
	2618	return 0;
	2619	out_unlock:
	2620	folio_unlock(folio);
	2621	return ret;
	2622	}
	2623	EXPORT_SYMBOL(block_page_mkwrite);
	2624
	2625	int block_truncate_page(struct address_space *mapping,
	2626	loff_t from, get_block_t *get_block)
	2627	{
	2628	pgoff_t index = from >> PAGE_SHIFT;
	2629	unsigned blocksize;
	2630	sector_t iblock;
	2631	size_t offset, length, pos;
	2632	struct inode *inode = mapping->host;
	2633	struct folio *folio;
	2634	struct buffer_head *bh;
	2635	int err = 0;
	2636
	2637	blocksize = i_blocksize(inode);
	2638	length = from & (blocksize - 1);
	2639
	2640	/* Block boundary? Nothing to do */
	2641	if (!length)
	2642	return 0;
	2643
	2644	length = blocksize - length;
	2645	iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
	2646
	2647	folio = filemap_grab_folio(mapping, index);
	2648	if (IS_ERR(folio))
	2649	return PTR_ERR(folio);
	2650
	2651	bh = folio_buffers(folio);
	2652	if (!bh)
	2653	bh = create_empty_buffers(folio, blocksize, 0);
	2654
	2655	/* Find the buffer that contains "offset" */
	2656	offset = offset_in_folio(folio, from);
	2657	pos = blocksize;
	2658	while (offset >= pos) {
	2659	bh = bh->b_this_page;
	2660	iblock++;
	2661	pos += blocksize;
	2662	}
	2663
	2664	if (!buffer_mapped(bh)) {
	2665	WARN_ON(bh->b_size != blocksize);
	2666	err = get_block(inode, iblock, bh, 0);
	2667	if (err)
	2668	goto unlock;
	2669	/* unmapped? It's a hole - nothing to do */
	2670	if (!buffer_mapped(bh))
	2671	goto unlock;
	2672	}
	2673
	2674	/* Ok, it's mapped. Make sure it's up-to-date */
	2675	if (folio_test_uptodate(folio))
	2676	set_buffer_uptodate(bh);
	2677
	2678	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
	2679	err = bh_read(bh, 0);
	2680	/* Uhhuh. Read error. Complain and punt. */
	2681	if (err < 0)
	2682	goto unlock;
	2683	}
	2684
	2685	folio_zero_range(folio, offset, length);
	2686	mark_buffer_dirty(bh);
	2687
	2688	unlock:
	2689	folio_unlock(folio);
	2690	folio_put(folio);
	2691
	2692	return err;
	2693	}
	2694	EXPORT_SYMBOL(block_truncate_page);
	2695
	2696	/*
	2697	* The generic ->writepage function for buffer-backed address_spaces
	2698	*/
	2699	int block_write_full_page(struct page page, get_block_t get_block,
	2700	struct writeback_control *wbc)
	2701	{
	2702	struct folio *folio = page_folio(page);
	2703	struct inode * const inode = folio->mapping->host;
	2704	loff_t i_size = i_size_read(inode);
	2705
	2706	/* Is the folio fully inside i_size? */
	2707	if (folio_pos(folio) + folio_size(folio) <= i_size)
	2708	return __block_write_full_folio(inode, folio, get_block, wbc,
	2709	end_buffer_async_write);
	2710
	2711	/* Is the folio fully outside i_size? (truncate in progress) */
	2712	if (folio_pos(folio) >= i_size) {
	2713	folio_unlock(folio);
	2714	return 0; /* don't care */
	2715	}
	2716
	2717	/*
	2718	* The folio straddles i_size. It must be zeroed out on each and every
	2719	* writepage invocation because it may be mmapped. "A file is mapped
	2720	* in multiples of the page size. For a file that is not a multiple of
	2721	* the page size, the remaining memory is zeroed when mapped, and
	2722	* writes to that region are not written out to the file."
	2723	*/
	2724	folio_zero_segment(folio, offset_in_folio(folio, i_size),
	2725	folio_size(folio));
	2726	return __block_write_full_folio(inode, folio, get_block, wbc,
	2727	end_buffer_async_write);
	2728	}
	2729	EXPORT_SYMBOL(block_write_full_page);
	2730
	2731	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
	2732	get_block_t *get_block)
	2733	{
	2734	struct inode *inode = mapping->host;
	2735	struct buffer_head tmp = {
	2736	.b_size = i_blocksize(inode),
	2737	};
	2738
	2739	get_block(inode, block, &tmp, 0);
	2740	return tmp.b_blocknr;
	2741	}
	2742	EXPORT_SYMBOL(generic_block_bmap);
	2743
	2744	static void end_bio_bh_io_sync(struct bio *bio)
	2745	{
	2746	struct buffer_head *bh = bio->bi_private;
	2747
	2748	if (unlikely(bio_flagged(bio, BIO_QUIET)))
	2749	set_bit(BH_Quiet, &bh->b_state);
	2750
	2751	bh->b_end_io(bh, !bio->bi_status);
	2752	bio_put(bio);
	2753	}
	2754
	2755	static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
	2756	struct writeback_control *wbc)
	2757	{
	2758	const enum req_op op = opf & REQ_OP_MASK;
	2759	struct bio *bio;
	2760
	2761	BUG_ON(!buffer_locked(bh));
	2762	BUG_ON(!buffer_mapped(bh));
	2763	BUG_ON(!bh->b_end_io);
	2764	BUG_ON(buffer_delay(bh));
	2765	BUG_ON(buffer_unwritten(bh));
	2766
	2767	/*
	2768	* Only clear out a write error when rewriting
	2769	*/
	2770	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
	2771	clear_buffer_write_io_error(bh);
	2772
	2773	if (buffer_meta(bh))
	2774	opf \|= REQ_META;
	2775	if (buffer_prio(bh))
	2776	opf \|= REQ_PRIO;
	2777
	2778	bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
	2779
	2780	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
	2781
	2782	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	2783
	2784	__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
	2785
	2786	bio->bi_end_io = end_bio_bh_io_sync;
	2787	bio->bi_private = bh;
	2788
	2789	/* Take care of bh's that straddle the end of the device */
	2790	guard_bio_eod(bio);
	2791
	2792	if (wbc) {
	2793	wbc_init_bio(wbc, bio);
	2794	wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
	2795	}
	2796
	2797	submit_bio(bio);
	2798	}
	2799
	2800	void submit_bh(blk_opf_t opf, struct buffer_head *bh)
	2801	{
	2802	submit_bh_wbc(opf, bh, NULL);
	2803	}
	2804	EXPORT_SYMBOL(submit_bh);
	2805
	2806	void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
	2807	{
	2808	lock_buffer(bh);
	2809	if (!test_clear_buffer_dirty(bh)) {
	2810	unlock_buffer(bh);
	2811	return;
	2812	}
	2813	bh->b_end_io = end_buffer_write_sync;
	2814	get_bh(bh);
	2815	submit_bh(REQ_OP_WRITE \| op_flags, bh);
	2816	}
	2817	EXPORT_SYMBOL(write_dirty_buffer);
	2818
	2819	/*
	2820	* For a data-integrity writeout, we need to wait upon any in-progress I/O
	2821	* and then start new I/O and then wait upon it. The caller must have a ref on
	2822	* the buffer_head.
	2823	*/
	2824	int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
	2825	{
	2826	WARN_ON(atomic_read(&bh->b_count) < 1);
	2827	lock_buffer(bh);
	2828	if (test_clear_buffer_dirty(bh)) {
	2829	/*
	2830	* The bh should be mapped, but it might not be if the
	2831	* device was hot-removed. Not much we can do but fail the I/O.
	2832	*/
	2833	if (!buffer_mapped(bh)) {
	2834	unlock_buffer(bh);
	2835	return -EIO;
	2836	}
	2837
	2838	get_bh(bh);
	2839	bh->b_end_io = end_buffer_write_sync;
	2840	submit_bh(REQ_OP_WRITE \| op_flags, bh);
	2841	wait_on_buffer(bh);
	2842	if (!buffer_uptodate(bh))
	2843	return -EIO;
	2844	} else {
	2845	unlock_buffer(bh);
	2846	}
	2847	return 0;
	2848	}
	2849	EXPORT_SYMBOL(__sync_dirty_buffer);
	2850
	2851	int sync_dirty_buffer(struct buffer_head *bh)
	2852	{
	2853	return __sync_dirty_buffer(bh, REQ_SYNC);
	2854	}
	2855	EXPORT_SYMBOL(sync_dirty_buffer);
	2856
	2857	/*
	2858	* try_to_free_buffers() checks if all the buffers on this particular folio
	2859	* are unused, and releases them if so.
	2860	*
	2861	* Exclusion against try_to_free_buffers may be obtained by either
	2862	* locking the folio or by holding its mapping's private_lock.
	2863	*
	2864	* If the folio is dirty but all the buffers are clean then we need to
	2865	* be sure to mark the folio clean as well. This is because the folio
	2866	* may be against a block device, and a later reattachment of buffers
	2867	* to a dirty folio will set all buffers dirty. Which would corrupt
	2868	* filesystem data on the same device.
	2869	*
	2870	* The same applies to regular filesystem folios: if all the buffers are
	2871	* clean then we set the folio clean and proceed. To do that, we require
	2872	* total exclusion from block_dirty_folio(). That is obtained with
	2873	* private_lock.
	2874	*
	2875	* try_to_free_buffers() is non-blocking.
	2876	*/
	2877	static inline int buffer_busy(struct buffer_head *bh)
	2878	{
	2879	return atomic_read(&bh->b_count) \|
	2880	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
	2881	}
	2882
	2883	static bool
	2884	drop_buffers(struct folio folio, struct buffer_head *buffers_to_free)
	2885	{
	2886	struct buffer_head *head = folio_buffers(folio);
	2887	struct buffer_head *bh;
	2888
	2889	bh = head;
	2890	do {
	2891	if (buffer_busy(bh))
	2892	goto failed;
	2893	bh = bh->b_this_page;
	2894	} while (bh != head);
	2895
	2896	do {
	2897	struct buffer_head *next = bh->b_this_page;
	2898
	2899	if (bh->b_assoc_map)
	2900	__remove_assoc_queue(bh);
	2901	bh = next;
	2902	} while (bh != head);
	2903	*buffers_to_free = head;
	2904	folio_detach_private(folio);
	2905	return true;
	2906	failed:
	2907	return false;
	2908	}
	2909
	2910	bool try_to_free_buffers(struct folio *folio)
	2911	{
	2912	struct address_space * const mapping = folio->mapping;
	2913	struct buffer_head *buffers_to_free = NULL;
	2914	bool ret = 0;
	2915
	2916	BUG_ON(!folio_test_locked(folio));
	2917	if (folio_test_writeback(folio))
	2918	return false;
	2919
	2920	if (mapping == NULL) { /* can this still happen? */
	2921	ret = drop_buffers(folio, &buffers_to_free);
	2922	goto out;
	2923	}
	2924
	2925	spin_lock(&mapping->private_lock);
	2926	ret = drop_buffers(folio, &buffers_to_free);
	2927
	2928	/*
	2929	* If the filesystem writes its buffers by hand (eg ext3)
	2930	* then we can have clean buffers against a dirty folio. We
	2931	* clean the folio here; otherwise the VM will never notice
	2932	* that the filesystem did any IO at all.
	2933	*
	2934	* Also, during truncate, discard_buffer will have marked all
	2935	* the folio's buffers clean. We discover that here and clean
	2936	* the folio also.
	2937	*
	2938	* private_lock must be held over this entire operation in order
	2939	* to synchronise against block_dirty_folio and prevent the
	2940	* dirty bit from being lost.
	2941	*/
	2942	if (ret)
	2943	folio_cancel_dirty(folio);
	2944	spin_unlock(&mapping->private_lock);
	2945	out:
	2946	if (buffers_to_free) {
	2947	struct buffer_head *bh = buffers_to_free;
	2948
	2949	do {
	2950	struct buffer_head *next = bh->b_this_page;
	2951	free_buffer_head(bh);
	2952	bh = next;
	2953	} while (bh != buffers_to_free);
	2954	}
	2955	return ret;
	2956	}
	2957	EXPORT_SYMBOL(try_to_free_buffers);
	2958
	2959	/*
	2960	* Buffer-head allocation
	2961	*/
	2962	static struct kmem_cache *bh_cachep __ro_after_init;
	2963
	2964	/*
	2965	* Once the number of bh's in the machine exceeds this level, we start
	2966	* stripping them in writeback.
	2967	*/
	2968	static unsigned long max_buffer_heads __ro_after_init;
	2969
	2970	int buffer_heads_over_limit;
	2971
	2972	struct bh_accounting {
	2973	int nr; /* Number of live bh's */
	2974	int ratelimit; /* Limit cacheline bouncing */
	2975	};
	2976
	2977	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
	2978
	2979	static void recalc_bh_state(void)
	2980	{
	2981	int i;
	2982	int tot = 0;
	2983
	2984	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
	2985	return;
	2986	__this_cpu_write(bh_accounting.ratelimit, 0);
	2987	for_each_online_cpu(i)
	2988	tot += per_cpu(bh_accounting, i).nr;
	2989	buffer_heads_over_limit = (tot > max_buffer_heads);
	2990	}
	2991
	2992	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
	2993	{
	2994	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
	2995	if (ret) {
	2996	INIT_LIST_HEAD(&ret->b_assoc_buffers);
	2997	spin_lock_init(&ret->b_uptodate_lock);
	2998	preempt_disable();
	2999	__this_cpu_inc(bh_accounting.nr);
	3000	recalc_bh_state();
	3001	preempt_enable();
	3002	}
	3003	return ret;
	3004	}
	3005	EXPORT_SYMBOL(alloc_buffer_head);
	3006
	3007	void free_buffer_head(struct buffer_head *bh)
	3008	{
	3009	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	3010	kmem_cache_free(bh_cachep, bh);
	3011	preempt_disable();
	3012	__this_cpu_dec(bh_accounting.nr);
	3013	recalc_bh_state();
	3014	preempt_enable();
	3015	}
	3016	EXPORT_SYMBOL(free_buffer_head);
	3017
	3018	static int buffer_exit_cpu_dead(unsigned int cpu)
	3019	{
	3020	int i;
	3021	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
	3022
	3023	for (i = 0; i < BH_LRU_SIZE; i++) {
	3024	brelse(b->bhs[i]);
	3025	b->bhs[i] = NULL;
	3026	}
	3027	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
	3028	per_cpu(bh_accounting, cpu).nr = 0;
	3029	return 0;
	3030	}
	3031
	3032	/**
	3033	* bh_uptodate_or_lock - Test whether the buffer is uptodate
	3034	* @bh: struct buffer_head
	3035	*
	3036	* Return true if the buffer is up-to-date and false,
	3037	* with the buffer locked, if not.
	3038	*/
	3039	int bh_uptodate_or_lock(struct buffer_head *bh)
	3040	{
	3041	if (!buffer_uptodate(bh)) {
	3042	lock_buffer(bh);
	3043	if (!buffer_uptodate(bh))
	3044	return 0;
	3045	unlock_buffer(bh);
	3046	}
	3047	return 1;
	3048	}
	3049	EXPORT_SYMBOL(bh_uptodate_or_lock);
	3050
	3051	/**
	3052	* __bh_read - Submit read for a locked buffer
	3053	* @bh: struct buffer_head
	3054	* @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
	3055	* @wait: wait until reading finish
	3056	*
	3057	* Returns zero on success or don't wait, and -EIO on error.
	3058	*/
	3059	int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
	3060	{
	3061	int ret = 0;
	3062
	3063	BUG_ON(!buffer_locked(bh));
	3064
	3065	get_bh(bh);
	3066	bh->b_end_io = end_buffer_read_sync;
	3067	submit_bh(REQ_OP_READ \| op_flags, bh);
	3068	if (wait) {
	3069	wait_on_buffer(bh);
	3070	if (!buffer_uptodate(bh))
	3071	ret = -EIO;
	3072	}
	3073	return ret;
	3074	}
	3075	EXPORT_SYMBOL(__bh_read);
	3076
	3077	/**
	3078	* __bh_read_batch - Submit read for a batch of unlocked buffers
	3079	* @nr: entry number of the buffer batch
	3080	* @bhs: a batch of struct buffer_head
	3081	* @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
	3082	* @force_lock: force to get a lock on the buffer if set, otherwise drops any
	3083	* buffer that cannot lock.
	3084	*
	3085	* Returns zero on success or don't wait, and -EIO on error.
	3086	*/
	3087	void __bh_read_batch(int nr, struct buffer_head *bhs[],
	3088	blk_opf_t op_flags, bool force_lock)
	3089	{
	3090	int i;
	3091
	3092	for (i = 0; i < nr; i++) {
	3093	struct buffer_head *bh = bhs[i];
	3094
	3095	if (buffer_uptodate(bh))
	3096	continue;
	3097
	3098	if (force_lock)
	3099	lock_buffer(bh);
	3100	else
	3101	if (!trylock_buffer(bh))
	3102	continue;
	3103
	3104	if (buffer_uptodate(bh)) {
	3105	unlock_buffer(bh);
	3106	continue;
	3107	}
	3108
	3109	bh->b_end_io = end_buffer_read_sync;
	3110	get_bh(bh);
	3111	submit_bh(REQ_OP_READ \| op_flags, bh);
	3112	}
	3113	}
	3114	EXPORT_SYMBOL(__bh_read_batch);
	3115
	3116	void __init buffer_init(void)
	3117	{
	3118	unsigned long nrpages;
	3119	int ret;
	3120
	3121	bh_cachep = kmem_cache_create("buffer_head",
	3122	sizeof(struct buffer_head), 0,
	3123	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
	3124	SLAB_MEM_SPREAD),
	3125	NULL);
	3126
	3127	/*
	3128	* Limit the bh occupancy to 10% of ZONE_NORMAL
	3129	*/
	3130	nrpages = (nr_free_buffer_pages() * 10) / 100;
	3131	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
	3132	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
	3133	NULL, buffer_exit_cpu_dead);
	3134	WARN_ON(ret < 0);
	3135	}