Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* -*- mode: c; c-basic-offset: 8; -*-
	2	* vim: noexpandtab sw=8 ts=8 sts=0:
	3	*
	4	* file.c
	5	*
	6	* File open, close, extend, truncate
	7	*
	8	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
	9	*
	10	* This program is free software; you can redistribute it and/or
	11	* modify it under the terms of the GNU General Public
	12	* License as published by the Free Software Foundation; either
	13	* version 2 of the License, or (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	18	* General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public
	21	* License along with this program; if not, write to the
	22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	23	* Boston, MA 02111-1307, USA.
	24	*/
	25
	26	#include <linux/capability.h>
	27	#include <linux/fs.h>
	28	#include <linux/types.h>
	29	#include <linux/slab.h>
	30	#include <linux/highmem.h>
	31	#include <linux/pagemap.h>
	32	#include <linux/uio.h>
	33	#include <linux/sched.h>
	34	#include <linux/splice.h>
	35	#include <linux/mount.h>
	36	#include <linux/writeback.h>
	37	#include <linux/falloc.h>
	38	#include <linux/quotaops.h>
	39	#include <linux/blkdev.h>
	40
	41	#include <cluster/masklog.h>
	42
	43	#include "ocfs2.h"
	44
	45	#include "alloc.h"
	46	#include "aops.h"
	47	#include "dir.h"
	48	#include "dlmglue.h"
	49	#include "extent_map.h"
	50	#include "file.h"
	51	#include "sysfile.h"
	52	#include "inode.h"
	53	#include "ioctl.h"
	54	#include "journal.h"
	55	#include "locks.h"
	56	#include "mmap.h"
	57	#include "suballoc.h"
	58	#include "super.h"
	59	#include "xattr.h"
	60	#include "acl.h"
	61	#include "quota.h"
	62	#include "refcounttree.h"
	63	#include "ocfs2_trace.h"
	64
	65	#include "buffer_head_io.h"
	66
	67	static int ocfs2_init_file_private(struct inode inode, struct file file)
	68	{
	69	struct ocfs2_file_private *fp;
	70
	71	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
	72	if (!fp)
	73	return -ENOMEM;
	74
	75	fp->fp_file = file;
	76	mutex_init(&fp->fp_mutex);
	77	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
	78	file->private_data = fp;
	79
	80	return 0;
	81	}
	82
	83	static void ocfs2_free_file_private(struct inode inode, struct file file)
	84	{
	85	struct ocfs2_file_private *fp = file->private_data;
	86	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	87
	88	if (fp) {
	89	ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
	90	ocfs2_lock_res_free(&fp->fp_flock);
	91	kfree(fp);
	92	file->private_data = NULL;
	93	}
	94	}
	95
	96	static int ocfs2_file_open(struct inode inode, struct file file)
	97	{
	98	int status;
	99	int mode = file->f_flags;
	100	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	101
	102	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
	103	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	104	file->f_path.dentry->d_name.len,
	105	file->f_path.dentry->d_name.name, mode);
	106
	107	if (file->f_mode & FMODE_WRITE)
	108	dquot_initialize(inode);
	109
	110	spin_lock(&oi->ip_lock);
	111
	112	/* Check that the inode hasn't been wiped from disk by another
	113	* node. If it hasn't then we're safe as long as we hold the
	114	* spin lock until our increment of open count. */
	115	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
	116	spin_unlock(&oi->ip_lock);
	117
	118	status = -ENOENT;
	119	goto leave;
	120	}
	121
	122	if (mode & O_DIRECT)
	123	oi->ip_flags \|= OCFS2_INODE_OPEN_DIRECT;
	124
	125	oi->ip_open_count++;
	126	spin_unlock(&oi->ip_lock);
	127
	128	status = ocfs2_init_file_private(inode, file);
	129	if (status) {
	130	/*
	131	* We want to set open count back if we're failing the
	132	* open.
	133	*/
	134	spin_lock(&oi->ip_lock);
	135	oi->ip_open_count--;
	136	spin_unlock(&oi->ip_lock);
	137	}
	138
	139	leave:
	140	return status;
	141	}
	142
	143	static int ocfs2_file_release(struct inode inode, struct file file)
	144	{
	145	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	146
	147	spin_lock(&oi->ip_lock);
	148	if (!--oi->ip_open_count)
	149	oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
	150
	151	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
	152	oi->ip_blkno,
	153	file->f_path.dentry->d_name.len,
	154	file->f_path.dentry->d_name.name,
	155	oi->ip_open_count);
	156	spin_unlock(&oi->ip_lock);
	157
	158	ocfs2_free_file_private(inode, file);
	159
	160	return 0;
	161	}
	162
	163	static int ocfs2_dir_open(struct inode inode, struct file file)
	164	{
	165	return ocfs2_init_file_private(inode, file);
	166	}
	167
	168	static int ocfs2_dir_release(struct inode inode, struct file file)
	169	{
	170	ocfs2_free_file_private(inode, file);
	171	return 0;
	172	}
	173
	174	static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
	175	int datasync)
	176	{
	177	int err = 0;
	178	journal_t *journal;
	179	struct inode *inode = file->f_mapping->host;
	180	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	181
	182	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
	183	OCFS2_I(inode)->ip_blkno,
	184	file->f_path.dentry->d_name.len,
	185	file->f_path.dentry->d_name.name,
	186	(unsigned long long)datasync);
	187
	188	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
	189	if (err)
	190	return err;
	191
	192	/*
	193	* Probably don't need the i_mutex at all in here, just putting it here
	194	* to be consistent with how fsync used to be called, someone more
	195	* familiar with the fs could possibly remove it.
	196	*/
	197	mutex_lock(&inode->i_mutex);
	198	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
	199	/*
	200	* We still have to flush drive's caches to get data to the
	201	* platter
	202	*/
	203	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
	204	blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
	205	goto bail;
	206	}
	207
	208	journal = osb->journal->j_journal;
	209	err = jbd2_journal_force_commit(journal);
	210
	211	bail:
	212	if (err)
	213	mlog_errno(err);
	214	mutex_unlock(&inode->i_mutex);
	215
	216	return (err < 0) ? -EIO : 0;
	217	}
	218
	219	int ocfs2_should_update_atime(struct inode *inode,
	220	struct vfsmount *vfsmnt)
	221	{
	222	struct timespec now;
	223	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	224
	225	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
	226	return 0;
	227
	228	if ((inode->i_flags & S_NOATIME) \|\|
	229	((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
	230	return 0;
	231
	232	/*
	233	* We can be called with no vfsmnt structure - NFSD will
	234	* sometimes do this.
	235	*
	236	* Note that our action here is different than touch_atime() -
	237	* if we can't tell whether this is a noatime mount, then we
	238	* don't know whether to trust the value of s_atime_quantum.
	239	*/
	240	if (vfsmnt == NULL)
	241	return 0;
	242
	243	if ((vfsmnt->mnt_flags & MNT_NOATIME) \|\|
	244	((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
	245	return 0;
	246
	247	if (vfsmnt->mnt_flags & MNT_RELATIME) {
	248	if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) \|\|
	249	(timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
	250	return 1;
	251
	252	return 0;
	253	}
	254
	255	now = CURRENT_TIME;
	256	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
	257	return 0;
	258	else
	259	return 1;
	260	}
	261
	262	int ocfs2_update_inode_atime(struct inode *inode,
	263	struct buffer_head *bh)
	264	{
	265	int ret;
	266	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	267	handle_t *handle;
	268	struct ocfs2_dinode di = (struct ocfs2_dinode ) bh->b_data;
	269
	270	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	271	if (IS_ERR(handle)) {
	272	ret = PTR_ERR(handle);
	273	mlog_errno(ret);
	274	goto out;
	275	}
	276
	277	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
	278	OCFS2_JOURNAL_ACCESS_WRITE);
	279	if (ret) {
	280	mlog_errno(ret);
	281	goto out_commit;
	282	}
	283
	284	/*
	285	* Don't use ocfs2_mark_inode_dirty() here as we don't always
	286	* have i_mutex to guard against concurrent changes to other
	287	* inode fields.
	288	*/
	289	inode->i_atime = CURRENT_TIME;
	290	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
	291	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
	292	ocfs2_journal_dirty(handle, bh);
	293
	294	out_commit:
	295	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
	296	out:
	297	return ret;
	298	}
	299
	300	static int ocfs2_set_inode_size(handle_t *handle,
	301	struct inode *inode,
	302	struct buffer_head *fe_bh,
	303	u64 new_i_size)
	304	{
	305	int status;
	306
	307	i_size_write(inode, new_i_size);
	308	inode->i_blocks = ocfs2_inode_sector_count(inode);
	309	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	310
	311	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	312	if (status < 0) {
	313	mlog_errno(status);
	314	goto bail;
	315	}
	316
	317	bail:
	318	return status;
	319	}
	320
	321	int ocfs2_simple_size_update(struct inode *inode,
	322	struct buffer_head *di_bh,
	323	u64 new_i_size)
	324	{
	325	int ret;
	326	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	327	handle_t *handle = NULL;
	328
	329	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	330	if (IS_ERR(handle)) {
	331	ret = PTR_ERR(handle);
	332	mlog_errno(ret);
	333	goto out;
	334	}
	335
	336	ret = ocfs2_set_inode_size(handle, inode, di_bh,
	337	new_i_size);
	338	if (ret < 0)
	339	mlog_errno(ret);
	340
	341	ocfs2_commit_trans(osb, handle);
	342	out:
	343	return ret;
	344	}
	345
	346	static int ocfs2_cow_file_pos(struct inode *inode,
	347	struct buffer_head *fe_bh,
	348	u64 offset)
	349	{
	350	int status;
	351	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	352	unsigned int num_clusters = 0;
	353	unsigned int ext_flags = 0;
	354
	355	/*
	356	* If the new offset is aligned to the range of the cluster, there is
	357	* no space for ocfs2_zero_range_for_truncate to fill, so no need to
	358	* CoW either.
	359	*/
	360	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
	361	return 0;
	362
	363	status = ocfs2_get_clusters(inode, cpos, &phys,
	364	&num_clusters, &ext_flags);
	365	if (status) {
	366	mlog_errno(status);
	367	goto out;
	368	}
	369
	370	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
	371	goto out;
	372
	373	return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
	374
	375	out:
	376	return status;
	377	}
	378
	379	static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
	380	struct inode *inode,
	381	struct buffer_head *fe_bh,
	382	u64 new_i_size)
	383	{
	384	int status;
	385	handle_t *handle;
	386	struct ocfs2_dinode *di;
	387	u64 cluster_bytes;
	388
	389	/*
	390	* We need to CoW the cluster contains the offset if it is reflinked
	391	* since we will call ocfs2_zero_range_for_truncate later which will
	392	* write "0" from offset to the end of the cluster.
	393	*/
	394	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
	395	if (status) {
	396	mlog_errno(status);
	397	return status;
	398	}
	399
	400	/* TODO: This needs to actually orphan the inode in this
	401	* transaction. */
	402
	403	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	404	if (IS_ERR(handle)) {
	405	status = PTR_ERR(handle);
	406	mlog_errno(status);
	407	goto out;
	408	}
	409
	410	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
	411	OCFS2_JOURNAL_ACCESS_WRITE);
	412	if (status < 0) {
	413	mlog_errno(status);
	414	goto out_commit;
	415	}
	416
	417	/*
	418	* Do this before setting i_size.
	419	*/
	420	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
	421	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
	422	cluster_bytes);
	423	if (status) {
	424	mlog_errno(status);
	425	goto out_commit;
	426	}
	427
	428	i_size_write(inode, new_i_size);
	429	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	430
	431	di = (struct ocfs2_dinode *) fe_bh->b_data;
	432	di->i_size = cpu_to_le64(new_i_size);
	433	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	434	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
	435
	436	ocfs2_journal_dirty(handle, fe_bh);
	437
	438	out_commit:
	439	ocfs2_commit_trans(osb, handle);
	440	out:
	441	return status;
	442	}
	443
	444	static int ocfs2_truncate_file(struct inode *inode,
	445	struct buffer_head *di_bh,
	446	u64 new_i_size)
	447	{
	448	int status = 0;
	449	struct ocfs2_dinode *fe = NULL;
	450	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	451
	452	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
	453	* already validated it */
	454	fe = (struct ocfs2_dinode *) di_bh->b_data;
	455
	456	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
	457	(unsigned long long)le64_to_cpu(fe->i_size),
	458	(unsigned long long)new_i_size);
	459
	460	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
	461	"Inode %llu, inode i_size = %lld != di "
	462	"i_size = %llu, i_flags = 0x%x\n",
	463	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	464	i_size_read(inode),
	465	(unsigned long long)le64_to_cpu(fe->i_size),
	466	le32_to_cpu(fe->i_flags));
	467
	468	if (new_i_size > le64_to_cpu(fe->i_size)) {
	469	trace_ocfs2_truncate_file_error(
	470	(unsigned long long)le64_to_cpu(fe->i_size),
	471	(unsigned long long)new_i_size);
	472	status = -EINVAL;
	473	mlog_errno(status);
	474	goto bail;
	475	}
	476
	477	/* lets handle the simple truncate cases before doing any more
	478	* cluster locking. */
	479	if (new_i_size == le64_to_cpu(fe->i_size))
	480	goto bail;
	481
	482	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	483
	484	ocfs2_resv_discard(&osb->osb_la_resmap,
	485	&OCFS2_I(inode)->ip_la_data_resv);
	486
	487	/*
	488	* The inode lock forced other nodes to sync and drop their
	489	* pages, which (correctly) happens even if we have a truncate
	490	* without allocation change - ocfs2 cluster sizes can be much
	491	* greater than page size, so we have to truncate them
	492	* anyway.
	493	*/
	494	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	495	truncate_inode_pages(inode->i_mapping, new_i_size);
	496
	497	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
	498	status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
	499	i_size_read(inode), 1);
	500	if (status)
	501	mlog_errno(status);
	502
	503	goto bail_unlock_sem;
	504	}
	505
	506	/* alright, we're going to need to do a full blown alloc size
	507	* change. Orphan the inode so that recovery can complete the
	508	* truncate if necessary. This does the task of marking
	509	* i_size. */
	510	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	511	if (status < 0) {
	512	mlog_errno(status);
	513	goto bail_unlock_sem;
	514	}
	515
	516	status = ocfs2_commit_truncate(osb, inode, di_bh);
	517	if (status < 0) {
	518	mlog_errno(status);
	519	goto bail_unlock_sem;
	520	}
	521
	522	/* TODO: orphan dir cleanup here. */
	523	bail_unlock_sem:
	524	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	525
	526	bail:
	527	if (!status && OCFS2_I(inode)->ip_clusters == 0)
	528	status = ocfs2_try_remove_refcount_tree(inode, di_bh);
	529
	530	return status;
	531	}
	532
	533	/*
	534	* extend file allocation only here.
	535	* we'll update all the disk stuff, and oip->alloc_size
	536	*
	537	* expect stuff to be locked, a transaction started and enough data /
	538	* metadata reservations in the contexts.
	539	*
	540	* Will return -EAGAIN, and a reason if a restart is needed.
	541	* If passed in, *reason will always be set, even in error.
	542	*/
	543	int ocfs2_add_inode_data(struct ocfs2_super *osb,
	544	struct inode *inode,
	545	u32 *logical_offset,
	546	u32 clusters_to_add,
	547	int mark_unwritten,
	548	struct buffer_head *fe_bh,
	549	handle_t *handle,
	550	struct ocfs2_alloc_context *data_ac,
	551	struct ocfs2_alloc_context *meta_ac,
	552	enum ocfs2_alloc_restarted *reason_ret)
	553	{
	554	int ret;
	555	struct ocfs2_extent_tree et;
	556
	557	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
	558	ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
	559	clusters_to_add, mark_unwritten,
	560	data_ac, meta_ac, reason_ret);
	561
	562	return ret;
	563	}
	564
	565	static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
	566	u32 clusters_to_add, int mark_unwritten)
	567	{
	568	int status = 0;
	569	int restart_func = 0;
	570	int credits;
	571	u32 prev_clusters;
	572	struct buffer_head *bh = NULL;
	573	struct ocfs2_dinode *fe = NULL;
	574	handle_t *handle = NULL;
	575	struct ocfs2_alloc_context *data_ac = NULL;
	576	struct ocfs2_alloc_context *meta_ac = NULL;
	577	enum ocfs2_alloc_restarted why;
	578	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	579	struct ocfs2_extent_tree et;
	580	int did_quota = 0;
	581
	582	/*
	583	* This function only exists for file systems which don't
	584	* support holes.
	585	*/
	586	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
	587
	588	status = ocfs2_read_inode_block(inode, &bh);
	589	if (status < 0) {
	590	mlog_errno(status);
	591	goto leave;
	592	}
	593	fe = (struct ocfs2_dinode *) bh->b_data;
	594
	595	restart_all:
	596	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
	597
	598	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
	599	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
	600	&data_ac, &meta_ac);
	601	if (status) {
	602	mlog_errno(status);
	603	goto leave;
	604	}
	605
	606	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
	607	clusters_to_add);
	608	handle = ocfs2_start_trans(osb, credits);
	609	if (IS_ERR(handle)) {
	610	status = PTR_ERR(handle);
	611	handle = NULL;
	612	mlog_errno(status);
	613	goto leave;
	614	}
	615
	616	restarted_transaction:
	617	trace_ocfs2_extend_allocation(
	618	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	619	(unsigned long long)i_size_read(inode),
	620	le32_to_cpu(fe->i_clusters), clusters_to_add,
	621	why, restart_func);
	622
	623	status = dquot_alloc_space_nodirty(inode,
	624	ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	625	if (status)
	626	goto leave;
	627	did_quota = 1;
	628
	629	/* reserve a write to the file entry early on - that we if we
	630	* run out of credits in the allocation path, we can still
	631	* update i_size. */
	632	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
	633	OCFS2_JOURNAL_ACCESS_WRITE);
	634	if (status < 0) {
	635	mlog_errno(status);
	636	goto leave;
	637	}
	638
	639	prev_clusters = OCFS2_I(inode)->ip_clusters;
	640
	641	status = ocfs2_add_inode_data(osb,
	642	inode,
	643	&logical_start,
	644	clusters_to_add,
	645	mark_unwritten,
	646	bh,
	647	handle,
	648	data_ac,
	649	meta_ac,
	650	&why);
	651	if ((status < 0) && (status != -EAGAIN)) {
	652	if (status != -ENOSPC)
	653	mlog_errno(status);
	654	goto leave;
	655	}
	656
	657	ocfs2_journal_dirty(handle, bh);
	658
	659	spin_lock(&OCFS2_I(inode)->ip_lock);
	660	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	661	spin_unlock(&OCFS2_I(inode)->ip_lock);
	662	/* Release unused quota reservation */
	663	dquot_free_space(inode,
	664	ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	665	did_quota = 0;
	666
	667	if (why != RESTART_NONE && clusters_to_add) {
	668	if (why == RESTART_META) {
	669	restart_func = 1;
	670	status = 0;
	671	} else {
	672	BUG_ON(why != RESTART_TRANS);
	673
	674	/* TODO: This can be more intelligent. */
	675	credits = ocfs2_calc_extend_credits(osb->sb,
	676	&fe->id2.i_list,
	677	clusters_to_add);
	678	status = ocfs2_extend_trans(handle, credits);
	679	if (status < 0) {
	680	/* handle still has to be committed at
	681	* this point. */
	682	status = -ENOMEM;
	683	mlog_errno(status);
	684	goto leave;
	685	}
	686	goto restarted_transaction;
	687	}
	688	}
	689
	690	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
	691	le32_to_cpu(fe->i_clusters),
	692	(unsigned long long)le64_to_cpu(fe->i_size),
	693	OCFS2_I(inode)->ip_clusters,
	694	(unsigned long long)i_size_read(inode));
	695
	696	leave:
	697	if (status < 0 && did_quota)
	698	dquot_free_space(inode,
	699	ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	700	if (handle) {
	701	ocfs2_commit_trans(osb, handle);
	702	handle = NULL;
	703	}
	704	if (data_ac) {
	705	ocfs2_free_alloc_context(data_ac);
	706	data_ac = NULL;
	707	}
	708	if (meta_ac) {
	709	ocfs2_free_alloc_context(meta_ac);
	710	meta_ac = NULL;
	711	}
	712	if ((!status) && restart_func) {
	713	restart_func = 0;
	714	goto restart_all;
	715	}
	716	brelse(bh);
	717	bh = NULL;
	718
	719	return status;
	720	}
	721
	722	/*
	723	* While a write will already be ordering the data, a truncate will not.
	724	* Thus, we need to explicitly order the zeroed pages.
	725	*/
	726	static handle_t ocfs2_zero_start_ordered_transaction(struct inode inode)
	727	{
	728	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	729	handle_t *handle = NULL;
	730	int ret = 0;
	731
	732	if (!ocfs2_should_order_data(inode))
	733	goto out;
	734
	735	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	736	if (IS_ERR(handle)) {
	737	ret = -ENOMEM;
	738	mlog_errno(ret);
	739	goto out;
	740	}
	741
	742	ret = ocfs2_jbd2_file_inode(handle, inode);
	743	if (ret < 0)
	744	mlog_errno(ret);
	745
	746	out:
	747	if (ret) {
	748	if (!IS_ERR(handle))
	749	ocfs2_commit_trans(osb, handle);
	750	handle = ERR_PTR(ret);
	751	}
	752	return handle;
	753	}
	754
	755	/* Some parts of this taken from generic_cont_expand, which turned out
	756	* to be too fragile to do exactly what we need without us having to
	757	* worry about recursive locking in ->write_begin() and ->write_end(). */
	758	static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
	759	u64 abs_to)
	760	{
	761	struct address_space *mapping = inode->i_mapping;
	762	struct page *page;
	763	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
	764	handle_t *handle = NULL;
	765	int ret = 0;
	766	unsigned zero_from, zero_to, block_start, block_end;
	767
	768	BUG_ON(abs_from >= abs_to);
	769	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
	770	BUG_ON(abs_from & (inode->i_blkbits - 1));
	771
	772	page = find_or_create_page(mapping, index, GFP_NOFS);
	773	if (!page) {
	774	ret = -ENOMEM;
	775	mlog_errno(ret);
	776	goto out;
	777	}
	778
	779	/* Get the offsets within the page that we want to zero */
	780	zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
	781	zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
	782	if (!zero_to)
	783	zero_to = PAGE_CACHE_SIZE;
	784
	785	trace_ocfs2_write_zero_page(
	786	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	787	(unsigned long long)abs_from,
	788	(unsigned long long)abs_to,
	789	index, zero_from, zero_to);
	790
	791	/* We know that zero_from is block aligned */
	792	for (block_start = zero_from; block_start < zero_to;
	793	block_start = block_end) {
	794	block_end = block_start + (1 << inode->i_blkbits);
	795
	796	/*
	797	* block_start is block-aligned. Bump it by one to force
	798	* __block_write_begin and block_commit_write to zero the
	799	* whole block.
	800	*/
	801	ret = __block_write_begin(page, block_start + 1, 0,
	802	ocfs2_get_block);
	803	if (ret < 0) {
	804	mlog_errno(ret);
	805	goto out_unlock;
	806	}
	807
	808	if (!handle) {
	809	handle = ocfs2_zero_start_ordered_transaction(inode);
	810	if (IS_ERR(handle)) {
	811	ret = PTR_ERR(handle);
	812	handle = NULL;
	813	break;
	814	}
	815	}
	816
	817	/* must not update i_size! */
	818	ret = block_commit_write(page, block_start + 1,
	819	block_start + 1);
	820	if (ret < 0)
	821	mlog_errno(ret);
	822	else
	823	ret = 0;
	824	}
	825
	826	if (handle)
	827	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
	828
	829	out_unlock:
	830	unlock_page(page);
	831	page_cache_release(page);
	832	out:
	833	return ret;
	834	}
	835
	836	/*
	837	* Find the next range to zero. We do this in terms of bytes because
	838	* that's what ocfs2_zero_extend() wants, and it is dealing with the
	839	* pagecache. We may return multiple extents.
	840	*
	841	* zero_start and zero_end are ocfs2_zero_extend()s current idea of what
	842	* needs to be zeroed. range_start and range_end return the next zeroing
	843	* range. A subsequent call should pass the previous range_end as its
	844	* zero_start. If range_end is 0, there's nothing to do.
	845	*
	846	* Unwritten extents are skipped over. Refcounted extents are CoWd.
	847	*/
	848	static int ocfs2_zero_extend_get_range(struct inode *inode,
	849	struct buffer_head *di_bh,
	850	u64 zero_start, u64 zero_end,
	851	u64 range_start, u64 range_end)
	852	{
	853	int rc = 0, needs_cow = 0;
	854	u32 p_cpos, zero_clusters = 0;
	855	u32 zero_cpos =
	856	zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	857	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
	858	unsigned int num_clusters = 0;
	859	unsigned int ext_flags = 0;
	860
	861	while (zero_cpos < last_cpos) {
	862	rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
	863	&num_clusters, &ext_flags);
	864	if (rc) {
	865	mlog_errno(rc);
	866	goto out;
	867	}
	868
	869	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
	870	zero_clusters = num_clusters;
	871	if (ext_flags & OCFS2_EXT_REFCOUNTED)
	872	needs_cow = 1;
	873	break;
	874	}
	875
	876	zero_cpos += num_clusters;
	877	}
	878	if (!zero_clusters) {
	879	*range_end = 0;
	880	goto out;
	881	}
	882
	883	while ((zero_cpos + zero_clusters) < last_cpos) {
	884	rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
	885	&p_cpos, &num_clusters,
	886	&ext_flags);
	887	if (rc) {
	888	mlog_errno(rc);
	889	goto out;
	890	}
	891
	892	if (!p_cpos \|\| (ext_flags & OCFS2_EXT_UNWRITTEN))
	893	break;
	894	if (ext_flags & OCFS2_EXT_REFCOUNTED)
	895	needs_cow = 1;
	896	zero_clusters += num_clusters;
	897	}
	898	if ((zero_cpos + zero_clusters) > last_cpos)
	899	zero_clusters = last_cpos - zero_cpos;
	900
	901	if (needs_cow) {
	902	rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
	903	zero_clusters, UINT_MAX);
	904	if (rc) {
	905	mlog_errno(rc);
	906	goto out;
	907	}
	908	}
	909
	910	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
	911	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
	912	zero_cpos + zero_clusters);
	913
	914	out:
	915	return rc;
	916	}
	917
	918	/*
	919	* Zero one range returned from ocfs2_zero_extend_get_range(). The caller
	920	* has made sure that the entire range needs zeroing.
	921	*/
	922	static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
	923	u64 range_end)
	924	{
	925	int rc = 0;
	926	u64 next_pos;
	927	u64 zero_pos = range_start;
	928
	929	trace_ocfs2_zero_extend_range(
	930	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	931	(unsigned long long)range_start,
	932	(unsigned long long)range_end);
	933	BUG_ON(range_start >= range_end);
	934
	935	while (zero_pos < range_end) {
	936	next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
	937	if (next_pos > range_end)
	938	next_pos = range_end;
	939	rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
	940	if (rc < 0) {
	941	mlog_errno(rc);
	942	break;
	943	}
	944	zero_pos = next_pos;
	945
	946	/*
	947	* Very large extends have the potential to lock up
	948	* the cpu for extended periods of time.
	949	*/
	950	cond_resched();
	951	}
	952
	953	return rc;
	954	}
	955
	956	int ocfs2_zero_extend(struct inode inode, struct buffer_head di_bh,
	957	loff_t zero_to_size)
	958	{
	959	int ret = 0;
	960	u64 zero_start, range_start = 0, range_end = 0;
	961	struct super_block *sb = inode->i_sb;
	962
	963	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	964	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
	965	(unsigned long long)zero_start,
	966	(unsigned long long)i_size_read(inode));
	967	while (zero_start < zero_to_size) {
	968	ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
	969	zero_to_size,
	970	&range_start,
	971	&range_end);
	972	if (ret) {
	973	mlog_errno(ret);
	974	break;
	975	}
	976	if (!range_end)
	977	break;
	978	/* Trim the ends */
	979	if (range_start < zero_start)
	980	range_start = zero_start;
	981	if (range_end > zero_to_size)
	982	range_end = zero_to_size;
	983
	984	ret = ocfs2_zero_extend_range(inode, range_start,
	985	range_end);
	986	if (ret) {
	987	mlog_errno(ret);
	988	break;
	989	}
	990	zero_start = range_end;
	991	}
	992
	993	return ret;
	994	}
	995
	996	int ocfs2_extend_no_holes(struct inode inode, struct buffer_head di_bh,
	997	u64 new_i_size, u64 zero_to)
	998	{
	999	int ret;
	1000	u32 clusters_to_add;
	1001	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	1002
	1003	/*
	1004	* Only quota files call this without a bh, and they can't be
	1005	* refcounted.
	1006	*/
	1007	BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
	1008	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
	1009
	1010	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	1011	if (clusters_to_add < oi->ip_clusters)
	1012	clusters_to_add = 0;
	1013	else
	1014	clusters_to_add -= oi->ip_clusters;
	1015
	1016	if (clusters_to_add) {
	1017	ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
	1018	clusters_to_add, 0);
	1019	if (ret) {
	1020	mlog_errno(ret);
	1021	goto out;
	1022	}
	1023	}
	1024
	1025	/*
	1026	* Call this even if we don't add any clusters to the tree. We
	1027	* still need to zero the area between the old i_size and the
	1028	* new i_size.
	1029	*/
	1030	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
	1031	if (ret < 0)
	1032	mlog_errno(ret);
	1033
	1034	out:
	1035	return ret;
	1036	}
	1037
	1038	static int ocfs2_extend_file(struct inode *inode,
	1039	struct buffer_head *di_bh,
	1040	u64 new_i_size)
	1041	{
	1042	int ret = 0;
	1043	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	1044
	1045	BUG_ON(!di_bh);
	1046
	1047	/* setattr sometimes calls us like this. */
	1048	if (new_i_size == 0)
	1049	goto out;
	1050
	1051	if (i_size_read(inode) == new_i_size)
	1052	goto out;
	1053	BUG_ON(new_i_size < i_size_read(inode));
	1054
	1055	/*
	1056	* The alloc sem blocks people in read/write from reading our
	1057	* allocation until we're done changing it. We depend on
	1058	* i_mutex to block other extend/truncate calls while we're
	1059	* here. We even have to hold it for sparse files because there
	1060	* might be some tail zeroing.
	1061	*/
	1062	down_write(&oi->ip_alloc_sem);
	1063
	1064	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
	1065	/*
	1066	* We can optimize small extends by keeping the inodes
	1067	* inline data.
	1068	*/
	1069	if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
	1070	up_write(&oi->ip_alloc_sem);
	1071	goto out_update_size;
	1072	}
	1073
	1074	ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
	1075	if (ret) {
	1076	up_write(&oi->ip_alloc_sem);
	1077	mlog_errno(ret);
	1078	goto out;
	1079	}
	1080	}
	1081
	1082	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
	1083	ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
	1084	else
	1085	ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
	1086	new_i_size);
	1087
	1088	up_write(&oi->ip_alloc_sem);
	1089
	1090	if (ret < 0) {
	1091	mlog_errno(ret);
	1092	goto out;
	1093	}
	1094
	1095	out_update_size:
	1096	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	1097	if (ret < 0)
	1098	mlog_errno(ret);
	1099
	1100	out:
	1101	return ret;
	1102	}
	1103
	1104	int ocfs2_setattr(struct dentry dentry, struct iattr attr)
	1105	{
	1106	int status = 0, size_change;
	1107	struct inode *inode = dentry->d_inode;
	1108	struct super_block *sb = inode->i_sb;
	1109	struct ocfs2_super *osb = OCFS2_SB(sb);
	1110	struct buffer_head *bh = NULL;
	1111	handle_t *handle = NULL;
	1112	struct dquot *transfer_to[MAXQUOTAS] = { };
	1113	int qtype;
	1114
	1115	trace_ocfs2_setattr(inode, dentry,
	1116	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	1117	dentry->d_name.len, dentry->d_name.name,
	1118	attr->ia_valid, attr->ia_mode,
	1119	from_kuid(&init_user_ns, attr->ia_uid),
	1120	from_kgid(&init_user_ns, attr->ia_gid));
	1121
	1122	/* ensuring we don't even attempt to truncate a symlink */
	1123	if (S_ISLNK(inode->i_mode))
	1124	attr->ia_valid &= ~ATTR_SIZE;
	1125
	1126	#define OCFS2_VALID_ATTRS (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME \| ATTR_SIZE \
	1127	\| ATTR_GID \| ATTR_UID \| ATTR_MODE)
	1128	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
	1129	return 0;
	1130
	1131	status = inode_change_ok(inode, attr);
	1132	if (status)
	1133	return status;
	1134
	1135	if (is_quota_modification(inode, attr))
	1136	dquot_initialize(inode);
	1137	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	1138	if (size_change) {
	1139	status = ocfs2_rw_lock(inode, 1);
	1140	if (status < 0) {
	1141	mlog_errno(status);
	1142	goto bail;
	1143	}
	1144	}
	1145
	1146	status = ocfs2_inode_lock(inode, &bh, 1);
	1147	if (status < 0) {
	1148	if (status != -ENOENT)
	1149	mlog_errno(status);
	1150	goto bail_unlock_rw;
	1151	}
	1152
	1153	if (size_change && attr->ia_size != i_size_read(inode)) {
	1154	status = inode_newsize_ok(inode, attr->ia_size);
	1155	if (status)
	1156	goto bail_unlock;
	1157
	1158	inode_dio_wait(inode);
	1159
	1160	if (i_size_read(inode) > attr->ia_size) {
	1161	if (ocfs2_should_order_data(inode)) {
	1162	status = ocfs2_begin_ordered_truncate(inode,
	1163	attr->ia_size);
	1164	if (status)
	1165	goto bail_unlock;
	1166	}
	1167	status = ocfs2_truncate_file(inode, bh, attr->ia_size);
	1168	} else
	1169	status = ocfs2_extend_file(inode, bh, attr->ia_size);
	1170	if (status < 0) {
	1171	if (status != -ENOSPC)
	1172	mlog_errno(status);
	1173	status = -ENOSPC;
	1174	goto bail_unlock;
	1175	}
	1176	}
	1177
	1178	if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) \|\|
	1179	(attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
	1180	/*
	1181	* Gather pointers to quota structures so that allocation /
	1182	* freeing of quota structures happens here and not inside
	1183	* dquot_transfer() where we have problems with lock ordering
	1184	*/
	1185	if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
	1186	&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
	1187	OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
	1188	transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
	1189	if (!transfer_to[USRQUOTA]) {
	1190	status = -ESRCH;
	1191	goto bail_unlock;
	1192	}
	1193	}
	1194	if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
	1195	&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
	1196	OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
	1197	transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
	1198	if (!transfer_to[GRPQUOTA]) {
	1199	status = -ESRCH;
	1200	goto bail_unlock;
	1201	}
	1202	}
	1203	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
	1204	2 * ocfs2_quota_trans_credits(sb));
	1205	if (IS_ERR(handle)) {
	1206	status = PTR_ERR(handle);
	1207	mlog_errno(status);
	1208	goto bail_unlock;
	1209	}
	1210	status = __dquot_transfer(inode, transfer_to);
	1211	if (status < 0)
	1212	goto bail_commit;
	1213	} else {
	1214	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	1215	if (IS_ERR(handle)) {
	1216	status = PTR_ERR(handle);
	1217	mlog_errno(status);
	1218	goto bail_unlock;
	1219	}
	1220	}
	1221
	1222	setattr_copy(inode, attr);
	1223	mark_inode_dirty(inode);
	1224
	1225	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	1226	if (status < 0)
	1227	mlog_errno(status);
	1228
	1229	bail_commit:
	1230	ocfs2_commit_trans(osb, handle);
	1231	bail_unlock:
	1232	ocfs2_inode_unlock(inode, 1);
	1233	bail_unlock_rw:
	1234	if (size_change)
	1235	ocfs2_rw_unlock(inode, 1);
	1236	bail:
	1237	brelse(bh);
	1238
	1239	/* Release quota pointers in case we acquired them */
	1240	for (qtype = 0; qtype < MAXQUOTAS; qtype++)
	1241	dqput(transfer_to[qtype]);
	1242
	1243	if (!status && attr->ia_valid & ATTR_MODE) {
	1244	status = ocfs2_acl_chmod(inode);
	1245	if (status < 0)
	1246	mlog_errno(status);
	1247	}
	1248
	1249	return status;
	1250	}
	1251
	1252	int ocfs2_getattr(struct vfsmount *mnt,
	1253	struct dentry *dentry,
	1254	struct kstat *stat)
	1255	{
	1256	struct inode *inode = dentry->d_inode;
	1257	struct super_block *sb = dentry->d_inode->i_sb;
	1258	struct ocfs2_super *osb = sb->s_fs_info;
	1259	int err;
	1260
	1261	err = ocfs2_inode_revalidate(dentry);
	1262	if (err) {
	1263	if (err != -ENOENT)
	1264	mlog_errno(err);
	1265	goto bail;
	1266	}
	1267
	1268	generic_fillattr(inode, stat);
	1269
	1270	/* We set the blksize from the cluster size for performance */
	1271	stat->blksize = osb->s_clustersize;
	1272
	1273	bail:
	1274	return err;
	1275	}
	1276
	1277	int ocfs2_permission(struct inode *inode, int mask)
	1278	{
	1279	int ret;
	1280
	1281	if (mask & MAY_NOT_BLOCK)
	1282	return -ECHILD;
	1283
	1284	ret = ocfs2_inode_lock(inode, NULL, 0);
	1285	if (ret) {
	1286	if (ret != -ENOENT)
	1287	mlog_errno(ret);
	1288	goto out;
	1289	}
	1290
	1291	ret = generic_permission(inode, mask);
	1292
	1293	ocfs2_inode_unlock(inode, 0);
	1294	out:
	1295	return ret;
	1296	}
	1297
	1298	static int __ocfs2_write_remove_suid(struct inode *inode,
	1299	struct buffer_head *bh)
	1300	{
	1301	int ret;
	1302	handle_t *handle;
	1303	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	1304	struct ocfs2_dinode *di;
	1305
	1306	trace_ocfs2_write_remove_suid(
	1307	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	1308	inode->i_mode);
	1309
	1310	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	1311	if (IS_ERR(handle)) {
	1312	ret = PTR_ERR(handle);
	1313	mlog_errno(ret);
	1314	goto out;
	1315	}
	1316
	1317	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
	1318	OCFS2_JOURNAL_ACCESS_WRITE);
	1319	if (ret < 0) {
	1320	mlog_errno(ret);
	1321	goto out_trans;
	1322	}
	1323
	1324	inode->i_mode &= ~S_ISUID;
	1325	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
	1326	inode->i_mode &= ~S_ISGID;
	1327
	1328	di = (struct ocfs2_dinode *) bh->b_data;
	1329	di->i_mode = cpu_to_le16(inode->i_mode);
	1330
	1331	ocfs2_journal_dirty(handle, bh);
	1332
	1333	out_trans:
	1334	ocfs2_commit_trans(osb, handle);
	1335	out:
	1336	return ret;
	1337	}
	1338
	1339	/*
	1340	* Will look for holes and unwritten extents in the range starting at
	1341	* pos for count bytes (inclusive).
	1342	*/
	1343	static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
	1344	size_t count)
	1345	{
	1346	int ret = 0;
	1347	unsigned int extent_flags;
	1348	u32 cpos, clusters, extent_len, phys_cpos;
	1349	struct super_block *sb = inode->i_sb;
	1350
	1351	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
	1352	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
	1353
	1354	while (clusters) {
	1355	ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
	1356	&extent_flags);
	1357	if (ret < 0) {
	1358	mlog_errno(ret);
	1359	goto out;
	1360	}
	1361
	1362	if (phys_cpos == 0 \|\| (extent_flags & OCFS2_EXT_UNWRITTEN)) {
	1363	ret = 1;
	1364	break;
	1365	}
	1366
	1367	if (extent_len > clusters)
	1368	extent_len = clusters;
	1369
	1370	clusters -= extent_len;
	1371	cpos += extent_len;
	1372	}
	1373	out:
	1374	return ret;
	1375	}
	1376
	1377	static int ocfs2_write_remove_suid(struct inode *inode)
	1378	{
	1379	int ret;
	1380	struct buffer_head *bh = NULL;
	1381
	1382	ret = ocfs2_read_inode_block(inode, &bh);
	1383	if (ret < 0) {
	1384	mlog_errno(ret);
	1385	goto out;
	1386	}
	1387
	1388	ret = __ocfs2_write_remove_suid(inode, bh);
	1389	out:
	1390	brelse(bh);
	1391	return ret;
	1392	}
	1393
	1394	/*
	1395	* Allocate enough extents to cover the region starting at byte offset
	1396	* start for len bytes. Existing extents are skipped, any extents
	1397	* added are marked as "unwritten".
	1398	*/
	1399	static int ocfs2_allocate_unwritten_extents(struct inode *inode,
	1400	u64 start, u64 len)
	1401	{
	1402	int ret;
	1403	u32 cpos, phys_cpos, clusters, alloc_size;
	1404	u64 end = start + len;
	1405	struct buffer_head *di_bh = NULL;
	1406
	1407	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
	1408	ret = ocfs2_read_inode_block(inode, &di_bh);
	1409	if (ret) {
	1410	mlog_errno(ret);
	1411	goto out;
	1412	}
	1413
	1414	/*
	1415	* Nothing to do if the requested reservation range
	1416	* fits within the inode.
	1417	*/
	1418	if (ocfs2_size_fits_inline_data(di_bh, end))
	1419	goto out;
	1420
	1421	ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
	1422	if (ret) {
	1423	mlog_errno(ret);
	1424	goto out;
	1425	}
	1426	}
	1427
	1428	/*
	1429	* We consider both start and len to be inclusive.
	1430	*/
	1431	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	1432	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
	1433	clusters -= cpos;
	1434
	1435	while (clusters) {
	1436	ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
	1437	&alloc_size, NULL);
	1438	if (ret) {
	1439	mlog_errno(ret);
	1440	goto out;
	1441	}
	1442
	1443	/*
	1444	* Hole or existing extent len can be arbitrary, so
	1445	* cap it to our own allocation request.
	1446	*/
	1447	if (alloc_size > clusters)
	1448	alloc_size = clusters;
	1449
	1450	if (phys_cpos) {
	1451	/*
	1452	* We already have an allocation at this
	1453	* region so we can safely skip it.
	1454	*/
	1455	goto next;
	1456	}
	1457
	1458	ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
	1459	if (ret) {
	1460	if (ret != -ENOSPC)
	1461	mlog_errno(ret);
	1462	goto out;
	1463	}
	1464
	1465	next:
	1466	cpos += alloc_size;
	1467	clusters -= alloc_size;
	1468	}
	1469
	1470	ret = 0;
	1471	out:
	1472
	1473	brelse(di_bh);
	1474	return ret;
	1475	}
	1476
	1477	/*
	1478	* Truncate a byte range, avoiding pages within partial clusters. This
	1479	* preserves those pages for the zeroing code to write to.
	1480	*/
	1481	static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
	1482	u64 byte_len)
	1483	{
	1484	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	1485	loff_t start, end;
	1486	struct address_space *mapping = inode->i_mapping;
	1487
	1488	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
	1489	end = byte_start + byte_len;
	1490	end = end & ~(osb->s_clustersize - 1);
	1491
	1492	if (start < end) {
	1493	unmap_mapping_range(mapping, start, end - start, 0);
	1494	truncate_inode_pages_range(mapping, start, end - 1);
	1495	}
	1496	}
	1497
	1498	static int ocfs2_zero_partial_clusters(struct inode *inode,
	1499	u64 start, u64 len)
	1500	{
	1501	int ret = 0;
	1502	u64 tmpend, end = start + len;
	1503	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	1504	unsigned int csize = osb->s_clustersize;
	1505	handle_t *handle;
	1506
	1507	/*
	1508	* The "start" and "end" values are NOT necessarily part of
	1509	* the range whose allocation is being deleted. Rather, this
	1510	* is what the user passed in with the request. We must zero
	1511	* partial clusters here. There's no need to worry about
	1512	* physical allocation - the zeroing code knows to skip holes.
	1513	*/
	1514	trace_ocfs2_zero_partial_clusters(
	1515	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	1516	(unsigned long long)start, (unsigned long long)end);
	1517
	1518	/*
	1519	* If both edges are on a cluster boundary then there's no
	1520	* zeroing required as the region is part of the allocation to
	1521	* be truncated.
	1522	*/
	1523	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
	1524	goto out;
	1525
	1526	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	1527	if (IS_ERR(handle)) {
	1528	ret = PTR_ERR(handle);
	1529	mlog_errno(ret);
	1530	goto out;
	1531	}
	1532
	1533	/*
	1534	* We want to get the byte offset of the end of the 1st cluster.
	1535	*/
	1536	tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
	1537	if (tmpend > end)
	1538	tmpend = end;
	1539
	1540	trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
	1541	(unsigned long long)tmpend);
	1542
	1543	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
	1544	if (ret)
	1545	mlog_errno(ret);
	1546
	1547	if (tmpend < end) {
	1548	/*
	1549	* This may make start and end equal, but the zeroing
	1550	* code will skip any work in that case so there's no
	1551	* need to catch it up here.
	1552	*/
	1553	start = end & ~(osb->s_clustersize - 1);
	1554
	1555	trace_ocfs2_zero_partial_clusters_range2(
	1556	(unsigned long long)start, (unsigned long long)end);
	1557
	1558	ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
	1559	if (ret)
	1560	mlog_errno(ret);
	1561	}
	1562
	1563	ocfs2_commit_trans(osb, handle);
	1564	out:
	1565	return ret;
	1566	}
	1567
	1568	static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
	1569	{
	1570	int i;
	1571	struct ocfs2_extent_rec *rec = NULL;
	1572
	1573	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
	1574
	1575	rec = &el->l_recs[i];
	1576
	1577	if (le32_to_cpu(rec->e_cpos) < pos)
	1578	break;
	1579	}
	1580
	1581	return i;
	1582	}
	1583
	1584	/*
	1585	* Helper to calculate the punching pos and length in one run, we handle the
	1586	* following three cases in order:
	1587	*
	1588	* - remove the entire record
	1589	* - remove a partial record
	1590	* - no record needs to be removed (hole-punching completed)
	1591	*/
	1592	static void ocfs2_calc_trunc_pos(struct inode *inode,
	1593	struct ocfs2_extent_list *el,
	1594	struct ocfs2_extent_rec *rec,
	1595	u32 trunc_start, u32 *trunc_cpos,
	1596	u32 trunc_len, u32 trunc_end,
	1597	u64 blkno, int done)
	1598	{
	1599	int ret = 0;
	1600	u32 coff, range;
	1601
	1602	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	1603
	1604	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
	1605	/*
	1606	* remove an entire extent record.
	1607	*/
	1608	*trunc_cpos = le32_to_cpu(rec->e_cpos);
	1609	/*
	1610	* Skip holes if any.
	1611	*/
	1612	if (range < *trunc_end)
	1613	*trunc_end = range;
	1614	trunc_len = trunc_end - le32_to_cpu(rec->e_cpos);
	1615	*blkno = le64_to_cpu(rec->e_blkno);
	1616	*trunc_end = le32_to_cpu(rec->e_cpos);
	1617	} else if (range > trunc_start) {
	1618	/*
	1619	* remove a partial extent record, which means we're
	1620	* removing the last extent record.
	1621	*/
	1622	*trunc_cpos = trunc_start;
	1623	/*
	1624	* skip hole if any.
	1625	*/
	1626	if (range < *trunc_end)
	1627	*trunc_end = range;
	1628	trunc_len = trunc_end - trunc_start;
	1629	coff = trunc_start - le32_to_cpu(rec->e_cpos);
	1630	*blkno = le64_to_cpu(rec->e_blkno) +
	1631	ocfs2_clusters_to_blocks(inode->i_sb, coff);
	1632	*trunc_end = trunc_start;
	1633	} else {
	1634	/*
	1635	* It may have two following possibilities:
	1636	*
	1637	* - last record has been removed
	1638	* - trunc_start was within a hole
	1639	*
	1640	* both two cases mean the completion of hole punching.
	1641	*/
	1642	ret = 1;
	1643	}
	1644
	1645	*done = ret;
	1646	}
	1647
	1648	static int ocfs2_remove_inode_range(struct inode *inode,
	1649	struct buffer_head *di_bh, u64 byte_start,
	1650	u64 byte_len)
	1651	{
	1652	int ret = 0, flags = 0, done = 0, i;
	1653	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
	1654	u32 cluster_in_el;
	1655	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	1656	struct ocfs2_cached_dealloc_ctxt dealloc;
	1657	struct address_space *mapping = inode->i_mapping;
	1658	struct ocfs2_extent_tree et;
	1659	struct ocfs2_path *path = NULL;
	1660	struct ocfs2_extent_list *el = NULL;
	1661	struct ocfs2_extent_rec *rec = NULL;
	1662	struct ocfs2_dinode di = (struct ocfs2_dinode )di_bh->b_data;
	1663	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
	1664
	1665	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	1666	ocfs2_init_dealloc_ctxt(&dealloc);
	1667
	1668	trace_ocfs2_remove_inode_range(
	1669	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	1670	(unsigned long long)byte_start,
	1671	(unsigned long long)byte_len);
	1672
	1673	if (byte_len == 0)
	1674	return 0;
	1675
	1676	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
	1677	ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
	1678	byte_start + byte_len, 0);
	1679	if (ret) {
	1680	mlog_errno(ret);
	1681	goto out;
	1682	}
	1683	/*
	1684	* There's no need to get fancy with the page cache
	1685	* truncate of an inline-data inode. We're talking
	1686	* about less than a page here, which will be cached
	1687	* in the dinode buffer anyway.
	1688	*/
	1689	unmap_mapping_range(mapping, 0, 0, 0);
	1690	truncate_inode_pages(mapping, 0);
	1691	goto out;
	1692	}
	1693
	1694	/*
	1695	* For reflinks, we may need to CoW 2 clusters which might be
	1696	* partially zero'd later, if hole's start and end offset were
	1697	* within one cluster(means is not exactly aligned to clustersize).
	1698	*/
	1699
	1700	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
	1701
	1702	ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
	1703	if (ret) {
	1704	mlog_errno(ret);
	1705	goto out;
	1706	}
	1707
	1708	ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
	1709	if (ret) {
	1710	mlog_errno(ret);
	1711	goto out;
	1712	}
	1713	}
	1714
	1715	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
	1716	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
	1717	cluster_in_el = trunc_end;
	1718
	1719	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
	1720	if (ret) {
	1721	mlog_errno(ret);
	1722	goto out;
	1723	}
	1724
	1725	path = ocfs2_new_path_from_et(&et);
	1726	if (!path) {
	1727	ret = -ENOMEM;
	1728	mlog_errno(ret);
	1729	goto out;
	1730	}
	1731
	1732	while (trunc_end > trunc_start) {
	1733
	1734	ret = ocfs2_find_path(INODE_CACHE(inode), path,
	1735	cluster_in_el);
	1736	if (ret) {
	1737	mlog_errno(ret);
	1738	goto out;
	1739	}
	1740
	1741	el = path_leaf_el(path);
	1742
	1743	i = ocfs2_find_rec(el, trunc_end);
	1744	/*
	1745	* Need to go to previous extent block.
	1746	*/
	1747	if (i < 0) {
	1748	if (path->p_tree_depth == 0)
	1749	break;
	1750
	1751	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
	1752	path,
	1753	&cluster_in_el);
	1754	if (ret) {
	1755	mlog_errno(ret);
	1756	goto out;
	1757	}
	1758
	1759	/*
	1760	* We've reached the leftmost extent block,
	1761	* it's safe to leave.
	1762	*/
	1763	if (cluster_in_el == 0)
	1764	break;
	1765
	1766	/*
	1767	* The 'pos' searched for previous extent block is
	1768	* always one cluster less than actual trunc_end.
	1769	*/
	1770	trunc_end = cluster_in_el + 1;
	1771
	1772	ocfs2_reinit_path(path, 1);
	1773
	1774	continue;
	1775
	1776	} else
	1777	rec = &el->l_recs[i];
	1778
	1779	ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
	1780	&trunc_len, &trunc_end, &blkno, &done);
	1781	if (done)
	1782	break;
	1783
	1784	flags = rec->e_flags;
	1785	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
	1786
	1787	ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
	1788	phys_cpos, trunc_len, flags,
	1789	&dealloc, refcount_loc);
	1790	if (ret < 0) {
	1791	mlog_errno(ret);
	1792	goto out;
	1793	}
	1794
	1795	cluster_in_el = trunc_end;
	1796
	1797	ocfs2_reinit_path(path, 1);
	1798	}
	1799
	1800	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
	1801
	1802	out:
	1803	ocfs2_schedule_truncate_log_flush(osb, 1);
	1804	ocfs2_run_deallocs(osb, &dealloc);
	1805
	1806	return ret;
	1807	}
	1808
	1809	/*
	1810	* Parts of this function taken from xfs_change_file_space()
	1811	*/
	1812	static int __ocfs2_change_file_space(struct file file, struct inode inode,
	1813	loff_t f_pos, unsigned int cmd,
	1814	struct ocfs2_space_resv *sr,
	1815	int change_size)
	1816	{
	1817	int ret;
	1818	s64 llen;
	1819	loff_t size;
	1820	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	1821	struct buffer_head *di_bh = NULL;
	1822	handle_t *handle;
	1823	unsigned long long max_off = inode->i_sb->s_maxbytes;
	1824
	1825	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
	1826	return -EROFS;
	1827
	1828	mutex_lock(&inode->i_mutex);
	1829
	1830	/*
	1831	* This prevents concurrent writes on other nodes
	1832	*/
	1833	ret = ocfs2_rw_lock(inode, 1);
	1834	if (ret) {
	1835	mlog_errno(ret);
	1836	goto out;
	1837	}
	1838
	1839	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	1840	if (ret) {
	1841	mlog_errno(ret);
	1842	goto out_rw_unlock;
	1843	}
	1844
	1845	if (inode->i_flags & (S_IMMUTABLE\|S_APPEND)) {
	1846	ret = -EPERM;
	1847	goto out_inode_unlock;
	1848	}
	1849
	1850	switch (sr->l_whence) {
	1851	case 0: /SEEK_SET/
	1852	break;
	1853	case 1: /SEEK_CUR/
	1854	sr->l_start += f_pos;
	1855	break;
	1856	case 2: /SEEK_END/
	1857	sr->l_start += i_size_read(inode);
	1858	break;
	1859	default:
	1860	ret = -EINVAL;
	1861	goto out_inode_unlock;
	1862	}
	1863	sr->l_whence = 0;
	1864
	1865	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
	1866
	1867	if (sr->l_start < 0
	1868	\|\| sr->l_start > max_off
	1869	\|\| (sr->l_start + llen) < 0
	1870	\|\| (sr->l_start + llen) > max_off) {
	1871	ret = -EINVAL;
	1872	goto out_inode_unlock;
	1873	}
	1874	size = sr->l_start + sr->l_len;
	1875
	1876	if (cmd == OCFS2_IOC_RESVSP \|\| cmd == OCFS2_IOC_RESVSP64) {
	1877	if (sr->l_len <= 0) {
	1878	ret = -EINVAL;
	1879	goto out_inode_unlock;
	1880	}
	1881	}
	1882
	1883	if (file && should_remove_suid(file->f_path.dentry)) {
	1884	ret = __ocfs2_write_remove_suid(inode, di_bh);
	1885	if (ret) {
	1886	mlog_errno(ret);
	1887	goto out_inode_unlock;
	1888	}
	1889	}
	1890
	1891	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	1892	switch (cmd) {
	1893	case OCFS2_IOC_RESVSP:
	1894	case OCFS2_IOC_RESVSP64:
	1895	/*
	1896	* This takes unsigned offsets, but the signed ones we
	1897	* pass have been checked against overflow above.
	1898	*/
	1899	ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
	1900	sr->l_len);
	1901	break;
	1902	case OCFS2_IOC_UNRESVSP:
	1903	case OCFS2_IOC_UNRESVSP64:
	1904	ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
	1905	sr->l_len);
	1906	break;
	1907	default:
	1908	ret = -EINVAL;
	1909	}
	1910	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	1911	if (ret) {
	1912	mlog_errno(ret);
	1913	goto out_inode_unlock;
	1914	}
	1915
	1916	/*
	1917	* We update c/mtime for these changes
	1918	*/
	1919	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	1920	if (IS_ERR(handle)) {
	1921	ret = PTR_ERR(handle);
	1922	mlog_errno(ret);
	1923	goto out_inode_unlock;
	1924	}
	1925
	1926	if (change_size && i_size_read(inode) < size)
	1927	i_size_write(inode, size);
	1928
	1929	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	1930	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
	1931	if (ret < 0)
	1932	mlog_errno(ret);
	1933
	1934	if (file && (file->f_flags & O_SYNC))
	1935	handle->h_sync = 1;
	1936
	1937	ocfs2_commit_trans(osb, handle);
	1938
	1939	out_inode_unlock:
	1940	brelse(di_bh);
	1941	ocfs2_inode_unlock(inode, 1);
	1942	out_rw_unlock:
	1943	ocfs2_rw_unlock(inode, 1);
	1944
	1945	out:
	1946	mutex_unlock(&inode->i_mutex);
	1947	return ret;
	1948	}
	1949
	1950	int ocfs2_change_file_space(struct file *file, unsigned int cmd,
	1951	struct ocfs2_space_resv *sr)
	1952	{
	1953	struct inode *inode = file_inode(file);
	1954	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	1955	int ret;
	1956
	1957	if ((cmd == OCFS2_IOC_RESVSP \|\| cmd == OCFS2_IOC_RESVSP64) &&
	1958	!ocfs2_writes_unwritten_extents(osb))
	1959	return -ENOTTY;
	1960	else if ((cmd == OCFS2_IOC_UNRESVSP \|\| cmd == OCFS2_IOC_UNRESVSP64) &&
	1961	!ocfs2_sparse_alloc(osb))
	1962	return -ENOTTY;
	1963
	1964	if (!S_ISREG(inode->i_mode))
	1965	return -EINVAL;
	1966
	1967	if (!(file->f_mode & FMODE_WRITE))
	1968	return -EBADF;
	1969
	1970	ret = mnt_want_write_file(file);
	1971	if (ret)
	1972	return ret;
	1973	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
	1974	mnt_drop_write_file(file);
	1975	return ret;
	1976	}
	1977
	1978	static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
	1979	loff_t len)
	1980	{
	1981	struct inode *inode = file_inode(file);
	1982	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	1983	struct ocfs2_space_resv sr;
	1984	int change_size = 1;
	1985	int cmd = OCFS2_IOC_RESVSP64;
	1986
	1987	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
	1988	return -EOPNOTSUPP;
	1989	if (!ocfs2_writes_unwritten_extents(osb))
	1990	return -EOPNOTSUPP;
	1991
	1992	if (mode & FALLOC_FL_KEEP_SIZE)
	1993	change_size = 0;
	1994
	1995	if (mode & FALLOC_FL_PUNCH_HOLE)
	1996	cmd = OCFS2_IOC_UNRESVSP64;
	1997
	1998	sr.l_whence = 0;
	1999	sr.l_start = (s64)offset;
	2000	sr.l_len = (s64)len;
	2001
	2002	return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
	2003	change_size);
	2004	}
	2005
	2006	int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
	2007	size_t count)
	2008	{
	2009	int ret = 0;
	2010	unsigned int extent_flags;
	2011	u32 cpos, clusters, extent_len, phys_cpos;
	2012	struct super_block *sb = inode->i_sb;
	2013
	2014	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) \|\|
	2015	!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) \|\|
	2016	OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
	2017	return 0;
	2018
	2019	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
	2020	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
	2021
	2022	while (clusters) {
	2023	ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
	2024	&extent_flags);
	2025	if (ret < 0) {
	2026	mlog_errno(ret);
	2027	goto out;
	2028	}
	2029
	2030	if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
	2031	ret = 1;
	2032	break;
	2033	}
	2034
	2035	if (extent_len > clusters)
	2036	extent_len = clusters;
	2037
	2038	clusters -= extent_len;
	2039	cpos += extent_len;
	2040	}
	2041	out:
	2042	return ret;
	2043	}
	2044
	2045	static void ocfs2_aiodio_wait(struct inode *inode)
	2046	{
	2047	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
	2048
	2049	wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
	2050	}
	2051
	2052	static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
	2053	{
	2054	int blockmask = inode->i_sb->s_blocksize - 1;
	2055	loff_t final_size = pos + count;
	2056
	2057	if ((pos & blockmask) \|\| (final_size & blockmask))
	2058	return 1;
	2059	return 0;
	2060	}
	2061
	2062	static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
	2063	struct file *file,
	2064	loff_t pos, size_t count,
	2065	int *meta_level)
	2066	{
	2067	int ret;
	2068	struct buffer_head *di_bh = NULL;
	2069	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	2070	u32 clusters =
	2071	ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
	2072
	2073	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	2074	if (ret) {
	2075	mlog_errno(ret);
	2076	goto out;
	2077	}
	2078
	2079	*meta_level = 1;
	2080
	2081	ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
	2082	if (ret)
	2083	mlog_errno(ret);
	2084	out:
	2085	brelse(di_bh);
	2086	return ret;
	2087	}
	2088
	2089	static int ocfs2_prepare_inode_for_write(struct file *file,
	2090	loff_t *ppos,
	2091	size_t count,
	2092	int appending,
	2093	int *direct_io,
	2094	int *has_refcount)
	2095	{
	2096	int ret = 0, meta_level = 0;
	2097	struct dentry *dentry = file->f_path.dentry;
	2098	struct inode *inode = dentry->d_inode;
	2099	loff_t saved_pos = 0, end;
	2100
	2101	/*
	2102	* We start with a read level meta lock and only jump to an ex
	2103	* if we need to make modifications here.
	2104	*/
	2105	for(;;) {
	2106	ret = ocfs2_inode_lock(inode, NULL, meta_level);
	2107	if (ret < 0) {
	2108	meta_level = -1;
	2109	mlog_errno(ret);
	2110	goto out;
	2111	}
	2112
	2113	/* Clear suid / sgid if necessary. We do this here
	2114	* instead of later in the write path because
	2115	* remove_suid() calls ->setattr without any hint that
	2116	* we may have already done our cluster locking. Since
	2117	* ocfs2_setattr() must take cluster locks to
	2118	* proceed, this will lead us to recursively lock the
	2119	* inode. There's also the dinode i_size state which
	2120	* can be lost via setattr during extending writes (we
	2121	* set inode->i_size at the end of a write. */
	2122	if (should_remove_suid(dentry)) {
	2123	if (meta_level == 0) {
	2124	ocfs2_inode_unlock(inode, meta_level);
	2125	meta_level = 1;
	2126	continue;
	2127	}
	2128
	2129	ret = ocfs2_write_remove_suid(inode);
	2130	if (ret < 0) {
	2131	mlog_errno(ret);
	2132	goto out_unlock;
	2133	}
	2134	}
	2135
	2136	/* work on a copy of ppos until we're sure that we won't have
	2137	* to recalculate it due to relocking. */
	2138	if (appending)
	2139	saved_pos = i_size_read(inode);
	2140	else
	2141	saved_pos = *ppos;
	2142
	2143	end = saved_pos + count;
	2144
	2145	ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
	2146	if (ret == 1) {
	2147	ocfs2_inode_unlock(inode, meta_level);
	2148	meta_level = -1;
	2149
	2150	ret = ocfs2_prepare_inode_for_refcount(inode,
	2151	file,
	2152	saved_pos,
	2153	count,
	2154	&meta_level);
	2155	if (has_refcount)
	2156	*has_refcount = 1;
	2157	if (direct_io)
	2158	*direct_io = 0;
	2159	}
	2160
	2161	if (ret < 0) {
	2162	mlog_errno(ret);
	2163	goto out_unlock;
	2164	}
	2165
	2166	/*
	2167	* Skip the O_DIRECT checks if we don't need
	2168	* them.
	2169	*/
	2170	if (!direct_io \|\| !(*direct_io))
	2171	break;
	2172
	2173	/*
	2174	* There's no sane way to do direct writes to an inode
	2175	* with inline data.
	2176	*/
	2177	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
	2178	*direct_io = 0;
	2179	break;
	2180	}
	2181
	2182	/*
	2183	* Allowing concurrent direct writes means
	2184	* i_size changes wouldn't be synchronized, so
	2185	* one node could wind up truncating another
	2186	* nodes writes.
	2187	*/
	2188	if (end > i_size_read(inode)) {
	2189	*direct_io = 0;
	2190	break;
	2191	}
	2192
	2193	/*
	2194	* We don't fill holes during direct io, so
	2195	* check for them here. If any are found, the
	2196	* caller will have to retake some cluster
	2197	* locks and initiate the io as buffered.
	2198	*/
	2199	ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
	2200	if (ret == 1) {
	2201	*direct_io = 0;
	2202	ret = 0;
	2203	} else if (ret < 0)
	2204	mlog_errno(ret);
	2205	break;
	2206	}
	2207
	2208	if (appending)
	2209	*ppos = saved_pos;
	2210
	2211	out_unlock:
	2212	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
	2213	saved_pos, appending, count,
	2214	direct_io, has_refcount);
	2215
	2216	if (meta_level >= 0)
	2217	ocfs2_inode_unlock(inode, meta_level);
	2218
	2219	out:
	2220	return ret;
	2221	}
	2222
	2223	static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
	2224	const struct iovec *iov,
	2225	unsigned long nr_segs,
	2226	loff_t pos)
	2227	{
	2228	int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
	2229	int can_do_direct, has_refcount = 0;
	2230	ssize_t written = 0;
	2231	size_t ocount; /* original count */
	2232	size_t count; /* after file limit checks */
	2233	loff_t old_size, *ppos = &iocb->ki_pos;
	2234	u32 old_clusters;
	2235	struct file *file = iocb->ki_filp;
	2236	struct inode *inode = file_inode(file);
	2237	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	2238	int full_coherency = !(osb->s_mount_opt &
	2239	OCFS2_MOUNT_COHERENCY_BUFFERED);
	2240	int unaligned_dio = 0;
	2241
	2242	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
	2243	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	2244	file->f_path.dentry->d_name.len,
	2245	file->f_path.dentry->d_name.name,
	2246	(unsigned int)nr_segs);
	2247
	2248	if (iocb->ki_left == 0)
	2249	return 0;
	2250
	2251	appending = file->f_flags & O_APPEND ? 1 : 0;
	2252	direct_io = file->f_flags & O_DIRECT ? 1 : 0;
	2253
	2254	mutex_lock(&inode->i_mutex);
	2255
	2256	ocfs2_iocb_clear_sem_locked(iocb);
	2257
	2258	relock:
	2259	/* to match setattr's i_mutex -> rw_lock ordering */
	2260	if (direct_io) {
	2261	have_alloc_sem = 1;
	2262	/* communicate with ocfs2_dio_end_io */
	2263	ocfs2_iocb_set_sem_locked(iocb);
	2264	}
	2265
	2266	/*
	2267	* Concurrent O_DIRECT writes are allowed with
	2268	* mount_option "coherency=buffered".
	2269	*/
	2270	rw_level = (!direct_io \|\| full_coherency);
	2271
	2272	ret = ocfs2_rw_lock(inode, rw_level);
	2273	if (ret < 0) {
	2274	mlog_errno(ret);
	2275	goto out_sems;
	2276	}
	2277
	2278	/*
	2279	* O_DIRECT writes with "coherency=full" need to take EX cluster
	2280	* inode_lock to guarantee coherency.
	2281	*/
	2282	if (direct_io && full_coherency) {
	2283	/*
	2284	* We need to take and drop the inode lock to force
	2285	* other nodes to drop their caches. Buffered I/O
	2286	* already does this in write_begin().
	2287	*/
	2288	ret = ocfs2_inode_lock(inode, NULL, 1);
	2289	if (ret < 0) {
	2290	mlog_errno(ret);
	2291	goto out_sems;
	2292	}
	2293
	2294	ocfs2_inode_unlock(inode, 1);
	2295	}
	2296
	2297	can_do_direct = direct_io;
	2298	ret = ocfs2_prepare_inode_for_write(file, ppos,
	2299	iocb->ki_left, appending,
	2300	&can_do_direct, &has_refcount);
	2301	if (ret < 0) {
	2302	mlog_errno(ret);
	2303	goto out;
	2304	}
	2305
	2306	if (direct_io && !is_sync_kiocb(iocb))
	2307	unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
	2308	*ppos);
	2309
	2310	/*
	2311	* We can't complete the direct I/O as requested, fall back to
	2312	* buffered I/O.
	2313	*/
	2314	if (direct_io && !can_do_direct) {
	2315	ocfs2_rw_unlock(inode, rw_level);
	2316
	2317	have_alloc_sem = 0;
	2318	rw_level = -1;
	2319
	2320	direct_io = 0;
	2321	goto relock;
	2322	}
	2323
	2324	if (unaligned_dio) {
	2325	/*
	2326	* Wait on previous unaligned aio to complete before
	2327	* proceeding.
	2328	*/
	2329	ocfs2_aiodio_wait(inode);
	2330
	2331	/* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
	2332	atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
	2333	ocfs2_iocb_set_unaligned_aio(iocb);
	2334	}
	2335
	2336	/*
	2337	* To later detect whether a journal commit for sync writes is
	2338	* necessary, we sample i_size, and cluster count here.
	2339	*/
	2340	old_size = i_size_read(inode);
	2341	old_clusters = OCFS2_I(inode)->ip_clusters;
	2342
	2343	/* communicate with ocfs2_dio_end_io */
	2344	ocfs2_iocb_set_rw_locked(iocb, rw_level);
	2345
	2346	ret = generic_segment_checks(iov, &nr_segs, &ocount,
	2347	VERIFY_READ);
	2348	if (ret)
	2349	goto out_dio;
	2350
	2351	count = ocount;
	2352	ret = generic_write_checks(file, ppos, &count,
	2353	S_ISBLK(inode->i_mode));
	2354	if (ret)
	2355	goto out_dio;
	2356
	2357	if (direct_io) {
	2358	written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
	2359	ppos, count, ocount);
	2360	if (written < 0) {
	2361	ret = written;
	2362	goto out_dio;
	2363	}
	2364	} else {
	2365	current->backing_dev_info = file->f_mapping->backing_dev_info;
	2366	written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
	2367	ppos, count, 0);
	2368	current->backing_dev_info = NULL;
	2369	}
	2370
	2371	out_dio:
	2372	/* buffered aio wouldn't have proper lock coverage today */
	2373	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
	2374
	2375	if (((file->f_flags & O_DSYNC) && !direct_io) \|\| IS_SYNC(inode) \|\|
	2376	((file->f_flags & O_DIRECT) && !direct_io)) {
	2377	ret = filemap_fdatawrite_range(file->f_mapping, pos,
	2378	pos + count - 1);
	2379	if (ret < 0)
	2380	written = ret;
	2381
	2382	if (!ret && ((old_size != i_size_read(inode)) \|\|
	2383	(old_clusters != OCFS2_I(inode)->ip_clusters) \|\|
	2384	has_refcount)) {
	2385	ret = jbd2_journal_force_commit(osb->journal->j_journal);
	2386	if (ret < 0)
	2387	written = ret;
	2388	}
	2389
	2390	if (!ret)
	2391	ret = filemap_fdatawait_range(file->f_mapping, pos,
	2392	pos + count - 1);
	2393	}
	2394
	2395	/*
	2396	* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
	2397	* function pointer which is called when o_direct io completes so that
	2398	* it can unlock our rw lock.
	2399	* Unfortunately there are error cases which call end_io and others
	2400	* that don't. so we don't have to unlock the rw_lock if either an
	2401	* async dio is going to do it in the future or an end_io after an
	2402	* error has already done it.
	2403	*/
	2404	if ((ret == -EIOCBQUEUED) \|\| (!ocfs2_iocb_is_rw_locked(iocb))) {
	2405	rw_level = -1;
	2406	have_alloc_sem = 0;
	2407	unaligned_dio = 0;
	2408	}
	2409
	2410	if (unaligned_dio) {
	2411	ocfs2_iocb_clear_unaligned_aio(iocb);
	2412	atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
	2413	}
	2414
	2415	out:
	2416	if (rw_level != -1)
	2417	ocfs2_rw_unlock(inode, rw_level);
	2418
	2419	out_sems:
	2420	if (have_alloc_sem)
	2421	ocfs2_iocb_clear_sem_locked(iocb);
	2422
	2423	mutex_unlock(&inode->i_mutex);
	2424
	2425	if (written)
	2426	ret = written;
	2427	return ret;
	2428	}
	2429
	2430	static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
	2431	struct file *out,
	2432	struct splice_desc *sd)
	2433	{
	2434	int ret;
	2435
	2436	ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
	2437	sd->total_len, 0, NULL, NULL);
	2438	if (ret < 0) {
	2439	mlog_errno(ret);
	2440	return ret;
	2441	}
	2442
	2443	return splice_from_pipe_feed(pipe, sd, pipe_to_file);
	2444	}
	2445
	2446	static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
	2447	struct file *out,
	2448	loff_t *ppos,
	2449	size_t len,
	2450	unsigned int flags)
	2451	{
	2452	int ret;
	2453	struct address_space *mapping = out->f_mapping;
	2454	struct inode *inode = mapping->host;
	2455	struct splice_desc sd = {
	2456	.total_len = len,
	2457	.flags = flags,
	2458	.pos = *ppos,
	2459	.u.file = out,
	2460	};
	2461
	2462
	2463	trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
	2464	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	2465	out->f_path.dentry->d_name.len,
	2466	out->f_path.dentry->d_name.name, len);
	2467
	2468	pipe_lock(pipe);
	2469
	2470	splice_from_pipe_begin(&sd);
	2471	do {
	2472	ret = splice_from_pipe_next(pipe, &sd);
	2473	if (ret <= 0)
	2474	break;
	2475
	2476	mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
	2477	ret = ocfs2_rw_lock(inode, 1);
	2478	if (ret < 0)
	2479	mlog_errno(ret);
	2480	else {
	2481	ret = ocfs2_splice_to_file(pipe, out, &sd);
	2482	ocfs2_rw_unlock(inode, 1);
	2483	}
	2484	mutex_unlock(&inode->i_mutex);
	2485	} while (ret > 0);
	2486	splice_from_pipe_end(pipe, &sd);
	2487
	2488	pipe_unlock(pipe);
	2489
	2490	if (sd.num_spliced)
	2491	ret = sd.num_spliced;
	2492
	2493	if (ret > 0) {
	2494	int err;
	2495
	2496	err = generic_write_sync(out, *ppos, ret);
	2497	if (err)
	2498	ret = err;
	2499	else
	2500	*ppos += ret;
	2501
	2502	balance_dirty_pages_ratelimited(mapping);
	2503	}
	2504
	2505	return ret;
	2506	}
	2507
	2508	static ssize_t ocfs2_file_splice_read(struct file *in,
	2509	loff_t *ppos,
	2510	struct pipe_inode_info *pipe,
	2511	size_t len,
	2512	unsigned int flags)
	2513	{
	2514	int ret = 0, lock_level = 0;
	2515	struct inode *inode = file_inode(in);
	2516
	2517	trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
	2518	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	2519	in->f_path.dentry->d_name.len,
	2520	in->f_path.dentry->d_name.name, len);
	2521
	2522	/*
	2523	* See the comment in ocfs2_file_aio_read()
	2524	*/
	2525	ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
	2526	if (ret < 0) {
	2527	mlog_errno(ret);
	2528	goto bail;
	2529	}
	2530	ocfs2_inode_unlock(inode, lock_level);
	2531
	2532	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
	2533
	2534	bail:
	2535	return ret;
	2536	}
	2537
	2538	static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
	2539	const struct iovec *iov,
	2540	unsigned long nr_segs,
	2541	loff_t pos)
	2542	{
	2543	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
	2544	struct file *filp = iocb->ki_filp;
	2545	struct inode *inode = file_inode(filp);
	2546
	2547	trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
	2548	(unsigned long long)OCFS2_I(inode)->ip_blkno,
	2549	filp->f_path.dentry->d_name.len,
	2550	filp->f_path.dentry->d_name.name, nr_segs);
	2551
	2552
	2553	if (!inode) {
	2554	ret = -EINVAL;
	2555	mlog_errno(ret);
	2556	goto bail;
	2557	}
	2558
	2559	ocfs2_iocb_clear_sem_locked(iocb);
	2560
	2561	/*
	2562	* buffered reads protect themselves in ->readpage(). O_DIRECT reads
	2563	* need locks to protect pending reads from racing with truncate.
	2564	*/
	2565	if (filp->f_flags & O_DIRECT) {
	2566	have_alloc_sem = 1;
	2567	ocfs2_iocb_set_sem_locked(iocb);
	2568
	2569	ret = ocfs2_rw_lock(inode, 0);
	2570	if (ret < 0) {
	2571	mlog_errno(ret);
	2572	goto bail;
	2573	}
	2574	rw_level = 0;
	2575	/* communicate with ocfs2_dio_end_io */
	2576	ocfs2_iocb_set_rw_locked(iocb, rw_level);
	2577	}
	2578
	2579	/*
	2580	* We're fine letting folks race truncates and extending
	2581	* writes with read across the cluster, just like they can
	2582	* locally. Hence no rw_lock during read.
	2583	*
	2584	* Take and drop the meta data lock to update inode fields
	2585	* like i_size. This allows the checks down below
	2586	* generic_file_aio_read() a chance of actually working.
	2587	*/
	2588	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
	2589	if (ret < 0) {
	2590	mlog_errno(ret);
	2591	goto bail;
	2592	}
	2593	ocfs2_inode_unlock(inode, lock_level);
	2594
	2595	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
	2596	trace_generic_file_aio_read_ret(ret);
	2597
	2598	/* buffered aio wouldn't have proper lock coverage today */
	2599	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
	2600
	2601	/* see ocfs2_file_aio_write */
	2602	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
	2603	rw_level = -1;
	2604	have_alloc_sem = 0;
	2605	}
	2606
	2607	bail:
	2608	if (have_alloc_sem)
	2609	ocfs2_iocb_clear_sem_locked(iocb);
	2610
	2611	if (rw_level != -1)
	2612	ocfs2_rw_unlock(inode, rw_level);
	2613
	2614	return ret;
	2615	}
	2616
	2617	/* Refer generic_file_llseek_unlocked() */
	2618	static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
	2619	{
	2620	struct inode *inode = file->f_mapping->host;
	2621	int ret = 0;
	2622
	2623	mutex_lock(&inode->i_mutex);
	2624
	2625	switch (whence) {
	2626	case SEEK_SET:
	2627	break;
	2628	case SEEK_END:
	2629	offset += inode->i_size;
	2630	break;
	2631	case SEEK_CUR:
	2632	if (offset == 0) {
	2633	offset = file->f_pos;
	2634	goto out;
	2635	}
	2636	offset += file->f_pos;
	2637	break;
	2638	case SEEK_DATA:
	2639	case SEEK_HOLE:
	2640	ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
	2641	if (ret)
	2642	goto out;
	2643	break;
	2644	default:
	2645	ret = -EINVAL;
	2646	goto out;
	2647	}
	2648
	2649	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
	2650	ret = -EINVAL;
	2651	if (!ret && offset > inode->i_sb->s_maxbytes)
	2652	ret = -EINVAL;
	2653	if (ret)
	2654	goto out;
	2655
	2656	if (offset != file->f_pos) {
	2657	file->f_pos = offset;
	2658	file->f_version = 0;
	2659	}
	2660
	2661	out:
	2662	mutex_unlock(&inode->i_mutex);
	2663	if (ret)
	2664	return ret;
	2665	return offset;
	2666	}
	2667
	2668	const struct inode_operations ocfs2_file_iops = {
	2669	.setattr = ocfs2_setattr,
	2670	.getattr = ocfs2_getattr,
	2671	.permission = ocfs2_permission,
	2672	.setxattr = generic_setxattr,
	2673	.getxattr = generic_getxattr,
	2674	.listxattr = ocfs2_listxattr,
	2675	.removexattr = generic_removexattr,
	2676	.fiemap = ocfs2_fiemap,
	2677	.get_acl = ocfs2_iop_get_acl,
	2678	};
	2679
	2680	const struct inode_operations ocfs2_special_file_iops = {
	2681	.setattr = ocfs2_setattr,
	2682	.getattr = ocfs2_getattr,
	2683	.permission = ocfs2_permission,
	2684	.get_acl = ocfs2_iop_get_acl,
	2685	};
	2686
	2687	/*
	2688	* Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
	2689	* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
	2690	*/
	2691	const struct file_operations ocfs2_fops = {
	2692	.llseek = ocfs2_file_llseek,
	2693	.read = do_sync_read,
	2694	.write = do_sync_write,
	2695	.mmap = ocfs2_mmap,
	2696	.fsync = ocfs2_sync_file,
	2697	.release = ocfs2_file_release,
	2698	.open = ocfs2_file_open,
	2699	.aio_read = ocfs2_file_aio_read,
	2700	.aio_write = ocfs2_file_aio_write,
	2701	.unlocked_ioctl = ocfs2_ioctl,
	2702	#ifdef CONFIG_COMPAT
	2703	.compat_ioctl = ocfs2_compat_ioctl,
	2704	#endif
	2705	.lock = ocfs2_lock,
	2706	.flock = ocfs2_flock,
	2707	.splice_read = ocfs2_file_splice_read,
	2708	.splice_write = ocfs2_file_splice_write,
	2709	.fallocate = ocfs2_fallocate,
	2710	};
	2711
	2712	const struct file_operations ocfs2_dops = {
	2713	.llseek = generic_file_llseek,
	2714	.read = generic_read_dir,
	2715	.readdir = ocfs2_readdir,
	2716	.fsync = ocfs2_sync_file,
	2717	.release = ocfs2_dir_release,
	2718	.open = ocfs2_dir_open,
	2719	.unlocked_ioctl = ocfs2_ioctl,
	2720	#ifdef CONFIG_COMPAT
	2721	.compat_ioctl = ocfs2_compat_ioctl,
	2722	#endif
	2723	.lock = ocfs2_lock,
	2724	.flock = ocfs2_flock,
	2725	};
	2726
	2727	/*
	2728	* POSIX-lockless variants of our file_operations.
	2729	*
	2730	* These will be used if the underlying cluster stack does not support
	2731	* posix file locking, if the user passes the "localflocks" mount
	2732	* option, or if we have a local-only fs.
	2733	*
	2734	* ocfs2_flock is in here because all stacks handle UNIX file locks,
	2735	* so we still want it in the case of no stack support for
	2736	* plocks. Internally, it will do the right thing when asked to ignore
	2737	* the cluster.
	2738	*/
	2739	const struct file_operations ocfs2_fops_no_plocks = {
	2740	.llseek = ocfs2_file_llseek,
	2741	.read = do_sync_read,
	2742	.write = do_sync_write,
	2743	.mmap = ocfs2_mmap,
	2744	.fsync = ocfs2_sync_file,
	2745	.release = ocfs2_file_release,
	2746	.open = ocfs2_file_open,
	2747	.aio_read = ocfs2_file_aio_read,
	2748	.aio_write = ocfs2_file_aio_write,
	2749	.unlocked_ioctl = ocfs2_ioctl,
	2750	#ifdef CONFIG_COMPAT
	2751	.compat_ioctl = ocfs2_compat_ioctl,
	2752	#endif
	2753	.flock = ocfs2_flock,
	2754	.splice_read = ocfs2_file_splice_read,
	2755	.splice_write = ocfs2_file_splice_write,
	2756	.fallocate = ocfs2_fallocate,
	2757	};
	2758
	2759	const struct file_operations ocfs2_dops_no_plocks = {
	2760	.llseek = generic_file_llseek,
	2761	.read = generic_read_dir,
	2762	.readdir = ocfs2_readdir,
	2763	.fsync = ocfs2_sync_file,
	2764	.release = ocfs2_dir_release,
	2765	.open = ocfs2_dir_open,
	2766	.unlocked_ioctl = ocfs2_ioctl,
	2767	#ifdef CONFIG_COMPAT
	2768	.compat_ioctl = ocfs2_compat_ioctl,
	2769	#endif
	2770	.flock = ocfs2_flock,
	2771	};