* to the work-to-be schedule is freed.
*
* Thus we need to keep the io structure still valid here after
- * convertion finished. The io structure has a flag to
+ * conversion finished. The io structure has a flag to
* avoid double converting from both fsync and background work
* queue work.
*/
* the parent directory's parent as well, and so on recursively, if
* they are also freshly created.
*/
- static void ext4_sync_parent(struct inode *inode)
+ static int ext4_sync_parent(struct inode *inode)
{
+ struct writeback_control wbc;
struct dentry *dentry = NULL;
+ int ret = 0;
while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
break;
inode = dentry->d_parent->d_inode;
- sync_mapping_buffers(inode->i_mapping);
+ ret = sync_mapping_buffers(inode->i_mapping);
+ if (ret)
+ break;
+ memset(&wbc, 0, sizeof(wbc));
+ wbc.sync_mode = WB_SYNC_ALL;
+ wbc.nr_to_write = 0; /* only write out the inode */
+ ret = sync_inode(inode, &wbc);
+ if (ret)
+ break;
}
+ return ret;
}
/*
if (!journal) {
ret = generic_file_fsync(file, datasync);
if (!ret && !list_empty(&inode->i_dentry))
- ext4_sync_parent(inode);
+ ret = ext4_sync_parent(inode);
goto out;
}
* for partial write.
*/
set_buffer_new(bh);
+ set_buffer_mapped(bh);
}
return 0;
}
* because we should have holes filled from ext4_page_mkwrite(). We don't even
* need to file the inode to the transaction's list in ordered mode because if
* we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
+ * we are writing back data modified via mmap(), no one guarantees in which
* transaction the data will hit the disk. In case we are journaling data, we
* cannot start transaction directly because transaction start ranks above page
* lock so we have to do some magic.
/*
* This is called via ext4_da_writepages() to
- * calulate the total number of credits to reserve to fit
+ * calculate the total number of credits to reserve to fit
* a single extent allocation into a single transaction,
* ext4_da_writepages() will loop calling this before
* the block allocation.
* the pages by calling redirty_page_for_writepage() but that
* would be ugly in the extreme. So instead we would need to
* replicate parts of the code in the above functions,
- * simplifying them becuase we wouldn't actually intend to
+ * simplifying them because we wouldn't actually intend to
* write out the pages, but rather only collect contiguous
* logical block extents, call the multi-block allocator, and
* then update the buffer heads with the block allocations.
*
* The unwritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still be pending when we return, we
- * set up an end_io call back function, which will do the convertion
+ * set up an end_io call back function, which will do the conversion
* when async direct IO completed.
*
* If the O_DIRECT write will extend the file then add this inode to the
* We could direct write to holes and fallocate.
*
* Allocated blocks to fill the hole are marked as uninitialized
- * to prevent paralel buffered read to expose the stale data
+ * to prevent a parallel buffered read from exposing the stale data
* before DIO completes the data IO.
*
* As to previously fallocated extents, ext4 get_block
int err;
/*
* for non AIO case, since the IO is already
- * completed, we could do the convertion right here
+ * completed, we could do the conversion right here
*/
err = ext4_convert_unwritten_extents(inode,
offset, ret);
*
* When we do truncate() we may have to clean the ends of several
* indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is refered
+ * partially truncated if some data below the new i_size is referred
* from it (and it is on the path to the first completely truncated
* data block, indeed). We have to free the top of that path along
* with everything to the right of the path. Since no allocation
* @first: array of block numbers
* @last: points immediately past the end of array
*
- * We are freeing all blocks refered from that array (numbers are stored as
+ * We are freeing all blocks referred from that array (numbers are stored as
* little-endian 32-bit) and updating @inode->i_blocks appropriately.
*
* We accumulate contiguous runs of blocks to free. Conveniently, if these
* @last: pointer immediately past the end of array
* @depth: depth of the branches to free
*
- * We are freeing all blocks refered from these branches (numbers are
+ * We are freeing all blocks referred from these branches (numbers are
* stored as little-endian 32-bit) and updating @inode->i_blocks
* appropriately.
*/
Indirect chain[4];
Indirect *partial;
__le32 nr = 0;
- int n;
- ext4_lblk_t last_block;
+ int n = 0;
+ ext4_lblk_t last_block, max_block;
unsigned blocksize = inode->i_sb->s_blocksize;
trace_ext4_truncate_enter(inode);
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+ max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
if (inode->i_size & (blocksize - 1))
if (ext4_block_truncate_page(handle, mapping, inode->i_size))
goto out_stop;
- n = ext4_block_to_path(inode, last_block, offsets, NULL);
- if (n == 0)
- goto out_stop; /* error */
+ if (last_block != max_block) {
+ n = ext4_block_to_path(inode, last_block, offsets, NULL);
+ if (n == 0)
+ goto out_stop; /* error */
+ }
/*
* OK. This truncate is going to happen. We add the inode to the
*/
ei->i_disksize = inode->i_size;
- if (n == 1) { /* direct blocks */
+ if (last_block == max_block) {
+ /*
+ * It is unnecessary to free any data blocks if last_block is
+ * equal to the indirect block limit.
+ */
+ goto out_unlock;
+ } else if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
goto do_indirects;
;
}
+ out_unlock:
up_write(&ei->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
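A standalone sketch (not part of the patch) of the arithmetic behind the new check, assuming a 4096-byte block size and an arbitrary placeholder for s_bitmap_maxbytes: when the truncate target sits exactly at the indirect-format limit, last_block and max_block round up to the same index, so there are no data blocks past i_size left to walk and ext4_block_to_path() can safely be skipped.

#include <stdio.h>

int main(void)
{
	unsigned long long blocksize = 4096;			/* assumed block size */
	unsigned blkbits = 12;					/* log2(blocksize) */
	unsigned long long bitmap_maxbytes = 1ULL << 36;	/* placeholder limit, not ext4's real value */
	unsigned long long i_size = bitmap_maxbytes;		/* truncating a file at the limit */

	unsigned long long last_block = (i_size + blocksize - 1) >> blkbits;
	unsigned long long max_block  = (bitmap_maxbytes + blocksize - 1) >> blkbits;

	if (last_block == max_block)
		printf("last_block == max_block (%llu): nothing past i_size to free\n",
		       last_block);
	return 0;
}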
/* if nrblocks are contiguous */
if (chunk) {
/*
- * With N contiguous data blocks, it need at most
- * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
- * 2 dindirect blocks
- * 1 tindirect block
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
*/
- indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
- return indirects + 3;
+ return DIV_ROUND_UP(nrblocks,
+ EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
/*
* if nrblocks are not contiguous, worst case, each block touches
}
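A quick standalone check of the bound above (not part of the patch), assuming 4096-byte blocks so that one indirect block holds 1024 block numbers:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned addr_per_block = 1024;		/* 4096-byte block / 4-byte block number */
	unsigned nrblocks = 2048;		/* one contiguous chunk of data blocks */

	/*
	 * ceil(2048/1024) = 2 indirect blocks, plus 1 more in case the run
	 * straddles an indirect-block boundary, plus 2 dindirect and
	 * 1 tindirect: 2 + 4 = 6 metadata blocks in the worst case.
	 */
	unsigned meta_blocks = DIV_ROUND_UP(nrblocks, addr_per_block) + 4;

	printf("worst-case metadata blocks for %u contiguous data blocks: %u\n",
	       nrblocks, meta_blocks);
	return 0;
}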
/*
- * Calulate the total number of credits to reserve to fit
+ * Calculate the total number of credits to reserve to fit
* the modification of a single page into a single transaction,
* which may include multiple chunks of block allocations.
*
* journal_end calls result in the superblock being marked dirty, so
* that sync() will call the filesystem's write_super callback if
* appropriate.
+ *
+ * To avoid holding j_barrier across a userspace-initiated freeze(),
+ * ext4 instead prevents new handles from being started via s_frozen,
+ * which is maintained in an upper layer.
*/
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
{
journal_t *journal;
+ handle_t *handle;
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
- vfs_check_frozen(sb, SB_FREEZE_TRANS);
- /* Special case here: if the journal has aborted behind our
- * backs (eg. EIO in the commit thread), then we still need to
- * take the FS itself readonly cleanly. */
journal = EXT4_SB(sb)->s_journal;
- if (journal) {
- if (is_journal_aborted(journal)) {
- ext4_abort(sb, "Detected aborted journal");
- return ERR_PTR(-EROFS);
- }
- return jbd2_journal_start(journal, nblocks);
+ handle = ext4_journal_current_handle();
+
+ /*
+ * If a handle has been started, it should be allowed to
+ * finish; otherwise a deadlock could occur between freeze
+ * and other operations (e.g. truncate) due to the restart of
+ * the journal handle if the filesystem is frozen and active
+ * handles are not stopped.
+ */
+ if (!handle)
+ vfs_check_frozen(sb, SB_FREEZE_TRANS);
+
+ if (!journal)
+ return ext4_get_nojournal();
+ /*
+ * Special case here: if the journal has aborted behind our
+ * backs (e.g. EIO in the commit thread), then we still need to
+ * take the FS itself readonly cleanly.
+ */
+ if (is_journal_aborted(journal)) {
+ ext4_abort(sb, "Detected aborted journal");
+ return ERR_PTR(-EROFS);
}
- return ext4_get_nojournal();
+ return jbd2_journal_start(journal, nblocks);
}
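A toy userspace model (invented names, not ext4 code) of the rule in the comment above: only a task with no handle currently open should block on a frozen filesystem; a nested start must go through, because the freezer is itself waiting for every open handle to finish.

#include <stdbool.h>
#include <stdio.h>

struct task {
	bool has_open_handle;	/* ext4_journal_current_handle() != NULL */
};

/* Should this journal start block until the filesystem is unfrozen? */
static bool must_wait_for_unfreeze(const struct task *t, bool fs_frozen)
{
	/* A nested start must proceed: the freezer waits for all open
	 * handles to finish, so blocking here would deadlock. */
	return fs_frozen && !t->has_open_handle;
}

int main(void)
{
	struct task nested = { .has_open_handle = true };
	struct task fresh  = { .has_open_handle = false };

	printf("nested start waits: %d\n", must_wait_for_unfreeze(&nested, true)); /* 0 */
	printf("fresh start waits:  %d\n", must_wait_for_unfreeze(&fresh, true));  /* 1 */
	return 0;
}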
/*
* filesystem will have already been marked read/only and the
* journal has been aborted. We return 1 as a hint to callers
* who might want to use the return value from
- * ext4_grp_locked_error() to distinguish beween the
+ * ext4_grp_locked_error() to distinguish between the
* ERRORS_CONT and ERRORS_RO case, and perhaps return more
* aggressively from the ext4 function in question, with a
* more appropriate error code.
mutex_unlock(&ext4_li_info->li_list_mtx);
sbi->s_li_request = elr;
+ /*
+ * Set elr to NULL here since it has been inserted into
+ * the request list; from now on its removal and freeing
+ * are handled by ext4_clear_request_list.
+ */
+ elr = NULL;
if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
ret = ext4_run_lazyinit_thread();
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
+ init_timer(&sbi->s_err_report);
+ sbi->s_err_report.function = print_daily_error_info;
+ sbi->s_err_report.data = (unsigned long) sb;
+
err = percpu_counter_init(&sbi->s_freeblocks_counter,
ext4_count_free_blocks(sb));
if (!err) {
"Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
*sbi->s_es->s_mount_opts ? "; " : "", orig_data);
- init_timer(&sbi->s_err_report);
- sbi->s_err_report.function = print_daily_error_info;
- sbi->s_err_report.data = (unsigned long) sb;
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
sbi->s_journal = NULL;
}
failed_mount3:
+ del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups) {
if (is_vmalloc_addr(sbi->s_flex_groups))
vfree(sbi->s_flex_groups);
/*
* LVM calls this function before a (read-only) snapshot is created. This
* gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that this function cannot by itself bring the filesystem into
+ * a clean state, because ext4 relies on @sb->s_frozen, which is
+ * maintained in an upper layer, to prevent new handles from being
+ * started. It thus needs help from that upper layer.
*/
static int ext4_freeze(struct super_block *sb)
{
static int ext4_quota_off(struct super_block *sb, int type)
{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ handle_t *handle;
+
/* Force all delayed allocation blocks to be allocated.
* Caller already holds s_umount sem */
if (test_opt(sb, DELALLOC))
sync_filesystem(sb);
+ /* Update modification times of quota files when userspace can
+ * start looking at them */
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle))
+ goto out;
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ out:
return dquot_quota_off(sb, type);
}
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
+ * itself serializes the operations (and no one else should touch the files)
* we don't have to be afraid of races */
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off)
if (inode->i_size < off + len) {
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
+ ext4_mark_inode_dirty(handle, inode);
}
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- ext4_mark_inode_dirty(handle, inode);
mutex_unlock(&inode->i_mutex);
return len;
}
int ret;
struct timespec now = current_kernel_time();
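+ /*
+ * Clear *cbh first: we may return before submitting a commit record
+ * (e.g. aborted journal), and the caller only waits on the commit
+ * record when *cbh is non-NULL.
+ */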
+ *cbh = NULL;
+
if (is_journal_aborted(journal))
return 0;
* we do not require it to remember exactly which old buffers it
* has reserved. This is consistent with the existing behaviour
* that multiple jbd2_journal_get_write_access() calls to the same
- * buffer are perfectly permissable.
+ * buffer are perfectly permissible.
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
if (err)
__jbd2_journal_abort_hard(journal);
}
- if (!err && !is_journal_aborted(journal))
+ if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
journal->j_wbufsize = n;
journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
if (!journal->j_wbuf) {
- printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
+ printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
__func__);
goto out_err;
}
journal->j_wbufsize = n;
journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
if (!journal->j_wbuf) {
- printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
+ printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
__func__);
goto out_err;
}
new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
if (!new_dev)
return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
+ bd = bdget(device);
spin_lock(&devname_cache_lock);
if (devcache[i]) {
if (devcache[i]->device == device) {
kfree(new_dev);
+ bdput(bd);
ret = devcache[i]->devname;
spin_unlock(&devname_cache_lock);
return ret;
}
devcache[i] = new_dev;
devcache[i]->device = device;
- bd = bdget(device);
if (bd) {
bdevname(bd, devcache[i]->devname);
bdput(bd);