EXT4_MB_NUM_CRS
};
- /* criteria below which we use fast block scanning and avoid unnecessary IO */
- #define CR_FAST CR_GOAL_LEN_SLOW
-
/*
* Flags used in mballoc's allocation_context flags field.
*
* affected filesystem before 2242.
*/
-static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
+static inline __le32 ext4_encode_extra_time(struct timespec64 ts)
{
- u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK;
- return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
+ u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK;
+ return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS));
}
-static inline void ext4_decode_extra_time(struct timespec64 *time,
- __le32 extra)
+static inline struct timespec64 ext4_decode_extra_time(__le32 base,
+ __le32 extra)
{
+ struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) };
+
if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
- time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
+ ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ return ts;
}
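For reference, the new pass-by-value pair round-trips as follows. A minimal user-space sketch, assuming the usual kernel constants (EXT4_EPOCH_BITS == 2, EXT4_EPOCH_MASK == (1 << 2) - 1, EXT4_NSEC_MASK == ~0U << 2) and ignoring the cpu_to_le32()/le32_to_cpu() byte swaps; all names below are illustrative:

	#include <stdint.h>
	#include <stdio.h>

	#define EPOCH_BITS 2
	#define EPOCH_MASK ((1u << EPOCH_BITS) - 1)	/* low 2 bits: extra epoch */
	#define NSEC_MASK  (~0u << EPOCH_BITS)		/* high 30 bits: nanoseconds */

	struct ts64 { int64_t tv_sec; int32_t tv_nsec; };

	static uint32_t encode_extra(struct ts64 ts)	/* cf. ext4_encode_extra_time() */
	{
		uint32_t extra = ((ts.tv_sec - (int32_t)ts.tv_sec) >> 32) & EPOCH_MASK;

		return extra | ((uint32_t)ts.tv_nsec << EPOCH_BITS);
	}

	static struct ts64 decode_extra(uint32_t base, uint32_t extra)	/* cf. decode */
	{
		struct ts64 ts = { .tv_sec = (int32_t)base };

		if (extra & EPOCH_MASK)
			ts.tv_sec += (uint64_t)(extra & EPOCH_MASK) << 32;
		ts.tv_nsec = (extra & NSEC_MASK) >> EPOCH_BITS;
		return ts;
	}

	int main(void)
	{
		/* a post-2174 timestamp: needs two extra epoch bits */
		struct ts64 in = { .tv_sec = 0x180000123LL, .tv_nsec = 7 };
		struct ts64 out = decode_extra((uint32_t)in.tv_sec, encode_extra(in));

		printf("%lld.%09d -> %lld.%09d\n", (long long)in.tv_sec, in.tv_nsec,
		       (long long)out.tv_sec, out.tv_nsec);
		return 0;
	}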
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \
do { \
- if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
- (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
- (raw_inode)->xtime ## _extra = \
- ext4_encode_extra_time(&(inode)->xtime); \
- } \
- else \
- (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \
+ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
+ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \
+ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \
+ } else \
+ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
} while (0)
-#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
-do { \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
- (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
- (raw_inode)->xtime ## _extra = \
- ext4_encode_extra_time(&(einode)->xtime); \
-} while (0)
+#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \
+ raw_inode, (einode)->xtime)
+
+#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \
+ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \
+ ext4_decode_extra_time((raw_inode)->xtime, \
+ (raw_inode)->xtime ## _extra) : \
+ (struct timespec64) { \
+ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \
+ })
#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
do { \
- (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
- if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
- ext4_decode_extra_time(&(inode)->xtime, \
- raw_inode->xtime ## _extra); \
- } \
- else \
- (inode)->xtime.tv_nsec = 0; \
+ (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \
} while (0)
+#define EXT4_INODE_GET_CTIME(inode, raw_inode) \
+do { \
+ inode_set_ctime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \
+} while (0)
-#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
-do { \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
- (einode)->xtime.tv_sec = \
- (signed)le32_to_cpu((raw_inode)->xtime); \
- else \
- (einode)->xtime.tv_sec = 0; \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
- ext4_decode_extra_time(&(einode)->xtime, \
- raw_inode->xtime ## _extra); \
- else \
- (einode)->xtime.tv_nsec = 0; \
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
+do { \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ (einode)->xtime = \
+ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \
+ raw_inode); \
+ else \
+ (einode)->xtime = (struct timespec64){0, 0}; \
} while (0)
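Since EXT4_INODE_GET_XTIME_VAL is now an expression rather than a statement, the getters reduce to a single assignment. A hand expansion of EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode), for illustration; this value-returning shape is what lets EXT4_INODE_GET_CTIME pass the result straight to inode_set_ctime_to_ts():

	do {
		(inode)->i_mtime =
			(EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), i_mtime_extra) ?
			 ext4_decode_extra_time((raw_inode)->i_mtime,
						(raw_inode)->i_mtime_extra) :
			 (struct timespec64) {
				.tv_sec = (signed)le32_to_cpu((raw_inode)->i_mtime)
			 });
	} while (0);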
#define i_disk_version osd1.linux1.l_i_version
#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
* scanning in mballoc
*/
+ #define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define ext4_test_and_set_bit __test_and_set_bit_le
#define ext4_set_bit __set_bit_le
- #define ext4_set_bit_atomic ext2_set_bit_atomic
#define ext4_test_and_clear_bit __test_and_clear_bit_le
#define ext4_clear_bit __clear_bit_le
- #define ext4_clear_bit_atomic ext2_clear_bit_atomic
#define ext4_test_bit test_bit_le
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
const char *s_last_error_func;
time64_t s_last_error_time;
/*
- * If we are in a context where we cannot update error information in
- * the on-disk superblock, we queue this work to do it.
+ * If we are in a context where we cannot update the on-disk
+ * superblock, we queue the work here. This is used to update
+ * the error information in the superblock, and for periodic
+ * updates of the superblock called from the commit callback
+ * function.
*/
- struct work_struct s_error_work;
+ struct work_struct s_sb_upd_work;
/* Ext4 fast commit sub transaction ID */
atomic_t s_fc_subtid;
*/
enum {
EXT4_MF_MNTDIR_SAMPLED,
- EXT4_MF_FS_ABORTED, /* Fatal error detected */
EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
};
#define EXT4_FLAGS_SHUTDOWN 1
#define EXT4_FLAGS_BDEV_IS_DAX 2
- static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
+ static inline int ext4_forced_shutdown(struct super_block *sb)
{
- return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+ return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}
/*
extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
- extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
- extern void ext4_check_inodes_bitmap(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
ext4_group_t group, int barrier);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
- extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_discard_preallocations(struct inode *, unsigned int);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
int len, int state);
+ static inline bool ext4_mb_cr_expensive(enum criteria cr)
+ {
+ return cr >= CR_GOAL_LEN_SLOW;
+ }
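This helper is the complement of the CR_FAST cutoff removed above: since CR_FAST was defined as CR_GOAL_LEN_SLOW, cr >= CR_GOAL_LEN_SLOW is exactly the old "not fast" range, i.e. the allocation criteria where scanning may incur IO.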
/* inode.c */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
extern void ext4_clear_inode(struct inode *);
extern int ext4_file_getattr(struct mnt_idmap *, const struct path *,
struct kstat *, u32, unsigned int);
- extern int ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
ext4_group_t block_group,
unsigned int flags);
+ extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
+ ext4_group_t block_group);
extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
/* inline.c */
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
- extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
int ext4_readpage_inline(struct inode *inode, struct folio *folio);
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
/* For ioend & aio unwritten conversion wait queues */
#define EXT4_WQ_HASH_SZ 37
#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
might_sleep();
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return -EIO;
- if (sb_rdonly(sb))
+ if (WARN_ON_ONCE(sb_rdonly(sb)))
return -EROFS;
+
WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
/*
might_sleep();
- if (bh->b_bdev->bd_super)
- ext4_check_bdev_write_error(bh->b_bdev->bd_super);
+ ext4_check_bdev_write_error(sb);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
{
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (!iov_iter_count(to))
{
struct inode *inode = file_inode(in);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
return filemap_splice_read(in, ppos, pipe, len, flags);
}
* required to change security info in file_modified(), for extending
* I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
* extents (as partial block zeroing may be required).
+ *
+ * Note that unaligned writes are allowed under shared lock so long as
+ * they are pure overwrites. Otherwise, concurrent unaligned writes risk
+ * data corruption due to partial block zeroing in the dio layer, and so
+ * the I/O must occur exclusively.
*/
if (*ilock_shared &&
((!IS_NOSEC(inode) || *extend || !overwrite ||
/*
* Now that locking is settled, determine dio flags and exclusivity
- * requirements. Unaligned writes are allowed under shared lock so long
- * as they are pure overwrites. Set the iomap overwrite only flag as an
- * added precaution in this case. Even though this is unnecessary, we
- * can detect and warn on unexpected -EAGAIN if an unsafe unaligned
- * write is ever submitted.
- *
- * Otherwise, concurrent unaligned writes risk data corruption due to
- * partial block zeroing in the dio layer, and so the I/O must occur
- * exclusively. The inode lock is already held exclusive if the write is
- * non-overwrite or extending, so drain all outstanding dio and set the
- * force wait dio flag.
+ * requirements. We don't use IOMAP_DIO_OVERWRITE_ONLY because that
+ * behavior is already enforced above. The inode lock is already held
+ * exclusive if the write is non-overwrite or extending, so drain all
+ * outstanding dio and set the force wait dio flag.
*/
- if (*ilock_shared && unaligned_io) {
- *dio_flags = IOMAP_DIO_OVERWRITE_ONLY;
- } else if (!*ilock_shared && (unaligned_io || *extend)) {
+ if (!*ilock_shared && (unaligned_io || *extend)) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
dio_flags, NULL, 0);
- WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT));
if (ret == -ENOTBLK)
ret = 0;
{
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
#ifdef CONFIG_FS_DAX
}
#ifdef CONFIG_FS_DAX
-static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
int error = 0;
vm_fault_t result;
* read-only.
*
* We check for VM_SHARED rather than vmf->cow_page since the latter is
- * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
+ * unset for order != 0 (i.e. only in do_cow_fault); for
* other sizes, dax_iomap_fault will handle splitting / fallback so that
* we eventually come back with a COW page.
*/
} else {
filemap_invalidate_lock_shared(mapping);
}
- result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
+ result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
if (write) {
ext4_journal_stop(handle);
goto retry;
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
- result = dax_finish_sync_fault(vmf, pe_size, pfn);
+ result = dax_finish_sync_fault(vmf, order, pfn);
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
} else {
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
- return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
+ return ext4_dax_huge_fault(vmf, 0);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct dax_device *dax_dev = sbi->s_daxdev;
+ struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
- if (unlikely(ext4_forced_shutdown(sbi)))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
/*
{
int ret;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
sb = dir->i_sb;
sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sbi)))
+ if (unlikely(ext4_forced_shutdown(sb)))
return ERR_PTR(-EIO);
ngroups = ext4_get_groups_count(sb);
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
ei->i_crtime = inode->i_mtime;
memset(ei->i_data, 0, sizeof(ei->i_data));
int num, ret = 0, used_blks = 0;
unsigned long used_inos = 0;
- /* This should not happen, but just to be sure check this */
- if (sb_rdonly(sb)) {
- ret = 1;
- goto out;
- }
-
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp || !grp)
goto out;
struct ext4_inode *raw_inode;
int cp_len = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return;
BUG_ON(!EXT4_I(inode)->i_inline_off);
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
return 1;
ext4_orphan_del(handle, inode);
if (err == 0) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
pgoff_t index;
unsigned from, to;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
trace_ext4_write_begin(inode, pos, len);
if (folio->index < mpd->first_page)
continue;
- if (folio->index + folio_nr_pages(folio) - 1 > end)
+ if (folio_next_index(folio) - 1 > end)
continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
if (err < 0) {
struct super_block *sb = inode->i_sb;
- if (ext4_forced_shutdown(EXT4_SB(sb)) ||
- ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ if (ext4_forced_shutdown(sb))
goto invalidate_dirty_pages;
/*
* Let the upper layers retry transient errors.
if (mpd->map.m_len == 0)
mpd->first_page = folio->index;
- mpd->next_page = folio->index + folio_nr_pages(folio);
+ mpd->next_page = folio_next_index(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
- * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
* fs shutdown state instead of SB_RDONLY in sb->s_flags because
* the latter could be true if the filesystem is mounted
* read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
- ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
+ if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
ret = -EROFS;
goto out_writepages;
}
int ret;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return -EIO;
alloc_ctx = ext4_writepages_down_read(sb);
int ret;
long nr_to_write = wbc->nr_to_write;
struct inode *inode = mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
- ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
+ ret = dax_writeback_mapping_range(mapping,
+ EXT4_SB(inode->i_sb)->s_daxdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
ext4_writepages_up_read(inode->i_sb, alloc_ctx);
pgoff_t index;
struct inode *inode = mapping->host;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
index = pos >> PAGE_SHIFT;
return 1;
}
+ static int ext4_da_do_write_end(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page)
+ {
+ struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
+ bool disksize_changed = false;
+ loff_t new_i_size;
+
+ /*
+ * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
+ * flag, which is all that's needed to trigger page writeback.
+ */
+ copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL);
+ new_i_size = pos + copied;
+
+ /*
+ * It's important to update i_size while still holding page lock,
+ * because page writeout could otherwise come in and zero beyond
+ * i_size.
+ *
+ * Since we are holding inode lock, we are sure i_disksize <=
+ * i_size. We also know that if i_disksize < i_size, there are
+ * delalloc writes pending in the range up to i_size. If the end of
+ * the current write is <= i_size, there's no need to touch
+ * i_disksize since writeback will push i_disksize up to i_size
+ * eventually. If the end of the current write is > i_size and
+ * inside an allocated block (which ext4_da_should_update_i_disksize()
+ * checks), we need to update i_disksize here, as certain
+ * ext4_writepages() paths update i_disksize without allocating blocks.
+ */
+ if (new_i_size > inode->i_size) {
+ unsigned long end;
+
+ i_size_write(inode, new_i_size);
+ end = (new_i_size - 1) & (PAGE_SIZE - 1);
+ if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) {
+ ext4_update_i_disksize(inode, new_i_size);
+ disksize_changed = true;
+ }
+ }
+
+ unlock_page(page);
+ put_page(page);
+
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
+
+ if (disksize_changed) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
+ return copied;
+ }
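A compact recap of the ordering the comments above require; descriptive only, mirroring the function body:

	/*
	 * lock_page(page)
	 *   block_write_end()          - marks the inode I_DIRTY_PAGES
	 *   i_size_write()             - under the page lock, so writeout
	 *                                cannot zero beyond the new i_size
	 *   ext4_update_i_disksize()   - only if the write ends beyond i_size
	 *                                inside an allocated block
	 * unlock_page(page)
	 * pagecache_isize_extended()   - once the size update is visible
	 * ext4_journal_start() / ext4_mark_inode_dirty()
	 *                              - only when i_disksize changed
	 */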
+
static int ext4_da_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- loff_t new_i_size;
- unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
struct folio *folio = page_folio(page);
if (unlikely(copied < len) && !PageUptodate(page))
copied = 0;
- start = pos & (PAGE_SIZE - 1);
- end = start + copied - 1;
-
- /*
- * Since we are holding inode lock, we are sure i_disksize <=
- * i_size. We also know that if i_disksize < i_size, there are
- * delalloc writes pending in the range upto i_size. If the end of
- * the current write is <= i_size, there's no need to touch
- * i_disksize since writeback will push i_disksize upto i_size
- * eventually. If the end of the current write is > i_size and
- * inside an allocated block (ext4_da_should_update_i_disksize()
- * check), we need to update i_disksize here as certain
- * ext4_writepages() paths not allocating blocks update i_disksize.
- *
- * Note that we defer inode dirtying to generic_write_end() /
- * ext4_da_write_inline_data_end().
- */
- new_i_size = pos + copied;
- if (copied && new_i_size > inode->i_size &&
- ext4_da_should_update_i_disksize(folio, end))
- ext4_update_i_disksize(inode, new_i_size);
-
- return generic_write_end(file, mapping, pos, len, copied, &folio->page,
- fsdata);
+ return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page);
}
/*
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret2))
ret = ret2;
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(err2 && !err))
err = err2;
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_CTIME(inode, raw_inode);
EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
}
}
- EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_GET_CTIME(inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
"iget: bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
- if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
+ if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) {
ext4_error_inode(inode, function, line, 0,
"casefold flag without casefold feature");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
if ((err_str = check_igot_inode(inode, flags)) != NULL) {
ext4_error_inode(inode, function, line, 0, err_str);
ret = -EFSCORRUPTED;
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_CTIME(inode, raw_inode);
EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
ext4_inode_csum_set(inode, raw_inode, ei);
{
int err;
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
- sb_rdonly(inode->i_sb))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (EXT4_SB(inode->i_sb)->s_journal) {
const unsigned int ia_valid = attr->ia_valid;
bool inc_ivers = true;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (unlikely(IS_IMMUTABLE(inode)))
* Update c/mtime on truncate up, ext4_truncate() will
* update c/mtime in shrink case below
*/
- if (!shrink) {
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
- }
+ if (!shrink)
+ inode->i_mtime = inode_set_ctime_current(inode);
if (shrink)
ext4_fc_track_range(handle, inode,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
{
int err = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
+ if (unlikely(ext4_forced_shutdown(inode->i_sb))) {
put_bh(iloc->bh);
return -EIO;
}
{
int err;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
err = ext4_get_inode_loc(inode, iloc);
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_alloc;
out_ret:
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
out:
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
diff = size - size_bl;
swap_inode_data(inode, inode_bl);
- inode->i_ctime = inode_bl->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
+ inode_set_ctime_current(inode_bl);
inode_inc_iversion(inode);
inode->i_generation = get_random_u32();
ext4_set_inode_flags(inode, false);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
EXT4_I(inode)->i_projid = kprojid;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH)
return -EINVAL;
- if (ext4_forced_shutdown(sbi))
+ if (ext4_forced_shutdown(sb))
return 0;
ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags);
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
struct dx_hash_info *hinfo = &name->hinfo;
int len;
- if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding ||
+ if (!IS_CASEFOLDED(dir) ||
(IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) {
cf_name->name = NULL;
return 0;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
- if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) &&
+ if (IS_CASEFOLDED(parent) &&
(!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
if (fname->cf_name.name) {
struct qstr cf = {.name = fname->cf_name.name,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
err2 = ext4_mark_inode_dirty(handle, dir);
#if IS_ENABLED(CONFIG_UNICODE)
if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
- sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
+ utf8_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
return err;
}
drop_nlink(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
return err;
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
/* Initialize quotas before so that eventual writes go in
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
goto end_rmdir;
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (dentry && !retval)
ext4_fc_track_unlink(handle, dentry);
{
int retval;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
trace_ext4_unlink_enter(dir, dentry);
struct fscrypt_str disk_link;
int retries = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
err_drop_inode:
clear_nlink(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
if (handle)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ext4_inc_count(inode);
ihold(inode);
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
inode_inc_iversion(ent->dir);
- ent->dir->i_ctime = ent->dir->i_mtime =
- current_time(ent->dir);
+ ent->dir->i_mtime = inode_set_ctime_current(ent->dir);
retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old.inode->i_ctime = current_time(old.inode);
+ inode_set_ctime_current(old.inode);
retval = ext4_mark_inode_dirty(handle, old.inode);
if (unlikely(retval))
goto end_rename;
if (new.inode) {
ext4_dec_count(new.inode);
- new.inode->i_ctime = current_time(new.inode);
+ inode_set_ctime_current(new.inode);
}
- old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
+ old.dir->i_mtime = inode_set_ctime_current(old.dir);
ext4_update_dx_flag(old.dir);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
ext4_resetent(handle, &old,
old.inode->i_ino, old_file_type);
drop_nlink(whiteout);
+ ext4_mark_inode_dirty(handle, whiteout);
ext4_orphan_add(handle, whiteout);
}
unlock_new_inode(whiteout);
};
u8 new_file_type;
int retval;
- struct timespec64 ctime;
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
!projid_eq(EXT4_I(new_dir)->i_projid,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- ctime = current_time(old.inode);
- old.inode->i_ctime = ctime;
- new.inode->i_ctime = ctime;
+ inode_set_ctime_current(old.inode);
+ inode_set_ctime_current(new.inode);
retval = ext4_mark_inode_dirty(handle, old.inode);
if (unlikely(retval))
goto end_rename;
{
int err;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb))))
+ if (unlikely(ext4_forced_shutdown(old_dir->i_sb)))
return -EIO;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
static int ext4_reconfigure(struct fs_context *fc);
static void ext4_fc_free(struct fs_context *fc);
static int ext4_init_fs_context(struct fs_context *fc);
+static void ext4_kill_sb(struct super_block *sb);
static const struct fs_parameter_spec ext4_param_specs[];
/*
.name = "ext2",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
+ .kill_sb = ext4_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
-#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif
.name = "ext3",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
+ .kill_sb = ext4_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
-#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type)
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
+ #define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
+ #define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */
+
+ /*
+ * The ext4_maybe_update_superblock() function checks and updates the
+ * superblock if needed.
+ *
+ * This function is designed to update the on-disk superblock only under
+ * certain conditions to prevent excessive disk writes and unnecessary
+ * waking of the disk from sleep. The superblock will be updated if:
+ * 1. More than an hour has passed since the last superblock update, and
+ * 2. More than 16MB have been written since the last superblock update.
+ *
+ * @sb: The superblock
+ */
+ static void ext4_maybe_update_superblock(struct super_block *sb)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ journal_t *journal = sbi->s_journal;
+ time64_t now;
+ __u64 last_update;
+ __u64 lifetime_write_kbytes;
+ __u64 diff_size;
+
+ if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) ||
+ !journal || (journal->j_flags & JBD2_UNMOUNT))
+ return;
+
+ now = ktime_get_real_seconds();
+ last_update = ext4_get_tstamp(es, s_wtime);
+
+ if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
+ return;
+
+ lifetime_write_kbytes = sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+ sbi->s_sectors_written_start) >> 1);
+
+ /* Compute the number of kilobytes written since the last superblock
+ * update and compare it against the 16 MB threshold. This determines
+ * when the next superblock commit should occur (i.e. not more often
+ * than once per 16MB if less than that was written in an hour).
+ */
+ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
+
+ if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB)
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
+ }
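Reduced to its predicate, the function queues s_sb_upd_work only when both thresholds trip. A stand-alone sketch of that decision (names illustrative):

	#include <stdbool.h>
	#include <stdint.h>

	#define SB_REFRESH_INTERVAL_SEC 3600	/* 1 hour */
	#define SB_REFRESH_INTERVAL_KB 16384	/* 16MB */

	/* true when a periodic superblock update should be queued */
	static bool sb_update_due(int64_t now, int64_t last_update,
				  uint64_t lifetime_kb, uint64_t ondisk_kb)
	{
		if (now - last_update < SB_REFRESH_INTERVAL_SEC)
			return false;		/* updated within the hour */
		return lifetime_kb - ondisk_kb > SB_REFRESH_INTERVAL_KB;
	}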
+
/*
* The del_gendisk() function uninitializes the disk-specific data
* structures, including the bdi structure, without telling anyone
BUG_ON(txn->t_state == T_FINISHED);
ext4_process_freed_data(sb, txn->t_tid);
+ ext4_maybe_update_superblock(sb);
spin_lock(&sbi->s_md_lock);
while (!list_empty(&txn->t_private_list)) {
WARN_ON_ONCE(1);
if (!continue_fs && !sb_rdonly(sb)) {
- ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
+ set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
if (journal)
jbd2_journal_abort(journal, -EIO);
}
* defer superblock flushing to a workqueue.
*/
if (continue_fs && journal)
- schedule_work(&EXT4_SB(sb)->s_error_work);
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
else
ext4_commit_super(sb);
}
sb->s_flags |= SB_RDONLY;
}
- static void flush_stashed_error_work(struct work_struct *work)
+ static void update_super_work(struct work_struct *work)
{
struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
- s_error_work);
+ s_sb_upd_work);
journal_t *journal = sbi->s_journal;
handle_t *handle;
*/
if (!sb_rdonly(sbi->s_sb) && journal) {
struct buffer_head *sbh = sbi->s_sbh;
+ bool call_notify_err = false;
handle = jbd2_journal_start(journal, 1);
if (IS_ERR(handle))
goto write_directly;
jbd2_journal_stop(handle);
goto write_directly;
}
+
+ if (sbi->s_add_error_count > 0)
+ call_notify_err = true;
+
ext4_update_super(sbi->s_sb);
if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
goto write_directly;
}
jbd2_journal_stop(handle);
- ext4_notify_error_sysfs(sbi);
+
+ if (call_notify_err)
+ ext4_notify_error_sysfs(sbi);
+
return;
}
write_directly:
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return;
trace_ext4_error(sb, function, line);
va_list args;
struct va_format vaf;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
struct inode *inode = file_inode(file);
char pathname[80], *path;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
char nbuf[16];
const char *errstr;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return;
/* Special case: if the error is EROFS, and we're not already
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return;
trace_ext4_error(sb, function, line);
if (!bdev_read_only(sb->s_bdev)) {
save_error_info(sb, EFSCORRUPTED, ino, block, function,
line);
- schedule_work(&EXT4_SB(sb)->s_error_work);
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
return;
}
*/
}
-static void ext4_bdev_mark_dead(struct block_device *bdev)
-{
- ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH);
-}
-
-static const struct blk_holder_ops ext4_holder_ops = {
- .mark_dead = ext4_bdev_mark_dead,
-};
-
-/*
- * Open the external journal device
- */
-static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
-{
- struct block_device *bdev;
-
- bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
- &fs_holder_ops);
- if (IS_ERR(bdev))
- goto fail;
- return bdev;
-
- fail:
- ext4_msg(sb, KERN_ERR,
- "failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
- return NULL;
-}
-
-/*
- * Release the journal device
- */
-static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
-{
- struct block_device *bdev;
-
- bdev = sbi->s_journal_bdev;
- if (bdev) {
- /*
- * Invalidate the journal device's buffers. We don't want them
- * floating about in memory - the physical journal device may
- * hotswapped, and it breaks the `ro-after' testing code.
- */
- invalidate_bdev(bdev);
- blkdev_put(bdev, sbi->s_sb);
- sbi->s_journal_bdev = NULL;
- }
-}
-
static inline struct inode *orphan_list_entry(struct list_head *l)
{
return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
* path which could have sbi->s_journal->j_task as NULL
- * Unregister sysfs before flush sbi->s_error_work.
+ * Unregister sysfs before flushing sbi->s_sb_upd_work.
* Since users may read /proc/fs/ext4/xx/mb_groups during umount, a
* failed metadata read there will queue the error work.
- * flush_stashed_error_work will call start_this_handle may trigger
+ * update_super_work then calls start_this_handle, which may trigger a
* BUG_ON.
*/
ext4_unregister_sysfs(sb);
ext4_unregister_li_request(sb);
ext4_quotas_off(sb, EXT4_MAXQUOTAS);
- flush_work(&sbi->s_error_work);
+ flush_work(&sbi->s_sb_upd_work);
destroy_workqueue(sbi->rsv_conversion_wq);
ext4_release_orphan_info(sb);
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
if (sbi->s_journal_bdev) {
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may be
+ * hotswapped, and it breaks the `ro-after' testing code.
+ */
sync_blockdev(sbi->s_journal_bdev);
- ext4_blkdev_remove(sbi);
+ invalidate_bdev(sbi->s_journal_bdev);
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
#endif
+ {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
{Opt_err, 0, 0}
};
unsigned int mask_s_mount_opt;
unsigned int vals_s_mount_opt2;
unsigned int mask_s_mount_opt2;
- unsigned long vals_s_mount_flags;
- unsigned long mask_s_mount_flags;
unsigned int opt_flags; /* MOPT flags */
unsigned int spec;
u32 s_max_batch_time;
EXT4_CLEAR_CTX(mount_opt2);
EXT4_TEST_CTX(mount_opt2);
- static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit)
- {
- set_bit(bit, &ctx->mask_s_mount_flags);
- set_bit(bit, &ctx->vals_s_mount_flags);
- }
-
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct ext4_fs_context *ctx = fc->fs_private;
ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
param->key);
return 0;
- case Opt_abort:
- ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
- return 0;
case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
ctx_set_flags(ctx, SB_INLINECRYPT);
sbi->s_mount_opt |= ctx->vals_s_mount_opt;
sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
- sbi->s_mount_flags &= ~ctx->mask_s_mount_flags;
- sbi->s_mount_flags |= ctx->vals_s_mount_flags;
sb->s_flags &= ~ctx->mask_s_flags;
sb->s_flags |= ctx->vals_s_flags;
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
- if (j_inode) {
+ if (!IS_ERR(j_inode)) {
j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
overhead += EXT4_NUM_B2C(sbi, j_blocks);
iput(j_inode);
return 0;
out:
- /* flush s_error_work before journal destroy. */
- flush_work(&sbi->s_error_work);
+ /* flush s_sb_upd_work before destroying the journal. */
+ flush_work(&sbi->s_sb_upd_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
return -EINVAL;
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
spin_lock_init(&sbi->s_error_lock);
- INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);
+ INIT_WORK(&sbi->s_sb_upd_work, update_super_work);
err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
if (err)
spin_lock_init(&sbi->s_bdev_wb_lock);
errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
&sbi->s_bdev_wb_err);
- sb->s_bdev->bd_super = sb;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
- /* flush s_error_work before journal destroy. */
- flush_work(&sbi->s_error_work);
+ /* flush s_sb_upd_work before journal destroy. */
+ flush_work(&sbi->s_sb_upd_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
failed_mount3:
- /* flush s_error_work before sbi destroy */
- flush_work(&sbi->s_error_work);
+ /* flush s_sb_upd_work before sbi destroy */
+ flush_work(&sbi->s_sb_upd_work);
del_timer_sync(&sbi->s_err_report);
ext4_stop_mmpd(sbi);
ext4_group_desc_free(sbi);
kfree(get_qf_name(sb, sbi, i));
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
- /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
brelse(sbi->s_sbh);
- ext4_blkdev_remove(sbi);
+ if (sbi->s_journal_bdev) {
+ invalidate_bdev(sbi->s_journal_bdev);
+ blkdev_put(sbi->s_journal_bdev, sb);
+ }
out_fail:
invalidate_bdev(sb->s_bdev);
sb->s_fs_info = NULL;
journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
if (IS_ERR(journal_inode)) {
ext4_msg(sb, KERN_ERR, "no journal found");
- return NULL;
+ return ERR_CAST(journal_inode);
}
if (!journal_inode->i_nlink) {
make_bad_inode(journal_inode);
iput(journal_inode);
ext4_msg(sb, KERN_ERR, "journal inode is deleted");
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
}
-
- ext4_debug("Journal inode found at %p: %lld bytes\n",
- journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
}
+
+ ext4_debug("Journal inode found at %p: %lld bytes\n",
+ journal_inode, journal_inode->i_size);
return journal_inode;
}
return 0;
}
- static journal_t *ext4_get_journal(struct super_block *sb,
- unsigned int journal_inum)
+ static journal_t *ext4_open_inode_journal(struct super_block *sb,
+ unsigned int journal_inum)
{
struct inode *journal_inode;
journal_t *journal;
- if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
- return NULL;
-
journal_inode = ext4_get_journal_inode(sb, journal_inum);
- if (!journal_inode)
- return NULL;
+ if (IS_ERR(journal_inode))
+ return ERR_CAST(journal_inode);
journal = jbd2_journal_init_inode(journal_inode);
- if (!journal) {
+ if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "Could not load journal inode");
iput(journal_inode);
- return NULL;
+ return ERR_CAST(journal);
}
journal->j_private = sb;
journal->j_bmap = ext4_journal_bmap;
return journal;
}
- static journal_t *ext4_get_dev_journal(struct super_block *sb,
- dev_t j_dev)
+ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
+ dev_t j_dev, ext4_fsblk_t *j_start,
+ ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
- journal_t *journal;
- ext4_fsblk_t start;
- ext4_fsblk_t len;
+ struct block_device *bdev;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
struct ext4_super_block *es;
- struct block_device *bdev;
-
- if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
- return NULL;
+ int errno;
- bdev = ext4_blkdev_get(j_dev, sb);
- if (bdev == NULL)
- return NULL;
+ /* see get_tree_bdev why this is needed and safe */
+ up_write(&sb->s_umount);
+ bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
+ &fs_holder_ops);
+ down_write(&sb->s_umount);
+ if (IS_ERR(bdev)) {
+ ext4_msg(sb, KERN_ERR,
+ "failed to open journal device unknown-block(%u,%u) %ld",
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev));
+ return ERR_CAST(bdev);
+ }
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
ext4_msg(sb, KERN_ERR,
"blocksize too small for journal device");
+ errno = -EINVAL;
goto out_bdev;
}
sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
offset = EXT4_MIN_BLOCK_SIZE % blocksize;
set_blocksize(bdev, blocksize);
- if (!(bh = __bread(bdev, sb_block, blocksize))) {
+ bh = __bread(bdev, sb_block, blocksize);
+ if (!bh) {
ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
"external journal");
+ errno = -EINVAL;
goto out_bdev;
}
if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
- ext4_msg(sb, KERN_ERR, "external journal has "
- "bad superblock");
- brelse(bh);
- goto out_bdev;
+ ext4_msg(sb, KERN_ERR, "external journal has bad superblock");
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
if ((le32_to_cpu(es->s_feature_ro_compat) &
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
es->s_checksum != ext4_superblock_csum(sb, es)) {
- ext4_msg(sb, KERN_ERR, "external journal has "
- "corrupt superblock");
- brelse(bh);
- goto out_bdev;
+ ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
ext4_msg(sb, KERN_ERR, "journal UUID does not match");
- brelse(bh);
- goto out_bdev;
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
- len = ext4_blocks_count(es);
- start = sb_block + 1;
- brelse(bh); /* we're done with the superblock */
+ *j_start = sb_block + 1;
+ *j_len = ext4_blocks_count(es);
+ brelse(bh);
+ return bdev;
- journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
- start, len, blocksize);
- if (!journal) {
+ out_bh:
+ brelse(bh);
+ out_bdev:
+ blkdev_put(bdev, sb);
+ return ERR_PTR(errno);
+ }
+
+ static journal_t *ext4_open_dev_journal(struct super_block *sb,
+ dev_t j_dev)
+ {
+ journal_t *journal;
+ ext4_fsblk_t j_start;
+ ext4_fsblk_t j_len;
+ struct block_device *journal_bdev;
+ int errno = 0;
+
+ journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(journal_bdev))
+ return ERR_CAST(journal_bdev);
+
+ journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start,
+ j_len, sb->s_blocksize);
+ if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
+ errno = PTR_ERR(journal);
goto out_bdev;
}
- journal->j_private = sb;
- if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
- ext4_msg(sb, KERN_ERR, "I/O error on journal device");
- goto out_journal;
- }
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
ext4_msg(sb, KERN_ERR, "External journal has more than one "
"user (unsupported) - %d",
be32_to_cpu(journal->j_superblock->s_nr_users));
+ errno = -EINVAL;
goto out_journal;
}
- EXT4_SB(sb)->s_journal_bdev = bdev;
+ journal->j_private = sb;
+ EXT4_SB(sb)->s_journal_bdev = journal_bdev;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- blkdev_put(bdev, sb);
- return NULL;
+ blkdev_put(journal_bdev, sb);
+ return ERR_PTR(errno);
}
static int ext4_load_journal(struct super_block *sb,
}
if (journal_inum) {
- journal = ext4_get_journal(sb, journal_inum);
- if (!journal)
- return -EINVAL;
+ journal = ext4_open_inode_journal(sb, journal_inum);
+ if (IS_ERR(journal))
+ return PTR_ERR(journal);
} else {
- journal = ext4_get_dev_journal(sb, journal_dev);
- if (!journal)
- return -EINVAL;
+ journal = ext4_open_dev_journal(sb, journal_dev);
+ if (IS_ERR(journal))
+ return PTR_ERR(journal);
}
journal_dev_ro = bdev_read_only(journal->j_dev);
* the clock is set in the future, and this will cause e2fsck
* to complain and force a full file system check.
*/
- if (!(sb->s_flags & SB_RDONLY))
+ if (!sb_rdonly(sb))
ext4_update_tstamp(es, s_wtime);
es->s_kbytes_written =
cpu_to_le64(sbi->s_kbytes_written +
*/
int ext4_force_commit(struct super_block *sb)
{
- journal_t *journal;
-
- if (sb_rdonly(sb))
- return 0;
-
- journal = EXT4_SB(sb)->s_journal;
- return ext4_journal_force_commit(journal);
+ return ext4_journal_force_commit(EXT4_SB(sb)->s_journal);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sbi)))
+ if (unlikely(ext4_forced_shutdown(sb)))
return 0;
trace_ext4_sync_fs(sb, wait);
static int ext4_freeze(struct super_block *sb)
{
int error = 0;
- journal_t *journal;
-
- if (sb_rdonly(sb))
- return 0;
-
- journal = EXT4_SB(sb)->s_journal;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
if (journal) {
/* Now we set up the journal barrier. */
*/
static int ext4_unfreeze(struct super_block *sb)
{
- if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
+ if (ext4_forced_shutdown(sb))
return 0;
if (EXT4_SB(sb)->s_journal) {
goto restore_opts;
}
- if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ if (test_opt2(sb, ABORT))
ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
}
/* Flush outstanding errors before changing fs state */
- flush_work(&sbi->s_error_work);
+ flush_work(&sbi->s_sb_upd_work);
if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
+ if (ext4_forced_shutdown(sb)) {
err = -EROFS;
goto restore_opts;
}
* If there was a failing r/w to ro transition, we may need to
* re-enable quota
*/
- if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
+ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
sb->s_flags = old_sb_flags;
err = dquot_quota_off(sb, type);
if (err || ext4_has_feature_quota(sb))
goto out_put;
+ /*
+ * When the filesystem was remounted read-only first, we cannot clean up
+ * inode flags here. Bad luck but people should be using the QUOTA
+ * feature these days anyway.
+ */
+ if (sb_rdonly(sb))
+ goto out_put;
inode_lock(inode);
/*
}
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
return 1;
}
+static void ext4_kill_sb(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
+
+ kill_block_super(sb);
+
+ if (journal_bdev)
+ blkdev_put(journal_bdev, sb);
+}
+
static struct file_system_type ext4_fs_type = {
.owner = THIS_MODULE,
.name = "ext4",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .kill_sb = ext4_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("ext4");
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
- return ((u64)ea_inode->i_ctime.tv_sec << 32) |
+ return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
(u32) inode_peek_iversion_raw(ea_inode);
}
static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
{
- ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
+ inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0);
inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff);
}
{
int error;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (strlen(name) > 255)
}
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
if (!value)
no_expand = 0;
#endif
/* Checksumming functions */
- static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
- {
- if (!jbd2_journal_has_csum_v2or3_feature(j))
- return 1;
-
- return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
- }
-
static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
{
__u32 csum;
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct page *new_page;
+ struct folio *new_folio;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
journal_t *journal = transaction->t_journal;
*/
if (jh_in->b_frozen_data) {
done_copy_out = 1;
- new_page = virt_to_page(jh_in->b_frozen_data);
- new_offset = offset_in_page(jh_in->b_frozen_data);
+ new_folio = virt_to_folio(jh_in->b_frozen_data);
+ new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
} else {
- new_page = jh2bh(jh_in)->b_page;
- new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ new_folio = jh2bh(jh_in)->b_folio;
+ new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
}
- mapped_data = kmap_atomic(new_page);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
/*
* Fire data frozen trigger if data already wasn't frozen. Do this
* before checking for escaping, as the trigger may modify the magic
* data in the buffer.
*/
if (!done_copy_out)
- jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+ jbd2_buffer_frozen_trigger(jh_in, mapped_data,
jh_in->b_triggers);
/*
* Check for escaping
*/
- if (*((__be32 *)(mapped_data + new_offset)) ==
- cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+ if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
need_copy_out = 1;
do_escape = 1;
}
- kunmap_atomic(mapped_data);
+ kunmap_local(mapped_data);
/*
* Do we need to do a data copy?
}
jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
- kunmap_atomic(mapped_data);
+ memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
- new_page = virt_to_page(tmp);
- new_offset = offset_in_page(tmp);
+ new_folio = virt_to_folio(tmp);
+ new_offset = offset_in_folio(new_folio, tmp);
done_copy_out = 1;
/*
* copying, we can finally do so.
*/
if (do_escape) {
- mapped_data = kmap_atomic(new_page);
- *((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
+ *((unsigned int *)mapped_data) = 0;
+ kunmap_local(mapped_data);
}
- set_bh_page(new_bh, new_page, new_offset);
+ folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
return count;
}
+ /*
+ * If the journal init or create aborts, we need to mark the journal
+ * superblock as being NULL to prevent the journal destroy from writing
+ * back a bogus superblock.
+ */
+ static void journal_fail_superblock(journal_t *journal)
+ {
+ struct buffer_head *bh = journal->j_sb_buffer;
+ brelse(bh);
+ journal->j_sb_buffer = NULL;
+ }
+
+ /*
+ * Check the superblock for a given journal, performing initial
+ * validation of the format.
+ */
+ static int journal_check_superblock(journal_t *journal)
+ {
+ journal_superblock_t *sb = journal->j_superblock;
+ int num_fc_blks;
+ int err = -EINVAL;
+
+ if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
+ sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
+ printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 &&
+ be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) {
+ printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
+ printk(KERN_WARNING "JBD2: journal file too short\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_first) == 0 ||
+ be32_to_cpu(sb->s_first) >= journal->j_total_len) {
+ printk(KERN_WARNING
+ "JBD2: Invalid start block of journal: %u\n",
+ be32_to_cpu(sb->s_first));
+ return err;
+ }
+
+ /*
+ * If this is a V2 superblock, then we have to check the
+ * features flags on it.
+ */
+ if (!jbd2_format_support_feature(journal))
+ return 0;
+
+ if ((sb->s_feature_ro_compat &
+ ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
+ (sb->s_feature_incompat &
+ ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
+ printk(KERN_WARNING "JBD2: Unrecognised features on journal\n");
+ return err;
+ }
+
+ num_fc_blks = jbd2_has_feature_fast_commit(journal) ?
+ jbd2_journal_get_num_fc_blks(sb) : 0;
+ if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS ||
+ be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) {
+ printk(KERN_ERR "JBD2: journal file too short %u,%d\n",
+ be32_to_cpu(sb->s_maxlen), num_fc_blks);
+ return err;
+ }
+
+ if (jbd2_has_feature_csum2(journal) &&
+ jbd2_has_feature_csum3(journal)) {
+ /* Can't have checksum v2 and v3 at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
+ "at the same time!\n");
+ return err;
+ }
+
+ if (jbd2_journal_has_csum_v2or3_feature(journal) &&
+ jbd2_has_feature_checksum(journal)) {
+ /* Can't have checksum v1 and v2 on at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
+ "at the same time!\n");
+ return err;
+ }
+
+ /* Load the checksum driver */
+ if (jbd2_journal_has_csum_v2or3_feature(journal)) {
+ if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) {
+ printk(KERN_ERR "JBD2: Unknown checksum type\n");
+ return err;
+ }
+
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+ err = PTR_ERR(journal->j_chksum_driver);
+ journal->j_chksum_driver = NULL;
+ return err;
+ }
+ /* Check superblock checksum */
+ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+ printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
+ return err;
+ }
+ }
+
+ return 0;
+ }
+
+ static int journal_revoke_records_per_block(journal_t *journal)
+ {
+ int record_size;
+ int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
+
+ if (jbd2_has_feature_64bit(journal))
+ record_size = 8;
+ else
+ record_size = 4;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ space -= sizeof(struct jbd2_journal_block_tail);
+ return space / record_size;
+ }
+
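/*
 * Editor's worked example for journal_revoke_records_per_block(),
 * assuming the usual on-disk sizes (16-byte jbd2_journal_revoke_header_t,
 * 4-byte jbd2_journal_block_tail): with a 4096-byte block, the 64bit
 * feature (8-byte records) and v2/v3 checksums enabled, the result is
 * (4096 - 16 - 4) / 8 = 509 revoke records per block.
 */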
+ /*
+ * Load the on-disk journal superblock and read the key fields into the
+ * journal_t.
+ */
+ static int journal_load_superblock(journal_t *journal)
+ {
+ int err;
+ struct buffer_head *bh;
+ journal_superblock_t *sb;
+
+ bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset,
+ journal->j_blocksize);
+ if (bh)
+ err = bh_read(bh, 0);
+ if (!bh || err < 0) {
+ pr_err("%s: Cannot read journal superblock\n", __func__);
+ brelse(bh);
+ return -EIO;
+ }
+
+ journal->j_sb_buffer = bh;
+ sb = (journal_superblock_t *)bh->b_data;
+ journal->j_superblock = sb;
+ err = journal_check_superblock(journal);
+ if (err) {
+ journal_fail_superblock(journal);
+ return err;
+ }
+
+ journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
+ journal->j_tail = be32_to_cpu(sb->s_start);
+ journal->j_first = be32_to_cpu(sb->s_first);
+ journal->j_errno = be32_to_cpu(sb->s_errno);
+ journal->j_last = be32_to_cpu(sb->s_maxlen);
+
+ if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
+ journal->j_total_len = be32_to_cpu(sb->s_maxlen);
+ /* Precompute checksum seed for all metadata */
+ if (jbd2_journal_has_csum_v2or3(journal))
+ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
+
+ if (jbd2_has_feature_fast_commit(journal)) {
+ journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
+ journal->j_last = journal->j_fc_last -
+ jbd2_journal_get_num_fc_blks(sb);
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ }
+
+ return 0;
+ }
+
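/*
 * Editor's worked example for the fast-commit carve-out above (numbers
 * illustrative): with s_maxlen = 32768 and 256 fast-commit blocks,
 * j_fc_last = 32768, j_last = 32512 and j_fc_first = 32513, i.e. the
 * tail of the journal is reserved for fast commits and excluded from
 * the regular transaction area.
 */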
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
static struct lock_class_key jbd2_trans_commit_key;
journal_t *journal;
int err;
- struct buffer_head *bh;
int n;
journal = kzalloc(sizeof(*journal), GFP_KERNEL);
if (!journal)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
+ journal->j_blocksize = blocksize;
+ journal->j_dev = bdev;
+ journal->j_fs_dev = fs_dev;
+ journal->j_blk_offset = start;
+ journal->j_total_len = len;
+
+ err = journal_load_superblock(journal);
+ if (err)
+ goto err_cleanup;
init_waitqueue_head(&journal->j_wait_transaction_locked);
init_waitqueue_head(&journal->j_wait_done_commit);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
spin_lock_init(&journal->j_list_lock);
+ spin_lock_init(&journal->j_history_lock);
rwlock_init(&journal->j_state_lock);
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
atomic_set(&journal->j_reserved_credits, 0);
+ lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
+ &jbd2_trans_commit_key, 0);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
if (err)
goto err_cleanup;
- spin_lock_init(&journal->j_history_lock);
-
- lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
- &jbd2_trans_commit_key, 0);
-
- /* journal descriptor can store up to n blocks -bzzz */
- journal->j_blocksize = blocksize;
- journal->j_dev = bdev;
- journal->j_fs_dev = fs_dev;
- journal->j_blk_offset = start;
- journal->j_total_len = len;
- /* We need enough buffers to write out full descriptor block. */
+ /*
+ * The journal descriptor can store up to n blocks; we need enough
+ * buffers to write out a full descriptor block.
+ */
+ err = -ENOMEM;
n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
journal->j_fc_wbuf = NULL;
if (!journal->j_wbuf)
goto err_cleanup;
- bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
- if (!bh) {
- pr_err("%s: Cannot get buffer for journal superblock\n",
- __func__);
+ err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0,
+ GFP_KERNEL);
+ if (err)
goto err_cleanup;
- }
- journal->j_sb_buffer = bh;
- journal->j_superblock = (journal_superblock_t *)bh->b_data;
journal->j_shrink_transaction = NULL;
journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
journal->j_shrinker.seeks = DEFAULT_SEEKS;
journal->j_shrinker.batch = journal->j_max_transaction_buffers;
-
- if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
+ err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
+ MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+ if (err)
goto err_cleanup;
- if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
- MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) {
- percpu_counter_destroy(&journal->j_checkpoint_jh_count);
- goto err_cleanup;
- }
return journal;
err_cleanup:
- brelse(journal->j_sb_buffer);
+ percpu_counter_destroy(&journal->j_checkpoint_jh_count);
kfree(journal->j_wbuf);
jbd2_journal_destroy_revoke(journal);
+ journal_fail_superblock(journal);
kfree(journal);
- return NULL;
+ return ERR_PTR(err);
}
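/*
 * With journal_init_common() now returning ERR_PTR() instead of NULL,
 * callers switch from NULL checks to IS_ERR()/PTR_ERR(), as the ocfs2
 * hunks at the end of this patch do. A hypothetical caller
 * (examplefs_attach_journal is illustrative only):
 */
static int examplefs_attach_journal(struct inode *journal_inode)
{
	journal_t *journal = jbd2_journal_init_inode(journal_inode);

	if (IS_ERR(journal))
		return PTR_ERR(journal);	/* propagate the real errno */

	/* ... use the journal ... */
	return 0;
}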
/* jbd2_journal_init_dev and jbd2_journal_init_inode:
journal_t *journal;
journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
- if (!journal)
- return NULL;
+ if (IS_ERR(journal))
+ return ERR_CAST(journal);
snprintf(journal->j_devname, sizeof(journal->j_devname),
"%pg", journal->j_dev);
blocknr = 0;
err = bmap(inode, &blocknr);
-
if (err || !blocknr) {
- pr_err("%s: Cannot locate journal superblock\n",
- __func__);
- return NULL;
+ pr_err("%s: Cannot locate journal superblock\n", __func__);
+ return err ? ERR_PTR(err) : ERR_PTR(-EINVAL);
}
jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
inode->i_sb->s_blocksize);
- if (!journal)
- return NULL;
+ if (IS_ERR(journal))
+ return ERR_CAST(journal);
journal->j_inode = inode;
snprintf(journal->j_devname, sizeof(journal->j_devname),
return journal;
}
- /*
- * If the journal init or create aborts, we need to mark the journal
- * superblock as being NULL to prevent the journal destroy from writing
- * back a bogus superblock.
- */
- static void journal_fail_superblock(journal_t *journal)
- {
- struct buffer_head *bh = journal->j_sb_buffer;
- brelse(bh);
- journal->j_sb_buffer = NULL;
- }
-
/*
* Given a journal_t structure, initialise the various fields for
* startup of a new journaling session. We use this both when creating
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
- static int journal_revoke_records_per_block(journal_t *journal)
- {
- int record_size;
- int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
-
- if (jbd2_has_feature_64bit(journal))
- record_size = 8;
- else
- record_size = 4;
-
- if (jbd2_journal_has_csum_v2or3(journal))
- space -= sizeof(struct jbd2_journal_block_tail);
- return space / record_size;
- }
-
- /*
- * Read the superblock for a given journal, performing initial
- * validation of the format.
- */
- static int journal_get_superblock(journal_t *journal)
- {
- struct buffer_head *bh;
- journal_superblock_t *sb;
- int err;
-
- bh = journal->j_sb_buffer;
-
- J_ASSERT(bh != NULL);
- if (buffer_verified(bh))
- return 0;
-
- err = bh_read(bh, 0);
- if (err < 0) {
- printk(KERN_ERR
- "JBD2: IO error reading journal superblock\n");
- goto out;
- }
-
- sb = journal->j_superblock;
-
- err = -EINVAL;
-
- if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
- sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
- printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 &&
- be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) {
- printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
- printk(KERN_WARNING "JBD2: journal file too short\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_first) == 0 ||
- be32_to_cpu(sb->s_first) >= journal->j_total_len) {
- printk(KERN_WARNING
- "JBD2: Invalid start block of journal: %u\n",
- be32_to_cpu(sb->s_first));
- goto out;
- }
-
- if (jbd2_has_feature_csum2(journal) &&
- jbd2_has_feature_csum3(journal)) {
- /* Can't have checksum v2 and v3 at the same time! */
- printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
- "at the same time!\n");
- goto out;
- }
-
- if (jbd2_journal_has_csum_v2or3_feature(journal) &&
- jbd2_has_feature_checksum(journal)) {
- /* Can't have checksum v1 and v2 on at the same time! */
- printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
- "at the same time!\n");
- goto out;
- }
-
- if (!jbd2_verify_csum_type(journal, sb)) {
- printk(KERN_ERR "JBD2: Unknown checksum type\n");
- goto out;
- }
-
- /* Load the checksum driver */
- if (jbd2_journal_has_csum_v2or3_feature(journal)) {
- journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
- err = PTR_ERR(journal->j_chksum_driver);
- journal->j_chksum_driver = NULL;
- goto out;
- }
- /* Check superblock checksum */
- if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
- printk(KERN_ERR "JBD2: journal checksum error\n");
- err = -EFSBADCRC;
- goto out;
- }
- }
- set_buffer_verified(bh);
- return 0;
-
- out:
- journal_fail_superblock(journal);
- return err;
- }
-
- /*
- * Load the on-disk journal superblock and read the key fields into the
- * journal_t.
- */
-
- static int load_superblock(journal_t *journal)
- {
- int err;
- journal_superblock_t *sb;
- int num_fc_blocks;
-
- err = journal_get_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
-
- journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
- journal->j_tail = be32_to_cpu(sb->s_start);
- journal->j_first = be32_to_cpu(sb->s_first);
- journal->j_errno = be32_to_cpu(sb->s_errno);
- journal->j_last = be32_to_cpu(sb->s_maxlen);
-
- if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
- journal->j_total_len = be32_to_cpu(sb->s_maxlen);
- /* Precompute checksum seed for all metadata */
- if (jbd2_journal_has_csum_v2or3(journal))
- journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
- sizeof(sb->s_uuid));
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
-
- if (jbd2_has_feature_fast_commit(journal)) {
- journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
- num_fc_blocks = jbd2_journal_get_num_fc_blks(sb);
- if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
- journal->j_last = journal->j_fc_last - num_fc_blocks;
- journal->j_fc_first = journal->j_last + 1;
- journal->j_fc_off = 0;
- }
-
- return 0;
- }
-
-
/**
* jbd2_journal_load() - Read journal from disk.
* @journal: Journal to act on.
int jbd2_journal_load(journal_t *journal)
{
int err;
- journal_superblock_t *sb;
-
- err = load_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
-
- /*
- * If this is a V2 superblock, then we have to check the
- * features flags on it.
- */
- if (jbd2_format_support_feature(journal)) {
- if ((sb->s_feature_ro_compat &
- ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
- (sb->s_feature_incompat &
- ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
- printk(KERN_WARNING
- "JBD2: Unrecognised features on journal\n");
- return -EINVAL;
- }
- }
+ journal_superblock_t *sb = journal->j_superblock;
/*
* Create a slab for this blocksize
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
- if (jbd2_journal_recover(journal))
- goto recovery_error;
+ err = jbd2_journal_recover(journal);
+ if (err) {
+ pr_warn("JBD2: journal recovery failed\n");
+ return err;
+ }
if (journal->j_failed_commit) {
printk(KERN_ERR "JBD2: journal transaction %u on %s "
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
- if (journal_reset(journal))
- goto recovery_error;
+ err = journal_reset(journal);
+ if (err) {
+ pr_warn("JBD2: journal reset failed\n");
+ return err;
+ }
journal->j_flags |= JBD2_LOADED;
return 0;
-
- recovery_error:
- printk(KERN_WARNING "JBD2: recovery failed\n");
- return -EIO;
}
/**
if (!compat && !ro && !incompat)
return 1;
- if (journal_get_superblock(journal))
- return 0;
if (!jbd2_format_support_feature(journal))
return 0;
int jbd2_journal_wipe(journal_t *journal, int write)
{
- int err = 0;
+ int err;
J_ASSERT (!(journal->j_flags & JBD2_LOADED));
- err = load_superblock(journal);
- if (err)
- return err;
-
if (!journal->j_tail)
- goto no_recovery;
+ return 0;
printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
write ? "Clearing" : "Ignoring");
mutex_unlock(&journal->j_checkpoint_mutex);
}
- no_recovery:
return err;
}
unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
return 0;
}
};
EXPORT_SYMBOL(simple_dir_inode_operations);
+static void offset_set(struct dentry *dentry, u32 offset)
+{
+ dentry->d_fsdata = (void *)((uintptr_t)(offset));
+}
+
+static u32 dentry2offset(struct dentry *dentry)
+{
+ return (u32)((uintptr_t)(dentry->d_fsdata));
+}
+
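/*
 * Editor's note: offset_set() and dentry2offset() encode a directory
 * offset directly in the dentry's d_fsdata pointer, so no per-entry
 * allocation is needed. Offset 0 means "not in the map", which is why
 * simple_offset_add() below rejects a dentry whose offset is already
 * nonzero and simple_offset_remove() resets the offset to 0.
 */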
+static struct lock_class_key simple_offset_xa_lock;
+
+/**
+ * simple_offset_init - initialize an offset_ctx
+ * @octx: directory offset map to be initialized
+ */
+void simple_offset_init(struct offset_ctx *octx)
+{
+ xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
+ lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
+
+ /* 0 is '.', 1 is '..', so always start with offset 2 */
+ octx->next_offset = 2;
+}
+
+/**
+ * simple_offset_add - Add an entry to a directory's offset map
+ * @octx: directory offset ctx to be updated
+ * @dentry: new dentry being added
+ *
+ * Returns zero on success. @octx and the dentry offset are updated.
+ * Otherwise, a negative errno value is returned.
+ */
+int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
+{
+ static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
+ u32 offset;
+ int ret;
+
+ if (dentry2offset(dentry) != 0)
+ return -EBUSY;
+
+ ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
+ &octx->next_offset, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+
+ offset_set(dentry, offset);
+ return 0;
+}
+
+/**
+ * simple_offset_remove - Remove an entry from a directory's offset map
+ * @octx: directory offset ctx to be updated
+ * @dentry: dentry being removed
+ */
+void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
+{
+ u32 offset;
+
+ offset = dentry2offset(dentry);
+ if (offset == 0)
+ return;
+
+ xa_erase(&octx->xa, offset);
+ offset_set(dentry, 0);
+}
+
+/**
+ * simple_offset_rename_exchange - exchange rename with directory offsets
+ * @old_dir: parent of dentry being moved
+ * @old_dentry: dentry being moved
+ * @new_dir: destination parent
+ * @new_dentry: destination dentry
+ *
+ * Returns zero on success. Otherwise a negative errno is returned and the
+ * rename is rolled back.
+ */
+int simple_offset_rename_exchange(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry)
+{
+ struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
+ struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
+ u32 old_index = dentry2offset(old_dentry);
+ u32 new_index = dentry2offset(new_dentry);
+ int ret;
+
+ simple_offset_remove(old_ctx, old_dentry);
+ simple_offset_remove(new_ctx, new_dentry);
+
+ ret = simple_offset_add(new_ctx, old_dentry);
+ if (ret)
+ goto out_restore;
+
+ ret = simple_offset_add(old_ctx, new_dentry);
+ if (ret) {
+ simple_offset_remove(new_ctx, old_dentry);
+ goto out_restore;
+ }
+
+ ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ if (ret) {
+ simple_offset_remove(new_ctx, old_dentry);
+ simple_offset_remove(old_ctx, new_dentry);
+ goto out_restore;
+ }
+ return 0;
+
+out_restore:
+ offset_set(old_dentry, old_index);
+ xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+ offset_set(new_dentry, new_index);
+ xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+ return ret;
+}
+
+/**
+ * simple_offset_destroy - Release offset map
+ * @octx: directory offset ctx that is about to be destroyed
+ *
+ * During fs teardown (e.g. umount), a directory's offset map might still
+ * contain entries. xa_destroy() cleans out anything that remains.
+ */
+void simple_offset_destroy(struct offset_ctx *octx)
+{
+ xa_destroy(&octx->xa);
+}
+
+/**
+ * offset_dir_llseek - Advance the read position of a directory descriptor
+ * @file: an open directory whose position is to be updated
+ * @offset: a byte offset
+ * @whence: enumerator describing the starting position for this update
+ *
+ * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
+ *
+ * Returns the updated read position if successful; otherwise a
+ * negative errno is returned and the read position remains unchanged.
+ */
+static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ fallthrough;
+ case SEEK_SET:
+ if (offset >= 0)
+ break;
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ return vfs_setpos(file, offset, U32_MAX);
+}
+
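/*
 * Editor's note on the fallthrough chain above: SEEK_CUR first biases
 * @offset by the current position and falls through to the SEEK_SET
 * range check; a still-negative result falls through again to -EINVAL.
 * E.g. f_pos = 10 with SEEK_CUR and offset -4 calls vfs_setpos(file, 6,
 * U32_MAX), while offset -20 returns -EINVAL.
 */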
+static struct dentry *offset_find_next(struct xa_state *xas)
+{
+ struct dentry *child, *found = NULL;
+
+ rcu_read_lock();
+ child = xas_next_entry(xas, U32_MAX);
+ if (!child)
+ goto out;
+ spin_lock(&child->d_lock);
+ if (simple_positive(child))
+ found = dget_dlock(child);
+ spin_unlock(&child->d_lock);
+out:
+ rcu_read_unlock();
+ return found;
+}
+
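/*
 * Editor's note: offset_find_next() walks the xarray under
 * rcu_read_lock() and takes d_lock before dget_dlock(), so a child is
 * only pinned and returned while it is still positive; if the next
 * entry has turned negative in the meantime, NULL is returned and the
 * directory walk in offset_iterate_dir() ends.
 */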
+static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
+{
+ u32 offset = dentry2offset(dentry);
+ struct inode *inode = d_inode(dentry);
+
+ return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
+ inode->i_ino, fs_umode_to_dtype(inode->i_mode));
+}
+
+static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
+{
+ struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
+ XA_STATE(xas, &so_ctx->xa, ctx->pos);
+ struct dentry *dentry;
+
+ while (true) {
+ dentry = offset_find_next(&xas);
+ if (!dentry)
+ break;
+
+ if (!offset_dir_emit(ctx, dentry)) {
+ dput(dentry);
+ break;
+ }
+
+ dput(dentry);
+ ctx->pos = xas.xa_index + 1;
+ }
+}
+
+/**
+ * offset_readdir - Emit entries starting at offset @ctx->pos
+ * @file: an open directory to iterate over
+ * @ctx: directory iteration context
+ *
+ * Caller must hold @file's i_rwsem to prevent insertion or removal of
+ * entries during this call.
+ *
+ * On entry, @ctx->pos contains an offset that represents the first entry
+ * to be read from the directory.
+ *
+ * The operation continues until there are no more entries to read, or
+ * until the ctx->actor indicates there is no more space in the caller's
+ * output buffer.
+ *
+ * On return, @ctx->pos contains the offset at which reading resumes the
+ * next time offset_readdir() is called with @ctx.
+ *
+ * Return values:
+ * %0 - Complete
+ */
+static int offset_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct dentry *dir = file->f_path.dentry;
+
+ lockdep_assert_held(&d_inode(dir)->i_rwsem);
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ offset_iterate_dir(d_inode(dir), ctx);
+ return 0;
+}
+
+const struct file_operations simple_offset_dir_operations = {
+ .llseek = offset_dir_llseek,
+ .iterate_shared = offset_readdir,
+ .read = generic_read_dir,
+ .fsync = noop_fsync,
+};
+
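/*
 * A minimal sketch (hypothetical "examplefs"; all names here are
 * illustrative, not part of this patch) of wiring a directory up to the
 * offset-mapped readdir above: embed a struct offset_ctx, expose it via
 * the ->get_offset_ctx inode operation, and install
 * simple_offset_dir_operations as the directory file_operations.
 */
struct examplefs_inode {
	struct inode vfs_inode;
	struct offset_ctx dir_offsets;
};

static struct offset_ctx *examplefs_get_offset_ctx(struct inode *inode)
{
	struct examplefs_inode *ei =
		container_of(inode, struct examplefs_inode, vfs_inode);

	return &ei->dir_offsets;
}

static const struct inode_operations examplefs_dir_inode_ops = {
	.lookup		= simple_lookup,
	.get_offset_ctx	= examplefs_get_offset_ctx,
};

static void examplefs_init_dir(struct inode *inode)
{
	struct examplefs_inode *ei =
		container_of(inode, struct examplefs_inode, vfs_inode);

	simple_offset_init(&ei->dir_offsets);
	inode->i_op = &examplefs_dir_inode_ops;
	inode->i_fop = &simple_offset_dir_operations;
	/* new children are then registered with simple_offset_add() */
}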
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
struct dentry *child = NULL;
while ((child = find_next_child(this, victim)) == NULL) {
// kill and ascend
// update metadata while it's still locked
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
clear_nlink(inode);
inode_unlock(inode);
victim = this;
dput(victim); // unpin it
}
if (victim == dentry) {
- inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (d_is_dir(dentry))
drop_nlink(inode);
inode_unlock(inode);
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
+ root->i_atime = root->i_mtime = inode_set_ctime_current(root);
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inc_nlink(inode);
ihold(inode);
dget(dentry);
{
struct inode *inode = d_inode(dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
drop_nlink(inode);
dput(dentry);
return 0;
}
EXPORT_SYMBOL(simple_rmdir);
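/*
 * Editor's note on the timestamp conversions in the surrounding hunks:
 * inode_set_ctime_current() replaces open-coded stores to inode->i_ctime
 * and returns the new timespec64, so a single call can also seed other
 * timestamps, e.g.:
 *
 *	inode->i_mtime = inode_set_ctime_current(inode);
 */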
+/**
+ * simple_rename_timestamp - update the various inode timestamps for rename
+ * @old_dir: old parent directory
+ * @old_dentry: dentry that is being renamed
+ * @new_dir: new parent directory
+ * @new_dentry: target for rename
+ *
+ * POSIX mandates that the old and new parent directories have their ctime and
+ * mtime updated, and that the inodes of @old_dentry and @new_dentry (if any)
+ * have their ctime updated.
+ */
+void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct inode *newino = d_inode(new_dentry);
+
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ if (new_dir != old_dir)
+ new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_ctime_current(d_inode(old_dentry));
+ if (newino)
+ inode_set_ctime_current(newino);
+}
+EXPORT_SYMBOL_GPL(simple_rename_timestamp);
+
int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
inc_nlink(old_dir);
}
}
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- d_inode(old_dentry)->i_ctime =
- d_inode(new_dentry)->i_ctime = current_time(old_dir);
-
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
return 0;
}
EXPORT_SYMBOL_GPL(simple_rename_exchange);
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
- struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
inc_nlink(new_dir);
}
- old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
- new_dir->i_mtime = inode->i_ctime = current_time(old_dir);
-
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
return 0;
}
EXPORT_SYMBOL(simple_rename);
loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
- struct page *page;
- pgoff_t index;
+ struct folio *folio;
- index = pos >> PAGE_SHIFT;
+ folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
-
- *pagep = page;
+ *pagep = &folio->page;
- if (!PageUptodate(page) && (len != PAGE_SIZE)) {
- unsigned from = pos & (PAGE_SIZE - 1);
+ if (!folio_test_uptodate(folio) && (len != folio_size(folio))) {
+ size_t from = offset_in_folio(folio, pos);
- zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
+ folio_zero_segments(folio, 0, from,
+ from + len, folio_size(folio));
}
return 0;
}
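/*
 * Editor's note: FGP_WRITEBEGIN bundles the flags a ->write_begin
 * implementation wants (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE
 * as of this series), so __filemap_get_folio() hands back a locked,
 * write-ready folio or an ERR_PTR(); hence the IS_ERR()/PTR_ERR() check
 * replacing the old NULL check on grab_cache_page_write_begin().
 */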
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
- /* zero the stale part of the page if we did a short copy */
- if (!PageUptodate(page)) {
+ /* zero the stale part of the folio if we did a short copy */
+ if (!folio_test_uptodate(folio)) {
if (copied < len) {
- unsigned from = pos & (PAGE_SIZE - 1);
+ size_t from = offset_in_folio(folio, pos);
- zero_user(page, from + copied, len - copied);
+ folio_zero_range(folio, from + copied, len - copied);
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/*
* No need to use i_size_read() here, the i_size
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
goto out;
}
inode->i_mode = S_IFREG | files->mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_fop = files->ops;
inode->i_ino = i;
d_add(dentry, inode);
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
* All arguments are ignored and it just returns -EINVAL.
*/
int
-simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
+simple_nosetlease(struct file *filp, int arg, struct file_lock **flp,
void **priv)
{
return -EINVAL;
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
}
#if IS_ENABLED(CONFIG_UNICODE)
- /*
- * Determine if the name of a dentry should be casefolded.
- *
- * Return: if names will need casefolding
- */
- static bool needs_casefold(const struct inode *dir)
- {
- return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding;
- }
-
/**
* generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
* @dentry: dentry whose name we are checking against
char strbuf[DNAME_INLINE_LEN];
int ret;
- if (!dir || !needs_casefold(dir))
+ if (!dir || !IS_CASEFOLDED(dir))
goto fallback;
/*
* If the dentry name is stored in-line, then it may be concurrently
const struct unicode_map *um = sb->s_encoding;
int ret = 0;
- if (!dir || !needs_casefold(dir))
+ if (!dir || !IS_CASEFOLDED(dir))
return 0;
ret = utf8_casefold_hash(um, dentry, str);
if (osb->replay_map)
return 0;
- replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
- (osb->max_slots * sizeof(char)), GFP_KERNEL);
-
+ replay_map = kzalloc(struct_size(replay_map, rm_replay_slots,
+ osb->max_slots),
+ GFP_KERNEL);
if (!replay_map) {
mlog_errno(-ENOMEM);
return -ENOMEM;
osb->recovery_thread_task = NULL;
init_waitqueue_head(&osb->recovery_event);
- rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
- osb->max_slots * sizeof(unsigned int),
+ rm = kzalloc(struct_size(rm, rm_entries, osb->max_slots),
GFP_KERNEL);
if (!rm) {
mlog_errno(-ENOMEM);
return -ENOMEM;
}
- rm->rm_entries = (unsigned int *)((char *)rm +
- sizeof(struct ocfs2_recovery_map));
osb->recovery_map = rm;
return 0;
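/*
 * Editor's note: struct_size(rm, rm_entries, osb->max_slots) computes
 * sizeof(*rm) + osb->max_slots * sizeof(rm->rm_entries[0]) and saturates
 * on overflow so the allocation fails instead of being undersized. It
 * also implies rm_entries is now a proper flexible array member, which
 * is why the hand-rolled pointer fixup above is deleted.
 */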
(unsigned long)bh,
(unsigned long long)bh->b_blocknr);
- ocfs2_error(bh->b_bdev->bd_super,
+ ocfs2_error(bh->b_assoc_map->host->i_sb,
"JBD2 has aborted our journal, ocfs2 cannot continue\n");
}
mlog_errno(status);
if (!is_handle_aborted(handle)) {
journal_t *journal = handle->h_transaction->t_journal;
- struct super_block *sb = bh->b_bdev->bd_super;
mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
"Aborting transaction and journal.\n");
handle->h_err = status;
jbd2_journal_abort_handle(handle);
jbd2_journal_abort(journal, status);
- ocfs2_abort(sb, "Journal already aborted.\n");
+ ocfs2_abort(bh->b_assoc_map->host->i_sb,
+ "Journal already aborted.\n");
}
}
}
/* call the kernels journal init function now */
j_journal = jbd2_journal_init_inode(inode);
- if (j_journal == NULL) {
+ if (IS_ERR(j_journal)) {
mlog(ML_ERROR, "Linux journal layer error\n");
- status = -EINVAL;
+ status = PTR_ERR(j_journal);
goto done;
}
}
journal = jbd2_journal_init_inode(inode);
- if (journal == NULL) {
+ if (IS_ERR(journal)) {
mlog(ML_ERROR, "Linux journal layer error\n");
- status = -EIO;
+ status = PTR_ERR(journal);
goto done;
}