loff_t isize = i_size_read(inode);
start_pos = pos & ~((u64)root->sectorsize - 1);
- num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
+ num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
static noinline int
lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
+ size_t write_bytes,
u64 *lockstart, u64 *lockend,
struct extent_state **cached_state)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start_pos;
u64 last_pos;
int i;
int ret = 0;
- start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
- last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+ start_pos = round_down(pos, root->sectorsize);
+ last_pos = start_pos
+ + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
+ size_t dirty_sectors;
+ size_t num_sectors;
WARN_ON(num_pages > nrptrs);
break;
}
- reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ sector_offset = pos & (root->sectorsize - 1);
+ reserve_bytes = round_up(write_bytes + sector_offset,
+ root->sectorsize);
if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) {
*/
num_pages = DIV_ROUND_UP(write_bytes + offset,
PAGE_CACHE_SIZE);
- reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ reserve_bytes = round_up(write_bytes
+ + sector_offset,
+ root->sectorsize);
goto reserve_metadata;
}
}
break;
ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
- pos, &lockstart, &lockend,
- &cached_state);
+ pos, write_bytes, &lockstart,
+ &lockend, &cached_state);
if (ret < 0) {
if (ret == -EAGAIN)
goto again;
* we still have an outstanding extent for the chunk we actually
* managed to copy.
*/
- if (num_pages > dirty_pages) {
- release_bytes = (num_pages - dirty_pages) <<
- PAGE_CACHE_SHIFT;
+ num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ reserve_bytes);
+ dirty_sectors = round_up(copied + sector_offset,
+ root->sectorsize);
+ dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ dirty_sectors);
+
+ if (num_sectors > dirty_sectors) {
+ release_bytes = (write_bytes - copied)
+ & ~((u64)root->sectorsize - 1);
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
}
}
- release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
+ release_bytes = round_up(copied + sector_offset,
+ root->sectorsize);
if (copied > 0)
ret = btrfs_dirty_pages(root, inode, pages,
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos, root->sectorsize);
- lockend = lockstart +
- (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+ lockend = round_up(pos + copied, root->sectorsize) - 1;
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
ssize_t err;
loff_t pos;
size_t count;
+ loff_t oldsize;
+ int clean_page = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = generic_write_checks(iocb, from);
if (err <= 0) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
current->backing_dev_info = inode_to_bdi(inode);
err = file_remove_privs(file);
if (err) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
* to stop this write operation to ensure FS consistency.
*/
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
err = -EROFS;
goto out;
}
pos = iocb->ki_pos;
count = iov_iter_count(from);
start_pos = round_down(pos, root->sectorsize);
- if (start_pos > i_size_read(inode)) {
+ oldsize = i_size_read(inode);
+ if (start_pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
end_pos = round_up(pos + count, root->sectorsize);
- err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
+ err = btrfs_cont_expand(inode, oldsize, end_pos);
if (err) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
+ if (start_pos > round_up(oldsize, root->sectorsize))
+ clean_page = 1;
}
if (sync)
num_written = __btrfs_buffered_write(file, from, pos);
if (num_written > 0)
iocb->ki_pos = pos + num_written;
+ if (clean_page)
+ pagecache_isize_extended(inode, oldsize,
+ i_size_read(inode));
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* We also have to set last_sub_trans to the current log transid,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
ret = start_ordered_ops(inode, start, end);
}
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
atomic_inc(&root->log_batch);
*/
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
trans->sync = true;
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* If any of the ordered extents had an error, just return it to user
int ret = 0;
int err = 0;
unsigned int rsv_count;
- bool same_page;
+ bool same_block;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
- bool truncated_page = false;
+ bool truncated_block = false;
bool updated_inode = false;
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
- ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+ ino_size = round_up(inode->i_size, root->sectorsize);
ret = find_first_non_hole(inode, &offset, &len);
if (ret < 0)
goto out_only_mutex;
lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
lockend = round_down(offset + len,
BTRFS_I(inode)->root->sectorsize) - 1;
- same_page = ((offset >> PAGE_CACHE_SHIFT) ==
- ((offset + len - 1) >> PAGE_CACHE_SHIFT));
-
+ same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
+ == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
/*
- * We needn't truncate any page which is beyond the end of the file
+ * We needn't truncate any block which is beyond the end of the file
* because we are sure there is no data there.
*/
/*
- * Only do this if we are in the same page and we aren't doing the
- * entire page.
+ * Only do this if we are in the same block and we aren't doing the
+ * entire block.
*/
- if (same_page && len < PAGE_CACHE_SIZE) {
+ if (same_block && len < root->sectorsize) {
if (offset < ino_size) {
- truncated_page = true;
- ret = btrfs_truncate_page(inode, offset, len, 0);
+ truncated_block = true;
+ ret = btrfs_truncate_block(inode, offset, len, 0);
} else {
ret = 0;
}
goto out_only_mutex;
}
- /* zero back part of the first page */
+ /* zero back part of the first block */
if (offset < ino_size) {
- truncated_page = true;
- ret = btrfs_truncate_page(inode, offset, 0, 0);
+ truncated_block = true;
+ ret = btrfs_truncate_block(inode, offset, 0, 0);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
- truncated_page = true;
- ret = btrfs_truncate_page(inode,
- tail_start + tail_len, 0, 1);
+ truncated_block = true;
+ ret = btrfs_truncate_block(inode,
+ tail_start + tail_len,
+ 0, 1);
if (ret)
goto out_only_mutex;
}
ret = btrfs_wait_ordered_range(inode, lockstart,
lockend - lockstart + 1);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
out_only_mutex:
- if (!updated_inode && truncated_page && !ret && !err) {
+ if (!updated_inode && truncated_block && !ret && !err) {
/*
* If we only end up zeroing part of a page, we still need to
* update the inode item, so that all the time fields are
ret = btrfs_end_transaction(trans, root);
}
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret && !err)
err = ret;
return err;
if (ret < 0)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = inode_newsize_ok(inode, alloc_end);
if (ret)
goto out;
} else if (offset + len > inode->i_size) {
/*
* If we are fallocating from the end of the file onward we
- * need to zero out the end of the page if i_size lands in the
- * middle of a page.
+ * need to zero out the end of the block if i_size lands in the
+ * middle of a block.
*/
- ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
if (ret)
goto out;
}
* So this is completely used as cleanup.
*/
btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/* Let go of our reservation. */
btrfs_free_reserved_data_space(inode, alloc_start,
alloc_end - alloc_start);
struct inode *inode = file->f_mapping->host;
int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case SEEK_END:
case SEEK_CUR:
case SEEK_DATA:
case SEEK_HOLE:
if (offset >= i_size_read(inode)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
ret = find_desired_extent(inode, &offset, whence);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
#endif
+ .copy_file_range = btrfs_copy_file_range,
+ .clone_file_range = btrfs_clone_file_range,
+ .dedupe_file_range = btrfs_dedupe_file_range,
};
void btrfs_auto_defrag_exit(void)
data_len = compressed_size;
if (start > 0 ||
- actual_end > PAGE_CACHE_SIZE ||
+ actual_end > root->sectorsize ||
data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
(!compressed_size &&
(actual_end & (root->sectorsize - 1)) == 0) ||
if (PagePrivate2(page))
goto out;
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ ordered = btrfs_lookup_ordered_range(inode, page_start,
+ PAGE_CACHE_SIZE);
if (ordered) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
page_end, &cached_state, GFP_NOFS);
int scanned = 0;
if (!xattr_access) {
- xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
- xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT));
+ xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
+ xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
}
slot++;
break;
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
break;
default:
* read the extent item from disk (data not in the page cache).
*/
btrfs_release_path(path);
- return btrfs_truncate_page(inode, offset, page_end - offset, 0);
+ return btrfs_truncate_block(inode, offset, page_end - offset,
+ 0);
}
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
}
/*
- * btrfs_truncate_page - read, zero a chunk and write a page
+ * btrfs_truncate_block - read, zero a chunk and write a block
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
* @len - the length to zero, 0 to zero the entire range respective to the
* offset
* @front - zero up to the offset instead of from the offset on
*
- * This will find the page for the "from" offset and cow the page and zero the
+ * This will find the block for the "from" offset and cow the block and zero the
* part we want to zero. This is used with truncate and hole punching.
*/
- int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
int front)
{
struct address_space *mapping = inode->i_mapping;
char *kaddr;
u32 blocksize = root->sectorsize;
pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned offset = from & (blocksize - 1);
struct page *page;
gfp_t mask = btrfs_alloc_write_mask(mapping);
int ret = 0;
- u64 page_start;
- u64 page_end;
+ u64 block_start;
+ u64 block_end;
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
+
ret = btrfs_delalloc_reserve_space(inode,
- round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
+ round_down(from, blocksize), blocksize);
if (ret)
goto out;
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode,
- round_down(from, PAGE_CACHE_SIZE),
- PAGE_CACHE_SIZE);
+ round_down(from, blocksize),
+ blocksize);
ret = -ENOMEM;
goto out;
}
- page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
+ block_start = round_down(from, blocksize);
+ block_end = block_start + blocksize - 1;
if (!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, &cached_state);
+ lock_extent_bits(io_tree, block_start, block_end, &cached_state);
set_page_extent_mapped(page);
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent_cached(io_tree, page_start, page_end,
+ unlock_extent_cached(io_tree, block_start, block_end,
&cached_state, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
goto again;
}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
&cached_state);
if (ret) {
- unlock_extent_cached(io_tree, page_start, page_end,
+ unlock_extent_cached(io_tree, block_start, block_end,
&cached_state, GFP_NOFS);
goto out_unlock;
}
- if (offset != PAGE_CACHE_SIZE) {
+ if (offset != blocksize) {
if (!len)
- len = PAGE_CACHE_SIZE - offset;
+ len = blocksize - offset;
kaddr = kmap(page);
if (front)
- memset(kaddr, 0, offset);
+ memset(kaddr + (block_start - page_offset(page)),
+ 0, offset);
else
- memset(kaddr + offset, 0, len);
+ memset(kaddr + (block_start - page_offset(page)) + offset,
+ 0, len);
flush_dcache_page(page);
kunmap(page);
}
ClearPageChecked(page);
set_page_dirty(page);
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+ unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
GFP_NOFS);
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, page_start,
- PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, block_start,
+ blocksize);
unlock_page(page);
page_cache_release(page);
out:
int err = 0;
/*
- * If our size started in the middle of a page we need to zero out the
- * rest of the page before we expand the i_size, otherwise we could
+ * If our size started in the middle of a block we need to zero out the
+ * rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- err = btrfs_truncate_page(inode, oldsize, 0, 0);
+ err = btrfs_truncate_block(inode, oldsize, 0, 0);
if (err)
return err;
}
if (newsize > oldsize) {
- truncate_pagecache(inode, newsize);
/*
* Don't do an expanding truncate while snapshoting is ongoing.
* This is to ensure the snapshot captures a fully consistent
i_size_write(inode, newsize);
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ pagecache_isize_extended(inode, oldsize, newsize);
ret = btrfs_update_inode(trans, root, inode);
btrfs_end_write_no_snapshoting(root);
btrfs_end_transaction(trans, root);
}
static int dio_read_error(struct inode *inode, struct bio *failed_bio,
- struct page *page, u64 start, u64 end,
- int failed_mirror, bio_end_io_t *repair_endio,
- void *repair_arg)
+ struct page *page, unsigned int pgoff,
+ u64 start, u64 end, int failed_mirror,
+ bio_end_io_t *repair_endio, void *repair_arg)
{
struct io_failure_record *failrec;
struct bio *bio;
return -EIO;
}
- if (failed_bio->bi_vcnt > 1)
+ if ((failed_bio->bi_vcnt > 1)
+ || (failed_bio->bi_io_vec->bv_len
+ > BTRFS_I(inode)->root->sectorsize))
read_mode = READ_SYNC | REQ_FAILFAST_DEV;
else
read_mode = READ_SYNC;
isector = start - btrfs_io_bio(failed_bio)->logical;
isector >>= inode->i_sb->s_blocksize_bits;
bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- 0, isector, repair_endio, repair_arg);
+ pgoff, isector, repair_endio, repair_arg);
if (!bio) {
free_io_failure(inode, failrec);
return -EIO;
static void btrfs_retry_endio_nocsum(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
+ struct inode *inode;
struct bio_vec *bvec;
int i;
if (bio->bi_error)
goto end;
+ ASSERT(bio->bi_vcnt == 1);
+ inode = bio->bi_io_vec->bv_page->mapping->host;
+ ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
+
done->uptodate = 1;
bio_for_each_segment_all(bvec, bio, i)
clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
static int __btrfs_correct_data_nocsum(struct inode *inode,
struct btrfs_io_bio *io_bio)
{
+ struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct btrfs_retry_complete done;
u64 start;
+ unsigned int pgoff;
+ u32 sectorsize;
+ int nr_sectors;
int i;
int ret;
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ sectorsize = BTRFS_I(inode)->root->sectorsize;
+
start = io_bio->logical;
done.inode = inode;
bio_for_each_segment_all(bvec, &io_bio->bio, i) {
- try_again:
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ pgoff = bvec->bv_offset;
+
+ next_block_or_try_again:
done.uptodate = 0;
done.start = start;
init_completion(&done.done);
- ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
- start + bvec->bv_len - 1,
- io_bio->mirror_num,
- btrfs_retry_endio_nocsum, &done);
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
+ pgoff, start, start + sectorsize - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio_nocsum, &done);
if (ret)
return ret;
if (!done.uptodate) {
/* We might have another mirror, so try again */
- goto try_again;
+ goto next_block_or_try_again;
}
- start += bvec->bv_len;
+ start += sectorsize;
+
+ if (nr_sectors--) {
+ pgoff += sectorsize;
+ goto next_block_or_try_again;
+ }
}
return 0;
{
struct btrfs_retry_complete *done = bio->bi_private;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct inode *inode;
struct bio_vec *bvec;
+ u64 start;
int uptodate;
int ret;
int i;
goto end;
uptodate = 1;
+
+ start = done->start;
+
+ ASSERT(bio->bi_vcnt == 1);
+ inode = bio->bi_io_vec->bv_page->mapping->host;
+ ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
+
bio_for_each_segment_all(bvec, bio, i) {
ret = __readpage_endio_check(done->inode, io_bio, i,
- bvec->bv_page, 0,
- done->start, bvec->bv_len);
+ bvec->bv_page, bvec->bv_offset,
+ done->start, bvec->bv_len);
if (!ret)
clean_io_failure(done->inode, done->start,
- bvec->bv_page, 0);
+ bvec->bv_page, bvec->bv_offset);
else
uptodate = 0;
}
static int __btrfs_subio_endio_read(struct inode *inode,
struct btrfs_io_bio *io_bio, int err)
{
+ struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct btrfs_retry_complete done;
u64 start;
u64 offset = 0;
+ u32 sectorsize;
+ int nr_sectors;
+ unsigned int pgoff;
+ int csum_pos;
int i;
int ret;
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ sectorsize = BTRFS_I(inode)->root->sectorsize;
+
err = 0;
start = io_bio->logical;
done.inode = inode;
bio_for_each_segment_all(bvec, &io_bio->bio, i) {
- ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
- 0, start, bvec->bv_len);
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+
+ pgoff = bvec->bv_offset;
+ next_block:
+ csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
+ ret = __readpage_endio_check(inode, io_bio, csum_pos,
+ bvec->bv_page, pgoff, start,
+ sectorsize);
if (likely(!ret))
goto next;
try_again:
done.start = start;
init_completion(&done.done);
- ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
- start + bvec->bv_len - 1,
- io_bio->mirror_num,
- btrfs_retry_endio, &done);
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
+ pgoff, start, start + sectorsize - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio, &done);
if (ret) {
err = ret;
goto next;
goto try_again;
}
next:
- offset += bvec->bv_len;
- start += bvec->bv_len;
+ offset += sectorsize;
+ start += sectorsize;
+
+ ASSERT(nr_sectors);
+
+ if (--nr_sectors) {
+ pgoff += sectorsize;
+ goto next_block;
+ }
}
return err;
u64 file_offset = dip->logical_offset;
u64 submit_len = 0;
u64 map_length;
- int nr_pages = 0;
- int ret;
+ u32 blocksize = root->sectorsize;
int async_submit = 0;
+ int nr_sectors;
+ int ret;
+ int i;
map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
atomic_inc(&dip->pending_bios);
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
- if (map_length < submit_len + bvec->bv_len ||
- bio_add_page(bio, bvec->bv_page, bvec->bv_len,
- bvec->bv_offset) < bvec->bv_len) {
+ nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
+ i = 0;
+ next_block:
+ if (unlikely(map_length < submit_len + blocksize ||
+ bio_add_page(bio, bvec->bv_page, blocksize,
+ bvec->bv_offset + (i * blocksize)) < blocksize)) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
file_offset += submit_len;
submit_len = 0;
- nr_pages = 0;
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
start_sector, GFP_NOFS);
bio_put(bio);
goto out_err;
}
+
+ goto next_block;
} else {
- submit_len += bvec->bv_len;
- nr_pages++;
+ submit_len += blocksize;
+ if (--nr_sectors) {
+ i++;
+ goto next_block;
+ }
bvec++;
}
}
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
relock = true;
}
ret = btrfs_delalloc_reserve_space(inode, offset, count);
if (wakeup)
inode_dio_end(inode);
if (relock)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
return ret;
}
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+ u64 start;
+ u64 end;
int inode_evicting = inode->i_state & I_FREEING;
/*
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ again:
+ start = page_start;
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ page_end - start + 1);
if (ordered) {
+ end = min(page_end, ordered->file_offset + ordered->len - 1);
/*
* IO on this page will never be started, so we need
* to account for any ordered extents now
*/
if (!inode_evicting)
- clear_extent_bit(tree, page_start, page_end,
+ clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state,
spin_lock_irq(&tree->lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- new_len = page_start - ordered->file_offset;
+ new_len = start - ordered->file_offset;
if (new_len < ordered->truncated_len)
ordered->truncated_len = new_len;
spin_unlock_irq(&tree->lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
- page_start,
- PAGE_CACHE_SIZE, 1))
+ start,
+ end - start + 1, 1))
btrfs_finish_ordered_io(ordered);
}
btrfs_put_ordered_extent(ordered);
if (!inode_evicting) {
cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end,
+ lock_extent_bits(tree, start, end,
&cached_state);
}
+
+ start = end + 1;
+ if (start < page_end)
+ goto again;
}
/*
loff_t size;
int ret;
int reserved = 0;
+ u64 reserved_space;
u64 page_start;
u64 page_end;
+ u64 end;
+
+ reserved_space = PAGE_CACHE_SIZE;
sb_start_pagefault(inode->i_sb);
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
+ end = page_end;
+ /*
+ * Reserving delalloc space after obtaining the page lock can lead to
+ * deadlock. For example, if a dirty page is locked by this function
+ * and the call to btrfs_delalloc_reserve_space() ends up triggering
+ * dirty page write out, then the btrfs_writepage() function could
+ * end up waiting indefinitely to get a lock on the page currently
+ * being processed by btrfs_page_mkwrite() function.
+ */
ret = btrfs_delalloc_reserve_space(inode, page_start,
- PAGE_CACHE_SIZE);
+ reserved_space);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
* we can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish
*/
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
if (ordered) {
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state, GFP_NOFS);
goto again;
}
+ if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
+ reserved_space = round_up(size - page_start, root->sectorsize);
+ if (reserved_space < PAGE_CACHE_SIZE) {
+ end = page_start + reserved_space - 1;
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_delalloc_release_space(inode, page_start,
+ PAGE_CACHE_SIZE - reserved_space);
+ }
+ }
+
/*
* XXX - page_mkwrite gets called every time the page is dirtied, even
* if it was already dirty, so for space accounting reasons we need to
* is probably a better way to do this, but for now keep consistent with
* prepare_pages in the normal write path.
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ ret = btrfs_set_extent_delalloc(inode, page_start, end,
&cached_state);
if (ret) {
unlock_extent_cached(io_tree, page_start, page_end,
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start, reserved_space);
out_noreserve:
sb_end_pagefault(inode->i_sb);
return ret;
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ init_once);
if (!btrfs_inode_cachep)
goto fail;
generic_fillattr(inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
- stat->blksize = PAGE_CACHE_SIZE;
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
btrfs_free_path(path);
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.update_time = btrfs_update_time,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ip_oldflags = ip->flags;
i_oldflags = inode->i_flags;
}
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mnt_drop_write_file(file);
return ret;
}
out_dput:
dput(dentry);
out_unlock:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return error;
}
ra_index += cluster;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = compress_type;
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
if (ret < 0) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out_ra;
}
defrag_count += ret;
balance_dirty_pages_ratelimited(inode->i_mapping);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (newer_than) {
if (newer_off == (u64)-1)
out_ra:
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
if (!file)
kfree(ra);
goto out_dput;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Don't allow to delete a subvolume with send in progress. This is
spin_unlock(&dest->root_item_lock);
}
out_unlock_inode:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!err) {
d_invalidate(dentry);
btrfs_invalidate_inodes(dest);
out_dput:
dput(dentry);
out_unlock_dir:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
out_drop_write:
mnt_drop_write_file(file);
out:
static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
{
- mutex_unlock(&inode1->i_mutex);
- mutex_unlock(&inode2->i_mutex);
+ inode_unlock(inode1);
+ inode_unlock(inode2);
}
static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
if (inode1 < inode2)
swap(inode1, inode2);
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(inode1, I_MUTEX_PARENT);
+ inode_lock_nested(inode2, I_MUTEX_CHILD);
}
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
flush_dcache_page(dst_page);
if (memcmp(addr, dst_addr, cmp_len))
- ret = BTRFS_SAME_DATA_DIFFERS;
+ ret = -EBADE;
kunmap_atomic(addr);
kunmap_atomic(dst_addr);
return 0;
if (same_inode) {
- mutex_lock(&src->i_mutex);
+ inode_lock(src);
ret = extent_same_check_offsets(src, loff, &len, olen);
if (ret)
btrfs_cmp_data_free(&cmp);
out_unlock:
if (same_inode)
- mutex_unlock(&src->i_mutex);
+ inode_unlock(src);
else
btrfs_double_inode_unlock(src, dst);
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-static long btrfs_ioctl_file_extent_same(struct file *file,
- struct btrfs_ioctl_same_args __user *argp)
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff)
{
- struct btrfs_ioctl_same_args *same = NULL;
- struct btrfs_ioctl_same_extent_info *info;
- struct inode *src = file_inode(file);
- u64 off;
- u64 len;
- int i;
- int ret;
- unsigned long size;
+ struct inode *src = file_inode(src_file);
+ struct inode *dst = file_inode(dst_file);
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- bool is_admin = capable(CAP_SYS_ADMIN);
- u16 count;
-
- if (!(file->f_mode & FMODE_READ))
- return -EINVAL;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- if (get_user(count, &argp->dest_count)) {
- ret = -EFAULT;
- goto out;
- }
-
- size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
-
- same = memdup_user(argp, size);
-
- if (IS_ERR(same)) {
- ret = PTR_ERR(same);
- same = NULL;
- goto out;
- }
+ ssize_t res;
- off = same->logical_offset;
- len = same->length;
-
- /*
- * Limit the total length we will dedupe for each operation.
- * This is intended to bound the total time spent in this
- * ioctl to something sane.
- */
- if (len > BTRFS_MAX_DEDUPE_LEN)
- len = BTRFS_MAX_DEDUPE_LEN;
+ if (olen > BTRFS_MAX_DEDUPE_LEN)
+ olen = BTRFS_MAX_DEDUPE_LEN;
if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
/*
* result, btrfs_cmp_data() won't correctly handle
* this situation without an update.
*/
- ret = -EINVAL;
- goto out;
- }
-
- ret = -EISDIR;
- if (S_ISDIR(src->i_mode))
- goto out;
-
- ret = -EACCES;
- if (!S_ISREG(src->i_mode))
- goto out;
-
- /* pre-format output fields to sane values */
- for (i = 0; i < count; i++) {
- same->info[i].bytes_deduped = 0ULL;
- same->info[i].status = 0;
- }
-
- for (i = 0, info = same->info; i < count; i++, info++) {
- struct inode *dst;
- struct fd dst_file = fdget(info->fd);
- if (!dst_file.file) {
- info->status = -EBADF;
- continue;
- }
- dst = file_inode(dst_file.file);
-
- if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
- info->status = -EINVAL;
- } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
- info->status = -EXDEV;
- } else if (S_ISDIR(dst->i_mode)) {
- info->status = -EISDIR;
- } else if (!S_ISREG(dst->i_mode)) {
- info->status = -EACCES;
- } else {
- info->status = btrfs_extent_same(src, off, len, dst,
- info->logical_offset);
- if (info->status == 0)
- info->bytes_deduped += len;
- }
- fdput(dst_file);
+ return -EINVAL;
}
- ret = copy_to_user(argp, same, size);
- if (ret)
- ret = -EFAULT;
-
-out:
- mnt_drop_write_file(file);
- kfree(same);
- return ret;
+ res = btrfs_extent_same(src, loff, olen, dst, dst_loff);
+ if (res)
+ return res;
+ return olen;
}
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+ u64 off, u64 olen, u64 destoff)
{
struct inode *inode = file_inode(file);
+ struct inode *src = file_inode(file_src);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct fd src_file;
- struct inode *src;
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
- int same_inode = 0;
+ int same_inode = src == inode;
/*
* TODO:
* be either compressed or non-compressed.
*/
- /* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
- return -EINVAL;
-
if (btrfs_root_readonly(root))
return -EROFS;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- src_file = fdget(srcfd);
- if (!src_file.file) {
- ret = -EBADF;
- goto out_drop_write;
- }
-
- ret = -EXDEV;
- if (src_file.file->f_path.mnt != file->f_path.mnt)
- goto out_fput;
-
- src = file_inode(src_file.file);
-
- ret = -EINVAL;
- if (src == inode)
- same_inode = 1;
-
- /* the src must be open for reading */
- if (!(src_file.file->f_mode & FMODE_READ))
- goto out_fput;
+ if (file_src->f_path.mnt != file->f_path.mnt ||
+ src->i_sb != inode->i_sb)
+ return -EXDEV;
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- goto out_fput;
+ return -EINVAL;
- ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
- goto out_fput;
-
- ret = -EXDEV;
- if (src->i_sb != inode->i_sb)
- goto out_fput;
+ return -EISDIR;
if (!same_inode) {
btrfs_double_inode_lock(src, inode);
} else {
- mutex_lock(&src->i_mutex);
+ inode_lock(src);
}
/* determine range to clone */
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data.
*/
- truncate_inode_pages_range(&inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len) - 1);
+ truncate_inode_pages_range(&inode->i_data,
+ round_down(destoff, PAGE_CACHE_SIZE),
+ round_up(destoff + len, PAGE_CACHE_SIZE) - 1);
out_unlock:
if (!same_inode)
btrfs_double_inode_unlock(src, inode);
else
- mutex_unlock(&src->i_mutex);
-out_fput:
- fdput(src_file);
-out_drop_write:
- mnt_drop_write_file(file);
+ inode_unlock(src);
return ret;
}
-static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
{
- struct btrfs_ioctl_clone_range_args args;
+ ssize_t ret;
- if (copy_from_user(&args, argp, sizeof(args)))
- return -EFAULT;
- return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
- args.src_length, args.dest_offset);
+ ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
+ if (ret == 0)
+ ret = len;
+ return ret;
+}
+
+int btrfs_clone_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, u64 len)
+{
+ return btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
/*
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
- case BTRFS_IOC_CLONE:
- return btrfs_ioctl_clone(file, arg, 0, 0, 0);
- case BTRFS_IOC_CLONE_RANGE:
- return btrfs_ioctl_clone_range(file, argp);
case BTRFS_IOC_TRANS_START:
return btrfs_ioctl_trans_start(file);
case BTRFS_IOC_TRANS_END:
return btrfs_ioctl_get_fslabel(file, argp);
case BTRFS_IOC_SET_FSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
- case BTRFS_IOC_FILE_EXTENT_SAME:
- return btrfs_ioctl_file_extent_same(file, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
return btrfs_ioctl_get_supported_features(file, argp);
case BTRFS_IOC_GET_FEATURES: