fs/zonefs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Simple file system for zoned block devices exposing zones as files.
   4  *
   5  * Copyright (C) 2022 Western Digital Corporation or its affiliates.
   6  */
   7 #include <linux/module.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/iomap.h>
  10 #include <linux/init.h>
  11 #include <linux/slab.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/statfs.h>
  14 #include <linux/writeback.h>
  15 #include <linux/quotaops.h>
  16 #include <linux/seq_file.h>
  17 #include <linux/parser.h>
  18 #include <linux/uio.h>
  19 #include <linux/mman.h>
  20 #include <linux/sched/mm.h>
  21 #include <linux/task_io_accounting_ops.h>
  22
  23 #include "zonefs.h"
  24
  25 #include "trace.h"
  26
  27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
  28                                    loff_t length, unsigned int flags,
  29                                    struct iomap *iomap, struct iomap *srcmap)
  30 {
  31         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  32         struct zonefs_zone *z = zonefs_inode_zone(inode);
  33         struct super_block *sb = inode->i_sb;
  34         loff_t isize;
  35
  36         /*
  37          * All blocks are always mapped below EOF. If reading past EOF,
  38          * act as if there is a hole up to the file maximum size.
  39          */
  40         mutex_lock(&zi->i_truncate_mutex);
  41         iomap->bdev = inode->i_sb->s_bdev;
  42         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  43         isize = i_size_read(inode);
  44         if (iomap->offset >= isize) {
  45                 iomap->type = IOMAP_HOLE;
  46                 iomap->addr = IOMAP_NULL_ADDR;
  47                 iomap->length = length;
  48         } else {
  49                 iomap->type = IOMAP_MAPPED;
  50                 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  51                 iomap->length = isize - iomap->offset;
  52         }
  53         mutex_unlock(&zi->i_truncate_mutex);
  54
  55         trace_zonefs_iomap_begin(inode, iomap);
  56
  57         return 0;
  58 }
  59
  60 static const struct iomap_ops zonefs_read_iomap_ops = {
  61         .iomap_begin    = zonefs_read_iomap_begin,
  62 };
  63
  64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
  65                                     loff_t length, unsigned int flags,
  66                                     struct iomap *iomap, struct iomap *srcmap)
  67 {
  68         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  69         struct zonefs_zone *z = zonefs_inode_zone(inode);
  70         struct super_block *sb = inode->i_sb;
  71         loff_t isize;
  72
  73         /* All write I/Os should always be within the file maximum size */
  74         if (WARN_ON_ONCE(offset + length > z->z_capacity))
  75                 return -EIO;
  76
  77         /*
  78          * Sequential zones can only accept direct writes. This is already
  79          * checked when writes are issued, so warn if we see a page writeback
  80          * operation.
  81          */
  82         if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
  83                 return -EIO;
  84
  85         /*
  86          * For conventional zones, all blocks are always mapped. For sequential
  87          * zones, all blocks after always mapped below the inode size (zone
  88          * write pointer) and unwriten beyond.
  89          */
  90         mutex_lock(&zi->i_truncate_mutex);
  91         iomap->bdev = inode->i_sb->s_bdev;
  92         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  93         iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  94         isize = i_size_read(inode);
  95         if (iomap->offset >= isize) {
  96                 iomap->type = IOMAP_UNWRITTEN;
  97                 iomap->length = z->z_capacity - iomap->offset;
  98         } else {
  99                 iomap->type = IOMAP_MAPPED;
 100                 iomap->length = isize - iomap->offset;
 101         }
 102         mutex_unlock(&zi->i_truncate_mutex);
 103
 104         trace_zonefs_iomap_begin(inode, iomap);
 105
 106         return 0;
 107 }
 108
 109 static const struct iomap_ops zonefs_write_iomap_ops = {
 110         .iomap_begin    = zonefs_write_iomap_begin,
 111 };
 112
 113 static int zonefs_read_folio(struct file *unused, struct folio *folio)
 114 {
 115         return iomap_read_folio(folio, &zonefs_read_iomap_ops);
 116 }
 117
 118 static void zonefs_readahead(struct readahead_control *rac)
 119 {
 120         iomap_readahead(rac, &zonefs_read_iomap_ops);
 121 }
 122
 123 /*
 124  * Map blocks for page writeback. This is used only on conventional zone files,
 125  * which implies that the page range can only be within the fixed inode size.
 126  */
 127 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
 128                                    struct inode *inode, loff_t offset)
 129 {
 130         struct zonefs_zone *z = zonefs_inode_zone(inode);
 131
 132         if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
 133                 return -EIO;
 134         if (WARN_ON_ONCE(offset >= i_size_read(inode)))
 135                 return -EIO;
 136
 137         /* If the mapping is already OK, nothing needs to be done */
 138         if (offset >= wpc->iomap.offset &&
 139             offset < wpc->iomap.offset + wpc->iomap.length)
 140                 return 0;
 141
 142         return zonefs_write_iomap_begin(inode, offset,
 143                                         z->z_capacity - offset,
 144                                         IOMAP_WRITE, &wpc->iomap, NULL);
 145 }
 146
 147 static const struct iomap_writeback_ops zonefs_writeback_ops = {
 148         .map_blocks             = zonefs_write_map_blocks,
 149 };
 150
 151 static int zonefs_writepages(struct address_space *mapping,
 152                              struct writeback_control *wbc)
 153 {
 154         struct iomap_writepage_ctx wpc = { };
 155
 156         return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
 157 }
 158
 159 static int zonefs_swap_activate(struct swap_info_struct *sis,
 160                                 struct file *swap_file, sector_t *span)
 161 {
 162         struct inode *inode = file_inode(swap_file);
 163
 164         if (zonefs_inode_is_seq(inode)) {
 165                 zonefs_err(inode->i_sb,
 166                            "swap file: not a conventional zone file\n");
 167                 return -EINVAL;
 168         }
 169
 170         return iomap_swapfile_activate(sis, swap_file, span,
 171                                        &zonefs_read_iomap_ops);
 172 }
 173
 174 const struct address_space_operations zonefs_file_aops = {
 175         .read_folio             = zonefs_read_folio,
 176         .readahead              = zonefs_readahead,
 177         .writepages             = zonefs_writepages,
 178         .dirty_folio            = filemap_dirty_folio,
 179         .release_folio          = iomap_release_folio,
 180         .invalidate_folio       = iomap_invalidate_folio,
 181         .migrate_folio          = filemap_migrate_folio,
 182         .is_partially_uptodate  = iomap_is_partially_uptodate,
 183         .error_remove_page      = generic_error_remove_page,
 184         .swap_activate          = zonefs_swap_activate,
 185 };
 186
 187 int zonefs_file_truncate(struct inode *inode, loff_t isize)
 188 {
 189         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 190         struct zonefs_zone *z = zonefs_inode_zone(inode);
 191         loff_t old_isize;
 192         enum req_op op;
 193         int ret = 0;
 194
 195         /*
 196          * Only sequential zone files can be truncated and truncation is allowed
 197          * only down to a 0 size, which is equivalent to a zone reset, and to
 198          * the maximum file size, which is equivalent to a zone finish.
 199          */
 200         if (!zonefs_zone_is_seq(z))
 201                 return -EPERM;
 202
 203         if (!isize)
 204                 op = REQ_OP_ZONE_RESET;
 205         else if (isize == z->z_capacity)
 206                 op = REQ_OP_ZONE_FINISH;
 207         else
 208                 return -EPERM;
 209
 210         inode_dio_wait(inode);
 211
 212         /* Serialize against page faults */
 213         filemap_invalidate_lock(inode->i_mapping);
 214
 215         /* Serialize against zonefs_iomap_begin() */
 216         mutex_lock(&zi->i_truncate_mutex);
 217
 218         old_isize = i_size_read(inode);
 219         if (isize == old_isize)
 220                 goto unlock;
 221
 222         ret = zonefs_inode_zone_mgmt(inode, op);
 223         if (ret)
 224                 goto unlock;
 225
 226         /*
 227          * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
 228          * take care of open zones.
 229          */
 230         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 231                 /*
 232                  * Truncating a zone to EMPTY or FULL is the equivalent of
 233                  * closing the zone. For a truncation to 0, we need to
 234                  * re-open the zone to ensure new writes can be processed.
 235                  * For a truncation to the maximum file size, the zone is
 236                  * closed and writes cannot be accepted anymore, so clear
 237                  * the open flag.
 238                  */
 239                 if (!isize)
 240                         ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 241                 else
 242                         z->z_flags &= ~ZONEFS_ZONE_OPEN;
 243         }
 244
 245         zonefs_update_stats(inode, isize);
 246         truncate_setsize(inode, isize);
 247         z->z_wpoffset = isize;
 248         zonefs_inode_account_active(inode);
 249
 250 unlock:
 251         mutex_unlock(&zi->i_truncate_mutex);
 252         filemap_invalidate_unlock(inode->i_mapping);
 253
 254         return ret;
 255 }
 256
 257 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
 258                              int datasync)
 259 {
 260         struct inode *inode = file_inode(file);
 261         int ret = 0;
 262
 263         if (unlikely(IS_IMMUTABLE(inode)))
 264                 return -EPERM;
 265
 266         /*
 267          * Since only direct writes are allowed in sequential files, page cache
 268          * flush is needed only for conventional zone files.
 269          */
 270         if (zonefs_inode_is_cnv(inode))
 271                 ret = file_write_and_wait_range(file, start, end);
 272         if (!ret)
 273                 ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 274
 275         if (ret)
 276                 zonefs_io_error(inode, true);
 277
 278         return ret;
 279 }
 280
 281 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
 282 {
 283         struct inode *inode = file_inode(vmf->vma->vm_file);
 284         vm_fault_t ret;
 285
 286         if (unlikely(IS_IMMUTABLE(inode)))
 287                 return VM_FAULT_SIGBUS;
 288
 289         /*
 290          * Sanity check: only conventional zone files can have shared
 291          * writeable mappings.
 292          */
 293         if (zonefs_inode_is_seq(inode))
 294                 return VM_FAULT_NOPAGE;
 295
 296         sb_start_pagefault(inode->i_sb);
 297         file_update_time(vmf->vma->vm_file);
 298
 299         /* Serialize against truncates */
 300         filemap_invalidate_lock_shared(inode->i_mapping);
 301         ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
 302         filemap_invalidate_unlock_shared(inode->i_mapping);
 303
 304         sb_end_pagefault(inode->i_sb);
 305         return ret;
 306 }
 307
 308 static const struct vm_operations_struct zonefs_file_vm_ops = {
 309         .fault          = filemap_fault,
 310         .map_pages      = filemap_map_pages,
 311         .page_mkwrite   = zonefs_filemap_page_mkwrite,
 312 };
 313
 314 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
 315 {
 316         /*
 317          * Conventional zones accept random writes, so their files can support
 318          * shared writable mappings. For sequential zone files, only read
 319          * mappings are possible since there are no guarantees for write
 320          * ordering between msync() and page cache writeback.
 321          */
 322         if (zonefs_inode_is_seq(file_inode(file)) &&
 323             (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 324                 return -EINVAL;
 325
 326         file_accessed(file);
 327         vma->vm_ops = &zonefs_file_vm_ops;
 328
 329         return 0;
 330 }
 331
 332 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
 333 {
 334         loff_t isize = i_size_read(file_inode(file));
 335
 336         /*
 337          * Seeks are limited to below the zone size for conventional zones
 338          * and below the zone write pointer for sequential zones. In both
 339          * cases, this limit is the inode size.
 340          */
 341         return generic_file_llseek_size(file, offset, whence, isize, isize);
 342 }
 343
 344 struct zonefs_zone_append_bio {
 345         /* The target inode of the BIO */
 346         struct inode *inode;
 347
 348         /* For sync writes, the target append write offset */
 349         u64 append_offset;
 350
 351         /*
 352          * This member must come last, bio_alloc_bioset will allocate enough
 353          * bytes for entire zonefs_bio but relies on bio being last.
 354          */
 355         struct bio bio;
 356 };
 357
 358 static inline struct zonefs_zone_append_bio *
 359 zonefs_zone_append_bio(struct bio *bio)
 360 {
 361         return container_of(bio, struct zonefs_zone_append_bio, bio);
 362 }
 363
 364 static void zonefs_file_zone_append_dio_bio_end_io(struct bio *bio)
 365 {
 366         struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio);
 367         struct zonefs_zone *z = zonefs_inode_zone(za_bio->inode);
 368         sector_t za_sector;
 369
 370         if (bio->bi_status != BLK_STS_OK)
 371                 goto bio_end;
 372
 373         /*
 374          * If the file zone was written underneath the file system, the zone
 375          * append operation can still succedd (if the zone is not full) but
 376          * the write append location will not be where we expect it to be.
 377          * Check that we wrote where we intended to, that is, at z->z_wpoffset.
 378          */
 379         za_sector = z->z_sector + (za_bio->append_offset >> SECTOR_SHIFT);
 380         if (bio->bi_iter.bi_sector != za_sector) {
 381                 zonefs_warn(za_bio->inode->i_sb,
 382                             "Invalid write sector %llu for zone at %llu\n",
 383                             bio->bi_iter.bi_sector, z->z_sector);
 384                 bio->bi_status = BLK_STS_IOERR;
 385         }
 386
 387 bio_end:
 388         iomap_dio_bio_end_io(bio);
 389 }
 390
 391 static void zonefs_file_zone_append_dio_submit_io(const struct iomap_iter *iter,
 392                                                   struct bio *bio,
 393                                                   loff_t file_offset)
 394 {
 395         struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio);
 396         struct inode *inode = iter->inode;
 397         struct zonefs_zone *z = zonefs_inode_zone(inode);
 398
 399         /*
 400          * Issue a zone append BIO to process sync dio writes. The append
 401          * file offset is saved to check the zone append write location
 402          * on completion of the BIO.
 403          */
 404         za_bio->inode = inode;
 405         za_bio->append_offset = file_offset;
 406
 407         bio->bi_opf &= ~REQ_OP_WRITE;
 408         bio->bi_opf |= REQ_OP_ZONE_APPEND;
 409         bio->bi_iter.bi_sector = z->z_sector;
 410         bio->bi_end_io = zonefs_file_zone_append_dio_bio_end_io;
 411
 412         submit_bio(bio);
 413 }
 414
 415 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
 416                                         int error, unsigned int flags)
 417 {
 418         struct inode *inode = file_inode(iocb->ki_filp);
 419         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 420
 421         if (error) {
 422                 zonefs_io_error(inode, true);
 423                 return error;
 424         }
 425
 426         if (size && zonefs_inode_is_seq(inode)) {
 427                 /*
 428                  * Note that we may be seeing completions out of order,
 429                  * but that is not a problem since a write completed
 430                  * successfully necessarily means that all preceding writes
 431                  * were also successful. So we can safely increase the inode
 432                  * size to the write end location.
 433                  */
 434                 mutex_lock(&zi->i_truncate_mutex);
 435                 if (i_size_read(inode) < iocb->ki_pos + size) {
 436                         zonefs_update_stats(inode, iocb->ki_pos + size);
 437                         zonefs_i_size_write(inode, iocb->ki_pos + size);
 438                 }
 439                 mutex_unlock(&zi->i_truncate_mutex);
 440         }
 441
 442         return 0;
 443 }
 444
 445 static struct bio_set zonefs_zone_append_bio_set;
 446
 447 static const struct iomap_dio_ops zonefs_zone_append_dio_ops = {
 448         .submit_io      = zonefs_file_zone_append_dio_submit_io,
 449         .end_io         = zonefs_file_write_dio_end_io,
 450         .bio_set        = &zonefs_zone_append_bio_set,
 451 };
 452
 453 static const struct iomap_dio_ops zonefs_write_dio_ops = {
 454         .end_io         = zonefs_file_write_dio_end_io,
 455 };
 456
 457 /*
 458  * Do not exceed the LFS limits nor the file zone size. If pos is under the
 459  * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 460  */
 461 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
 462                                         loff_t count)
 463 {
 464         struct inode *inode = file_inode(file);
 465         struct zonefs_zone *z = zonefs_inode_zone(inode);
 466         loff_t limit = rlimit(RLIMIT_FSIZE);
 467         loff_t max_size = z->z_capacity;
 468
 469         if (limit != RLIM_INFINITY) {
 470                 if (pos >= limit) {
 471                         send_sig(SIGXFSZ, current, 0);
 472                         return -EFBIG;
 473                 }
 474                 count = min(count, limit - pos);
 475         }
 476
 477         if (!(file->f_flags & O_LARGEFILE))
 478                 max_size = min_t(loff_t, MAX_NON_LFS, max_size);
 479
 480         if (unlikely(pos >= max_size))
 481                 return -EFBIG;
 482
 483         return min(count, max_size - pos);
 484 }
 485
 486 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
 487 {
 488         struct file *file = iocb->ki_filp;
 489         struct inode *inode = file_inode(file);
 490         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 491         struct zonefs_zone *z = zonefs_inode_zone(inode);
 492         loff_t count;
 493
 494         if (IS_SWAPFILE(inode))
 495                 return -ETXTBSY;
 496
 497         if (!iov_iter_count(from))
 498                 return 0;
 499
 500         if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 501                 return -EINVAL;
 502
 503         if (iocb->ki_flags & IOCB_APPEND) {
 504                 if (zonefs_zone_is_cnv(z))
 505                         return -EINVAL;
 506                 mutex_lock(&zi->i_truncate_mutex);
 507                 iocb->ki_pos = z->z_wpoffset;
 508                 mutex_unlock(&zi->i_truncate_mutex);
 509         }
 510
 511         count = zonefs_write_check_limits(file, iocb->ki_pos,
 512                                           iov_iter_count(from));
 513         if (count < 0)
 514                 return count;
 515
 516         iov_iter_truncate(from, count);
 517         return iov_iter_count(from);
 518 }
 519
 520 /*
 521  * Handle direct writes. For sequential zone files, this is the only possible
 522  * write path. For these files, check that the user is issuing writes
 523  * sequentially from the end of the file. This code assumes that the block layer
 524  * delivers write requests to the device in sequential order. This is always the
 525  * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 526  * elevator feature is being used (e.g. mq-deadline). The block layer always
 527  * automatically select such an elevator for zoned block devices during the
 528  * device initialization.
 529  */
 530 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 531 {
 532         struct inode *inode = file_inode(iocb->ki_filp);
 533         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 534         struct zonefs_zone *z = zonefs_inode_zone(inode);
 535         struct super_block *sb = inode->i_sb;
 536         const struct iomap_dio_ops *dio_ops;
 537         bool sync = is_sync_kiocb(iocb);
 538         bool append = false;
 539         ssize_t ret, count;
 540
 541         /*
 542          * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
 543          * as this can cause write reordering (e.g. the first aio gets EAGAIN
 544          * on the inode lock but the second goes through but is now unaligned).
 545          */
 546         if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
 547                 return -EOPNOTSUPP;
 548
 549         if (iocb->ki_flags & IOCB_NOWAIT) {
 550                 if (!inode_trylock(inode))
 551                         return -EAGAIN;
 552         } else {
 553                 inode_lock(inode);
 554         }
 555
 556         count = zonefs_write_checks(iocb, from);
 557         if (count <= 0) {
 558                 ret = count;
 559                 goto inode_unlock;
 560         }
 561
 562         if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 563                 ret = -EINVAL;
 564                 goto inode_unlock;
 565         }
 566
 567         /* Enforce sequential writes (append only) in sequential zones */
 568         if (zonefs_zone_is_seq(z)) {
 569                 mutex_lock(&zi->i_truncate_mutex);
 570                 if (iocb->ki_pos != z->z_wpoffset) {
 571                         mutex_unlock(&zi->i_truncate_mutex);
 572                         ret = -EINVAL;
 573                         goto inode_unlock;
 574                 }
 575                 mutex_unlock(&zi->i_truncate_mutex);
 576                 append = sync;
 577         }
 578
 579         if (append) {
 580                 unsigned int max = bdev_max_zone_append_sectors(sb->s_bdev);
 581
 582                 max = ALIGN_DOWN(max << SECTOR_SHIFT, sb->s_blocksize);
 583                 iov_iter_truncate(from, max);
 584
 585                 dio_ops = &zonefs_zone_append_dio_ops;
 586         } else {
 587                 dio_ops = &zonefs_write_dio_ops;
 588         }
 589
 590         /*
 591          * iomap_dio_rw() may return ENOTBLK if there was an issue with
 592          * page invalidation. Overwrite that error code with EBUSY so that
 593          * the user can make sense of the error.
 594          */
 595         ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
 596                            dio_ops, 0, NULL, 0);
 597         if (ret == -ENOTBLK)
 598                 ret = -EBUSY;
 599
 600         if (zonefs_zone_is_seq(z) &&
 601             (ret > 0 || ret == -EIOCBQUEUED)) {
 602                 if (ret > 0)
 603                         count = ret;
 604
 605                 /*
 606                  * Update the zone write pointer offset assuming the write
 607                  * operation succeeded. If it did not, the error recovery path
 608                  * will correct it. Also do active seq file accounting.
 609                  */
 610                 mutex_lock(&zi->i_truncate_mutex);
 611                 z->z_wpoffset += count;
 612                 zonefs_inode_account_active(inode);
 613                 mutex_unlock(&zi->i_truncate_mutex);
 614         }
 615
 616 inode_unlock:
 617         inode_unlock(inode);
 618
 619         return ret;
 620 }
 621
 622 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 623                                           struct iov_iter *from)
 624 {
 625         struct inode *inode = file_inode(iocb->ki_filp);
 626         ssize_t ret;
 627
 628         /*
 629          * Direct IO writes are mandatory for sequential zone files so that the
 630          * write IO issuing order is preserved.
 631          */
 632         if (zonefs_inode_is_seq(inode))
 633                 return -EIO;
 634
 635         if (iocb->ki_flags & IOCB_NOWAIT) {
 636                 if (!inode_trylock(inode))
 637                         return -EAGAIN;
 638         } else {
 639                 inode_lock(inode);
 640         }
 641
 642         ret = zonefs_write_checks(iocb, from);
 643         if (ret <= 0)
 644                 goto inode_unlock;
 645
 646         ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
 647         if (ret == -EIO)
 648                 zonefs_io_error(inode, true);
 649
 650 inode_unlock:
 651         inode_unlock(inode);
 652         if (ret > 0)
 653                 ret = generic_write_sync(iocb, ret);
 654
 655         return ret;
 656 }
 657
 658 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 659 {
 660         struct inode *inode = file_inode(iocb->ki_filp);
 661         struct zonefs_zone *z = zonefs_inode_zone(inode);
 662
 663         if (unlikely(IS_IMMUTABLE(inode)))
 664                 return -EPERM;
 665
 666         if (sb_rdonly(inode->i_sb))
 667                 return -EROFS;
 668
 669         /* Write operations beyond the zone capacity are not allowed */
 670         if (iocb->ki_pos >= z->z_capacity)
 671                 return -EFBIG;
 672
 673         if (iocb->ki_flags & IOCB_DIRECT) {
 674                 ssize_t ret = zonefs_file_dio_write(iocb, from);
 675
 676                 if (ret != -ENOTBLK)
 677                         return ret;
 678         }
 679
 680         return zonefs_file_buffered_write(iocb, from);
 681 }
 682
 683 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
 684                                        int error, unsigned int flags)
 685 {
 686         if (error) {
 687                 zonefs_io_error(file_inode(iocb->ki_filp), false);
 688                 return error;
 689         }
 690
 691         return 0;
 692 }
 693
 694 static const struct iomap_dio_ops zonefs_read_dio_ops = {
 695         .end_io                 = zonefs_file_read_dio_end_io,
 696 };
 697
 698 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 699 {
 700         struct inode *inode = file_inode(iocb->ki_filp);
 701         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 702         struct zonefs_zone *z = zonefs_inode_zone(inode);
 703         struct super_block *sb = inode->i_sb;
 704         loff_t isize;
 705         ssize_t ret;
 706
 707         /* Offline zones cannot be read */
 708         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 709                 return -EPERM;
 710
 711         if (iocb->ki_pos >= z->z_capacity)
 712                 return 0;
 713
 714         if (iocb->ki_flags & IOCB_NOWAIT) {
 715                 if (!inode_trylock_shared(inode))
 716                         return -EAGAIN;
 717         } else {
 718                 inode_lock_shared(inode);
 719         }
 720
 721         /* Limit read operations to written data */
 722         mutex_lock(&zi->i_truncate_mutex);
 723         isize = i_size_read(inode);
 724         if (iocb->ki_pos >= isize) {
 725                 mutex_unlock(&zi->i_truncate_mutex);
 726                 ret = 0;
 727                 goto inode_unlock;
 728         }
 729         iov_iter_truncate(to, isize - iocb->ki_pos);
 730         mutex_unlock(&zi->i_truncate_mutex);
 731
 732         if (iocb->ki_flags & IOCB_DIRECT) {
 733                 size_t count = iov_iter_count(to);
 734
 735                 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 736                         ret = -EINVAL;
 737                         goto inode_unlock;
 738                 }
 739                 file_accessed(iocb->ki_filp);
 740                 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
 741                                    &zonefs_read_dio_ops, 0, NULL, 0);
 742         } else {
 743                 ret = generic_file_read_iter(iocb, to);
 744                 if (ret == -EIO)
 745                         zonefs_io_error(inode, false);
 746         }
 747
 748 inode_unlock:
 749         inode_unlock_shared(inode);
 750
 751         return ret;
 752 }
 753
 754 static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos,
 755                                        struct pipe_inode_info *pipe,
 756                                        size_t len, unsigned int flags)
 757 {
 758         struct inode *inode = file_inode(in);
 759         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 760         struct zonefs_zone *z = zonefs_inode_zone(inode);
 761         loff_t isize;
 762         ssize_t ret = 0;
 763
 764         /* Offline zones cannot be read */
 765         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 766                 return -EPERM;
 767
 768         if (*ppos >= z->z_capacity)
 769                 return 0;
 770
 771         inode_lock_shared(inode);
 772
 773         /* Limit read operations to written data */
 774         mutex_lock(&zi->i_truncate_mutex);
 775         isize = i_size_read(inode);
 776         if (*ppos >= isize)
 777                 len = 0;
 778         else
 779                 len = min_t(loff_t, len, isize - *ppos);
 780         mutex_unlock(&zi->i_truncate_mutex);
 781
 782         if (len > 0) {
 783                 ret = filemap_splice_read(in, ppos, pipe, len, flags);
 784                 if (ret == -EIO)
 785                         zonefs_io_error(inode, false);
 786         }
 787
 788         inode_unlock_shared(inode);
 789         return ret;
 790 }
 791
 792 /*
 793  * Write open accounting is done only for sequential files.
 794  */
 795 static inline bool zonefs_seq_file_need_wro(struct inode *inode,
 796                                             struct file *file)
 797 {
 798         if (zonefs_inode_is_cnv(inode))
 799                 return false;
 800
 801         if (!(file->f_mode & FMODE_WRITE))
 802                 return false;
 803
 804         return true;
 805 }
 806
 807 static int zonefs_seq_file_write_open(struct inode *inode)
 808 {
 809         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 810         struct zonefs_zone *z = zonefs_inode_zone(inode);
 811         int ret = 0;
 812
 813         mutex_lock(&zi->i_truncate_mutex);
 814
 815         if (!zi->i_wr_refcnt) {
 816                 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 817                 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
 818
 819                 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
 820
 821                         if (sbi->s_max_wro_seq_files
 822                             && wro > sbi->s_max_wro_seq_files) {
 823                                 atomic_dec(&sbi->s_wro_seq_files);
 824                                 ret = -EBUSY;
 825                                 goto unlock;
 826                         }
 827
 828                         if (i_size_read(inode) < z->z_capacity) {
 829                                 ret = zonefs_inode_zone_mgmt(inode,
 830                                                              REQ_OP_ZONE_OPEN);
 831                                 if (ret) {
 832                                         atomic_dec(&sbi->s_wro_seq_files);
 833                                         goto unlock;
 834                                 }
 835                                 z->z_flags |= ZONEFS_ZONE_OPEN;
 836                                 zonefs_inode_account_active(inode);
 837                         }
 838                 }
 839         }
 840
 841         zi->i_wr_refcnt++;
 842
 843 unlock:
 844         mutex_unlock(&zi->i_truncate_mutex);
 845
 846         return ret;
 847 }
 848
 849 static int zonefs_file_open(struct inode *inode, struct file *file)
 850 {
 851         int ret;
 852
 853         file->f_mode |= FMODE_CAN_ODIRECT;
 854         ret = generic_file_open(inode, file);
 855         if (ret)
 856                 return ret;
 857
 858         if (zonefs_seq_file_need_wro(inode, file))
 859                 return zonefs_seq_file_write_open(inode);
 860
 861         return 0;
 862 }
 863
 864 static void zonefs_seq_file_write_close(struct inode *inode)
 865 {
 866         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 867         struct zonefs_zone *z = zonefs_inode_zone(inode);
 868         struct super_block *sb = inode->i_sb;
 869         struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 870         int ret = 0;
 871
 872         mutex_lock(&zi->i_truncate_mutex);
 873
 874         zi->i_wr_refcnt--;
 875         if (zi->i_wr_refcnt)
 876                 goto unlock;
 877
 878         /*
 879          * The file zone may not be open anymore (e.g. the file was truncated to
 880          * its maximum size or it was fully written). For this case, we only
 881          * need to decrement the write open count.
 882          */
 883         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 884                 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
 885                 if (ret) {
 886                         __zonefs_io_error(inode, false);
 887                         /*
 888                          * Leaving zones explicitly open may lead to a state
 889                          * where most zones cannot be written (zone resources
 890                          * exhausted). So take preventive action by remounting
 891                          * read-only.
 892                          */
 893                         if (z->z_flags & ZONEFS_ZONE_OPEN &&
 894                             !(sb->s_flags & SB_RDONLY)) {
 895                                 zonefs_warn(sb,
 896                                         "closing zone at %llu failed %d\n",
 897                                         z->z_sector, ret);
 898                                 zonefs_warn(sb,
 899                                         "remounting filesystem read-only\n");
 900                                 sb->s_flags |= SB_RDONLY;
 901                         }
 902                         goto unlock;
 903                 }
 904
 905                 z->z_flags &= ~ZONEFS_ZONE_OPEN;
 906                 zonefs_inode_account_active(inode);
 907         }
 908
 909         atomic_dec(&sbi->s_wro_seq_files);
 910
 911 unlock:
 912         mutex_unlock(&zi->i_truncate_mutex);
 913 }
 914
 915 static int zonefs_file_release(struct inode *inode, struct file *file)
 916 {
 917         /*
 918          * If we explicitly open a zone we must close it again as well, but the
 919          * zone management operation can fail (either due to an IO error or as
 920          * the zone has gone offline or read-only). Make sure we don't fail the
 921          * close(2) for user-space.
 922          */
 923         if (zonefs_seq_file_need_wro(inode, file))
 924                 zonefs_seq_file_write_close(inode);
 925
 926         return 0;
 927 }
 928
 929 const struct file_operations zonefs_file_operations = {
 930         .open           = zonefs_file_open,
 931         .release        = zonefs_file_release,
 932         .fsync          = zonefs_file_fsync,
 933         .mmap           = zonefs_file_mmap,
 934         .llseek         = zonefs_file_llseek,
 935         .read_iter      = zonefs_file_read_iter,
 936         .write_iter     = zonefs_file_write_iter,
 937         .splice_read    = zonefs_file_splice_read,
 938         .splice_write   = iter_file_splice_write,
 939         .iopoll         = iocb_bio_iopoll,
 940 };
 941
 942 int zonefs_file_bioset_init(void)
 943 {
 944         return bioset_init(&zonefs_zone_append_bio_set, BIO_POOL_SIZE,
 945                            offsetof(struct zonefs_zone_append_bio, bio),
 946                            BIOSET_NEED_BVECS);
 947 }
 948
 949 void zonefs_file_bioset_exit(void)
 950 {
 951         bioset_exit(&zonefs_zone_append_bio_set);
 952 }