Git Repo - linux.git/commitdiff
Merge tag 'ceph-for-5.16-rc1' of git://github.com/ceph/ceph-client
author Linus Torvalds <[email protected]>
Sat, 13 Nov 2021 19:31:07 +0000 (11:31 -0800)
committer Linus Torvalds <[email protected]>
Sat, 13 Nov 2021 19:31:07 +0000 (11:31 -0800)
Pull ceph updates from Ilya Dryomov:
 "One notable change here is that async creates and unlinks introduced
  in 5.7 are now enabled by default. This should greatly speed up things
  like rm, tar and rsync. To opt out, wsync mount option can be used.

  Other than that we have a pile of bug fixes all across the filesystem
  from Jeff, Xiubo and Kotresh and a metrics infrastructure rework from
  Luis"

* tag 'ceph-for-5.16-rc1' of git://github.com/ceph/ceph-client:
  ceph: add a new metric to keep track of remote object copies
  libceph, ceph: move ceph_osdc_copy_from() into cephfs code
  ceph: clean-up metrics data structures to reduce code duplication
  ceph: split 'metric' debugfs file into several files
  ceph: return the real size read when it hits EOF
  ceph: properly handle statfs on multifs setups
  ceph: shut down mount on bad mdsmap or fsmap decode
  ceph: fix mdsmap decode when there are MDS's beyond max_mds
  ceph: ignore the truncate when size won't change with Fx caps issued
  ceph: don't rely on error_string to validate blocklisted session.
  ceph: just use ci->i_version for fscache aux info
  ceph: shut down access to inode when async create fails
  ceph: refactor remove_session_caps_cb
  ceph: fix auth cap handling logic in remove_session_caps_cb
  ceph: drop private list from remove_session_caps_cb
  ceph: don't use -ESTALE as special return code in try_get_cap_refs
  ceph: print inode numbers instead of pointer values
  ceph: enable async dirops by default
  libceph: drop ->monmap and err initialization
  ceph: convert to noop_direct_IO

1  2 
fs/ceph/addr.c
fs/ceph/file.c
fs/ceph/locks.c

diff --combined fs/ceph/addr.c
index 04bbe853bcb1a9b566f703d904d6474fec1d4139,b39aebc2ed9528c0ab5fbbf65e1f87195ede0cb5..e53c8541f5b234ffe3a0009707be119d60fc34f7
@@@ -63,7 -63,7 +63,7 @@@
         (CONGESTION_ON_THRESH(congestion_kb) >> 2))
  
  static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
 -                                      struct page *page, void **_fsdata);
 +                                      struct folio *folio, void **_fsdata);
  
  static inline struct ceph_snap_context *page_snap_context(struct page *page)
  {
@@@ -317,14 -317,13 +317,14 @@@ static const struct netfs_read_request_
  };
  
  /* read a single page, without unlocking it. */
 -static int ceph_readpage(struct file *file, struct page *page)
 +static int ceph_readpage(struct file *file, struct page *subpage)
  {
 +      struct folio *folio = page_folio(subpage);
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_vino vino = ceph_vino(inode);
 -      u64 off = page_offset(page);
 -      u64 len = thp_size(page);
 +      size_t len = folio_size(folio);
 +      u64 off = folio_file_pos(folio);
  
        if (ci->i_inline_version != CEPH_INLINE_NONE) {
                /*
                 * into page cache while getting Fcr caps.
                 */
                if (off == 0) {
 -                      unlock_page(page);
 +                      folio_unlock(folio);
                        return -EINVAL;
                }
 -              zero_user_segment(page, 0, thp_size(page));
 -              SetPageUptodate(page);
 -              unlock_page(page);
 +              zero_user_segment(&folio->page, 0, folio_size(folio));
 +              folio_mark_uptodate(folio);
 +              folio_unlock(folio);
                return 0;
        }
  
 -      dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
 -           vino.ino, vino.snap, file, off, len, page, page->index);
 +      dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
 +           vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
  
 -      return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL);
 +      return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
  }
  
  static void ceph_readahead(struct readahead_control *ractl)
@@@ -725,7 -724,7 +725,7 @@@ static int ceph_writepages_start(struc
             wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
  
-       if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
+       if (ceph_inode_is_shutdown(inode)) {
                if (ci->i_wrbuffer_ref > 0) {
                        pr_warn_ratelimited(
                                "writepage_start %p %lld forced umount\n",
@@@ -1146,12 -1145,12 +1146,12 @@@ static struct ceph_snap_context 
  ceph_find_incompatible(struct page *page)
  {
        struct inode *inode = page->mapping->host;
-       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_inode_info *ci = ceph_inode(inode);
  
-       if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
-               dout(" page %p forced umount\n", page);
-               return ERR_PTR(-EIO);
+       if (ceph_inode_is_shutdown(inode)) {
+               dout(" page %p %llx:%llx is shutdown\n", page,
+                    ceph_vinop(inode));
+               return ERR_PTR(-ESTALE);
        }
  
        for (;;) {
  }
  
  static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
 -                                      struct page *page, void **_fsdata)
 +                                      struct folio *folio, void **_fsdata)
  {
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_snap_context *snapc;
  
 -      snapc = ceph_find_incompatible(page);
 +      snapc = ceph_find_incompatible(folio_page(folio, 0));
        if (snapc) {
                int r;
  
 -              unlock_page(page);
 -              put_page(page);
 +              folio_unlock(folio);
 +              folio_put(folio);
                if (IS_ERR(snapc))
                        return PTR_ERR(snapc);
  
   * clean, or already dirty within the same snap context.
   */
  static int ceph_write_begin(struct file *file, struct address_space *mapping,
 -                          loff_t pos, unsigned len, unsigned flags,
 +                          loff_t pos, unsigned len, unsigned aop_flags,
                            struct page **pagep, void **fsdata)
  {
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
 -      struct page *page = NULL;
 +      struct folio *folio = NULL;
        pgoff_t index = pos >> PAGE_SHIFT;
        int r;
  
         * for inline_version sent to the MDS.
         */
        if (ci->i_inline_version != CEPH_INLINE_NONE) {
 -              page = grab_cache_page_write_begin(mapping, index, flags);
 -              if (!page)
 +              unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
 +              if (aop_flags & AOP_FLAG_NOFS)
 +                      fgp_flags |= FGP_NOFS;
 +              folio = __filemap_get_folio(mapping, index, fgp_flags,
 +                                          mapping_gfp_mask(mapping));
 +              if (!folio)
                        return -ENOMEM;
  
                /*
                 * The inline_version on a new inode is set to 1. If that's the
 -               * case, then the page is brand new and isn't yet Uptodate.
 +               * case, then the folio is brand new and isn't yet Uptodate.
                 */
                r = 0;
                if (index == 0 && ci->i_inline_version != 1) {
 -                      if (!PageUptodate(page)) {
 +                      if (!folio_test_uptodate(folio)) {
                                WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
                                          ci->i_inline_version);
                                r = -EINVAL;
                        }
                        goto out;
                }
 -              zero_user_segment(page, 0, thp_size(page));
 -              SetPageUptodate(page);
 +              zero_user_segment(&folio->page, 0, folio_size(folio));
 +              folio_mark_uptodate(folio);
                goto out;
        }
  
 -      r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL,
 +      r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
                              &ceph_netfs_read_ops, NULL);
  out:
        if (r == 0)
 -              wait_on_page_fscache(page);
 +              folio_wait_fscache(folio);
        if (r < 0) {
 -              if (page)
 -                      put_page(page);
 +              if (folio)
 +                      folio_put(folio);
        } else {
 -              WARN_ON_ONCE(!PageLocked(page));
 -              *pagep = page;
 +              WARN_ON_ONCE(!folio_test_locked(folio));
 +              *pagep = &folio->page;
        }
        return r;
  }
   */
  static int ceph_write_end(struct file *file, struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
 -                        struct page *page, void *fsdata)
 +                        struct page *subpage, void *fsdata)
  {
 +      struct folio *folio = page_folio(subpage);
        struct inode *inode = file_inode(file);
        bool check_cap = false;
  
 -      dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
 -           inode, page, (int)pos, (int)copied, (int)len);
 +      dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
 +           inode, folio, (int)pos, (int)copied, (int)len);
  
 -      if (!PageUptodate(page)) {
 +      if (!folio_test_uptodate(folio)) {
                /* just return that nothing was copied on a short copy */
                if (copied < len) {
                        copied = 0;
                        goto out;
                }
 -              SetPageUptodate(page);
 +              folio_mark_uptodate(folio);
        }
  
        /* did file size increase? */
        if (pos+copied > i_size_read(inode))
                check_cap = ceph_inode_set_size(inode, pos+copied);
  
 -      set_page_dirty(page);
 +      folio_mark_dirty(folio);
  
  out:
 -      unlock_page(page);
 -      put_page(page);
 +      folio_unlock(folio);
 +      folio_put(folio);
  
        if (check_cap)
                ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
        return copied;
  }
  
- /*
-  * we set .direct_IO to indicate direct io is supported, but since we
-  * intercept O_DIRECT reads and writes early, this function should
-  * never get called.
-  */
- static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
- {
-       WARN_ON(1);
-       return -EINVAL;
- }
  const struct address_space_operations ceph_aops = {
        .readpage = ceph_readpage,
        .readahead = ceph_readahead,
        .set_page_dirty = ceph_set_page_dirty,
        .invalidatepage = ceph_invalidatepage,
        .releasepage = ceph_releasepage,
-       .direct_IO = ceph_direct_io,
+       .direct_IO = noop_direct_IO,
  };
  
  static void ceph_block_sigs(sigset_t *oldset)
@@@ -1362,6 -1345,9 +1351,9 @@@ static vm_fault_t ceph_filemap_fault(st
        sigset_t oldset;
        vm_fault_t ret = VM_FAULT_SIGBUS;
  
+       if (ceph_inode_is_shutdown(inode))
+               return ret;
        ceph_block_sigs(&oldset);
  
        dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
@@@ -1453,6 -1439,9 +1445,9 @@@ static vm_fault_t ceph_page_mkwrite(str
        sigset_t oldset;
        vm_fault_t ret = VM_FAULT_SIGBUS;
  
+       if (ceph_inode_is_shutdown(inode))
+               return ret;
        prealloc_cf = ceph_alloc_cap_flush();
        if (!prealloc_cf)
                return VM_FAULT_OOM;
diff --combined fs/ceph/file.c
index b129ea551378c222d657f286515563d1f13576e6,220a41831b46f935b8f3d6f1d797a9db0d4fab9f..02a0a0fd9ccd51c7f4d11b60c875eaa081d0e2a6
@@@ -525,6 -525,7 +525,7 @@@ static void ceph_async_create_cb(struc
  
        if (result) {
                struct dentry *dentry = req->r_dentry;
+               struct inode *inode = d_inode(dentry);
                int pathlen = 0;
                u64 base = 0;
                char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
                if (!d_unhashed(dentry))
                        d_drop(dentry);
  
-               /* FIXME: start returning I/O errors on all accesses? */
+               ceph_inode_shutdown(inode);
                pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
                        base, IS_ERR(path) ? "<<bad>>" : path, result);
                ceph_mdsc_free_path(path, pathlen);
                }
                ceph_kick_flushing_inode_caps(req->r_session, ci);
                spin_unlock(&ci->i_ceph_lock);
-       } else {
+       } else if (!result) {
                pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
                        req->r_deleg_ino);
        }
@@@ -845,6 -847,7 +847,7 @@@ static ssize_t ceph_sync_read(struct ki
        ssize_t ret;
        u64 off = iocb->ki_pos;
        u64 len = iov_iter_count(to);
+       u64 i_size;
  
        dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
                struct page **pages;
                int num_pages;
                size_t page_off;
-               u64 i_size;
                bool more;
                int idx;
                size_t left;
        }
  
        if (off > iocb->ki_pos) {
-               if (ret >= 0 &&
-                   iov_iter_count(to) > 0 && off >= i_size_read(inode))
+               if (off >= i_size) {
                        *retry_op = CHECK_EOF;
-               ret = off - iocb->ki_pos;
-               iocb->ki_pos = off;
+                       ret = i_size - iocb->ki_pos;
+                       iocb->ki_pos = i_size;
+               } else {
+                       ret = off - iocb->ki_pos;
+                       iocb->ki_pos = off;
+               }
        }
  
        dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
@@@ -1022,7 -1027,7 +1027,7 @@@ static void ceph_aio_complete(struct in
        ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
                                                CEPH_CAP_FILE_RD));
  
 -      aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
 +      aio_req->iocb->ki_complete(aio_req->iocb, ret);
  
        ceph_free_cap_flush(aio_req->prealloc_cf);
        kfree(aio_req);
@@@ -1526,6 -1531,9 +1531,9 @@@ again
        dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
             inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
  
+       if (ceph_inode_is_shutdown(inode))
+               return -ESTALE;
        if (direct_lock)
                ceph_start_io_direct(inode);
        else
@@@ -1678,6 -1686,9 +1686,9 @@@ static ssize_t ceph_write_iter(struct k
        loff_t pos;
        loff_t limit = max(i_size_read(inode), fsc->max_file_size);
  
+       if (ceph_inode_is_shutdown(inode))
+               return -ESTALE;
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;
  
@@@ -2200,6 -2211,54 +2211,54 @@@ static int is_file_size_ok(struct inod
        return 0;
  }
  
+ static struct ceph_osd_request *
+ ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc,
+                           u64 src_snapid,
+                           struct ceph_object_id *src_oid,
+                           struct ceph_object_locator *src_oloc,
+                           struct ceph_object_id *dst_oid,
+                           struct ceph_object_locator *dst_oloc,
+                           u32 truncate_seq, u64 truncate_size)
+ {
+       struct ceph_osd_request *req;
+       int ret;
+       u32 src_fadvise_flags =
+               CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+               CEPH_OSD_OP_FLAG_FADVISE_NOCACHE;
+       u32 dst_fadvise_flags =
+               CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+               CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+       if (!req)
+               return ERR_PTR(-ENOMEM);
+       req->r_flags = CEPH_OSD_FLAG_WRITE;
+       ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
+       ceph_oid_copy(&req->r_t.base_oid, dst_oid);
+       ret = osd_req_op_copy_from_init(req, src_snapid, 0,
+                                       src_oid, src_oloc,
+                                       src_fadvise_flags,
+                                       dst_fadvise_flags,
+                                       truncate_seq,
+                                       truncate_size,
+                                       CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+       if (ret)
+               goto out;
+       ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
+       if (ret)
+               goto out;
+       return req;
+ out:
+       ceph_osdc_put_request(req);
+       return ERR_PTR(ret);
+ }
  static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
                                    struct ceph_inode_info *dst_ci, u64 *dst_off,
                                    struct ceph_fs_client *fsc,
  {
        struct ceph_object_locator src_oloc, dst_oloc;
        struct ceph_object_id src_oid, dst_oid;
+       struct ceph_osd_client *osdc;
+       struct ceph_osd_request *req;
        size_t bytes = 0;
        u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
        u32 src_objlen, dst_objlen;
        src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
        dst_oloc.pool = dst_ci->i_layout.pool_id;
        dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+       osdc = &fsc->client->osdc;
  
        while (len >= object_size) {
                ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
                ceph_oid_printf(&dst_oid, "%llx.%08llx",
                                dst_ci->i_vino.ino, dst_objnum);
                /* Do an object remote copy */
-               ret = ceph_osdc_copy_from(&fsc->client->osdc,
-                                         src_ci->i_vino.snap, 0,
-                                         &src_oid, &src_oloc,
-                                         CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
-                                         CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
-                                         &dst_oid, &dst_oloc,
-                                         CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
-                                         CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
-                                         dst_ci->i_truncate_seq,
-                                         dst_ci->i_truncate_size,
-                                         CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+               req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap,
+                                                 &src_oid, &src_oloc,
+                                                 &dst_oid, &dst_oloc,
+                                                 dst_ci->i_truncate_seq,
+                                                 dst_ci->i_truncate_size);
+               if (IS_ERR(req))
+                       ret = PTR_ERR(req);
+               else {
+                       ceph_osdc_start_request(osdc, req, false);
+                       ret = ceph_osdc_wait_request(osdc, req);
+                       ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
+                                                    req->r_start_latency,
+                                                    req->r_end_latency,
+                                                    object_size, ret);
+                       ceph_osdc_put_request(req);
+               }
                if (ret) {
                        if (ret == -EOPNOTSUPP) {
                                fsc->have_copy_from2 = false;
diff --combined fs/ceph/locks.c
index d8c31069fbf2b9eed2ec3e8825f7f4dae9ac4856,74c227d9abf56cf7b5c9e42bbf4c63d9900f5a63..d1f154aec249bf3d38fb8d10dd66c311065fdb68
@@@ -241,6 -241,9 +241,9 @@@ int ceph_lock(struct file *file, int cm
        if (!(fl->fl_flags & FL_POSIX))
                return -ENOLCK;
  
+       if (ceph_inode_is_shutdown(inode))
+               return -ESTALE;
        dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
  
        /* set wait bit as appropriate, then make command as Ceph expects it*/
@@@ -302,7 -305,13 +305,10 @@@ int ceph_flock(struct file *file, int c
  
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
 -      /* No mandatory locks */
 -      if (fl->fl_type & LOCK_MAND)
 -              return -EOPNOTSUPP;
  
+       if (ceph_inode_is_shutdown(inode))
+               return -ESTALE;
        dout("ceph_flock, fl_file: %p\n", fl->fl_file);
  
        spin_lock(&ci->i_ceph_lock);
This page took 0.103887 seconds and 4 git commands to generate.