]> Git Repo - linux.git/commitdiff
Merge tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client
author Linus Torvalds <[email protected]>
Wed, 6 Sep 2023 19:10:15 +0000 (12:10 -0700)
committer Linus Torvalds <[email protected]>
Wed, 6 Sep 2023 19:10:15 +0000 (12:10 -0700)
Pull ceph updates from Ilya Dryomov:
 "Mixed with some fixes and cleanups, this brings in reasonably complete
  fscrypt support to CephFS! The list of things which don't work with
  encryption should be fairly short, mostly around the edges: fallocate
  (not supported well in CephFS to begin with), copy_file_range
  (requires re-encryption), non-default striping patterns.

  This was a multi-year effort principally by Jeff Layton with
  assistance from Xiubo Li, Luís Henriques and others, including several
  dependent changes in the MDS, netfs helper library and fscrypt
  framework itself"

* tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client: (53 commits)
  ceph: make num_fwd and num_retry to __u32
  ceph: make members in struct ceph_mds_request_args_ext a union
  rbd: use list_for_each_entry() helper
  libceph: do not include crypto/algapi.h
  ceph: switch ceph_lookup/atomic_open() to use new fscrypt helper
  ceph: fix updating i_truncate_pagecache_size for fscrypt
  ceph: wait for OSD requests' callbacks to finish when unmounting
  ceph: drop messages from MDS when unmounting
  ceph: update documentation regarding snapshot naming limitations
  ceph: prevent snapshot creation in encrypted locked directories
  ceph: add support for encrypted snapshot names
  ceph: invalidate pages when doing direct/sync writes
  ceph: plumb in decryption during reads
  ceph: add encryption support to writepage and writepages
  ceph: add read/modify/write to ceph_sync_write
  ceph: align data in pages in ceph_sync_write
  ceph: don't use special DIO path for encrypted inodes
  ceph: add truncate size handling support for fscrypt
  ceph: add object version support for sync read
  libceph: allow ceph_osdc_new_request to accept a multi-op read
  ...

1  2 
fs/ceph/acl.c
fs/ceph/caps.c
fs/ceph/inode.c
fs/ceph/snap.c
fs/ceph/xattr.c

diff --combined fs/ceph/acl.c
index c91b293267d748af2f35e4fdcfa67563337e15f9,8a56f979c7cbaa4d4be117ad104e582399e118a5..c53a1d2206225ca26f669642a22cfde767e45bc1
@@@ -93,7 -93,7 +93,7 @@@ int ceph_set_acl(struct mnt_idmap *idma
        char *value = NULL;
        struct iattr newattrs;
        struct inode *inode = d_inode(dentry);
 -      struct timespec64 old_ctime = inode->i_ctime;
 +      struct timespec64 old_ctime = inode_get_ctime(inode);
        umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
  
        if (ceph_snap(inode) != CEPH_NOSNAP) {
                newattrs.ia_ctime = current_time(inode);
                newattrs.ia_mode = new_mode;
                newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-               ret = __ceph_setattr(inode, &newattrs);
+               ret = __ceph_setattr(inode, &newattrs, NULL);
                if (ret)
                        goto out_free;
        }
                        newattrs.ia_ctime = old_ctime;
                        newattrs.ia_mode = old_mode;
                        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-                       __ceph_setattr(inode, &newattrs);
+                       __ceph_setattr(inode, &newattrs, NULL);
                }
                goto out_free;
        }
diff --combined fs/ceph/caps.c
index 09cd6d334604e7b7c47beed10f35a8e843b968d7,028b5140a85dc985bbc2d84eac4e634a4864111d..14215ec646f7ae6bd9a6ee0fdc6878351850ab49
@@@ -14,6 -14,7 +14,7 @@@
  #include "super.h"
  #include "mds_client.h"
  #include "cache.h"
+ #include "crypto.h"
  #include <linux/ceph/decode.h>
  #include <linux/ceph/messenger.h>
  
@@@ -1216,15 -1217,11 +1217,11 @@@ struct cap_msg_args 
        umode_t                 mode;
        bool                    inline_data;
        bool                    wake;
+       bool                    encrypted;
+       u32                     fscrypt_auth_len;
+       u8                      fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
  };
  
- /*
-  * cap struct size + flock buffer size + inline version + inline data size +
-  * osd_epoch_barrier + oldest_flush_tid
-  */
- #define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
-                     4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
  /* Marshal up the cap msg to the MDS */
  static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
  {
             arg->size, arg->max_size, arg->xattr_version,
             arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
  
-       msg->hdr.version = cpu_to_le16(10);
+       msg->hdr.version = cpu_to_le16(12);
        msg->hdr.tid = cpu_to_le64(arg->flush_tid);
  
        fc = msg->front.iov_base;
        fc->ino = cpu_to_le64(arg->ino);
        fc->snap_follows = cpu_to_le64(arg->follows);
  
-       fc->size = cpu_to_le64(arg->size);
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+       if (arg->encrypted)
+               fc->size = cpu_to_le64(round_up(arg->size,
+                                               CEPH_FSCRYPT_BLOCK_SIZE));
+       else
+ #endif
+               fc->size = cpu_to_le64(arg->size);
        fc->max_size = cpu_to_le64(arg->max_size);
        ceph_encode_timespec64(&fc->mtime, &arg->mtime);
        ceph_encode_timespec64(&fc->atime, &arg->atime);
  
        /* Advisory flags (version 10) */
        ceph_encode_32(&p, arg->flags);
+       /* dirstats (version 11) - these are r/o on the client */
+       ceph_encode_64(&p, 0);
+       ceph_encode_64(&p, 0);
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+       /*
+        * fscrypt_auth and fscrypt_file (version 12)
+        *
+        * fscrypt_auth holds the crypto context (if any). fscrypt_file
+        * tracks the real i_size as an __le64 field (and we use a rounded-up
+        * i_size in the traditional size field).
+        */
+       ceph_encode_32(&p, arg->fscrypt_auth_len);
+       ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len);
+       ceph_encode_32(&p, sizeof(__le64));
+       ceph_encode_64(&p, arg->size);
+ #else /* CONFIG_FS_ENCRYPTION */
+       ceph_encode_32(&p, 0);
+       ceph_encode_32(&p, 0);
+ #endif /* CONFIG_FS_ENCRYPTION */
  }
  
  /*
@@@ -1378,7 -1402,6 +1402,6 @@@ static void __prep_cap(struct cap_msg_a
        arg->follows = flushing ? ci->i_head_snapc->seq : 0;
        arg->flush_tid = flush_tid;
        arg->oldest_flush_tid = oldest_flush_tid;
        arg->size = i_size_read(inode);
        ci->i_reported_size = arg->size;
        arg->max_size = ci->i_wanted_max_size;
  
        arg->mtime = inode->i_mtime;
        arg->atime = inode->i_atime;
 -      arg->ctime = inode->i_ctime;
 +      arg->ctime = inode_get_ctime(inode);
        arg->btime = ci->i_btime;
        arg->change_attr = inode_peek_iversion_raw(inode);
  
                }
        }
        arg->flags = flags;
+       arg->encrypted = IS_ENCRYPTED(inode);
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+       if (ci->fscrypt_auth_len &&
+           WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) {
+               /* Don't set this if it's too big */
+               arg->fscrypt_auth_len = 0;
+       } else {
+               arg->fscrypt_auth_len = ci->fscrypt_auth_len;
+               memcpy(arg->fscrypt_auth, ci->fscrypt_auth,
+                      min_t(size_t, ci->fscrypt_auth_len,
+                            sizeof(arg->fscrypt_auth)));
+       }
+ #endif /* CONFIG_FS_ENCRYPTION */
  }
  
+ /*
+  * Size of the front buffer needed for a cap message.
+  *
+  * CAP_MSG_FIXED_FIELDS is the cap struct plus the fixed-width fields that
+  * follow it. With CONFIG_FS_ENCRYPTION the trailing 4 + 4 + 8 presumably
+  * cover the fscrypt_auth length word, the fscrypt_file length word and the
+  * 64-bit file size emitted by encode_cap_msg(); the variable-length
+  * fscrypt_auth blob itself is added per message. Without it only the two
+  * zero length words (4 + 4) are accounted for.
+  */
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
+                     4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8)
+ #else
+ #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
+                     4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4)
+ #endif /* CONFIG_FS_ENCRYPTION */
+ static inline int cap_msg_size(struct cap_msg_args *arg)
+ {
+       int total = CAP_MSG_FIXED_FIELDS;
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+       /* room for the fscrypt_auth blob carried by this message */
+       total += arg->fscrypt_auth_len;
+ #endif
+       return total;
+ }
  /*
   * Send a cap msg on the given inode.
   *
@@@ -1444,7 -1498,8 +1498,8 @@@ static void __send_cap(struct cap_msg_a
        struct ceph_msg *msg;
        struct inode *inode = &ci->netfs.inode;
  
-       msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
+                          false);
        if (!msg) {
                pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
                       ceph_vinop(inode), ceph_cap_string(arg->dirty),
@@@ -1470,10 -1525,6 +1525,6 @@@ static inline int __send_flush_snap(str
        struct cap_msg_args     arg;
        struct ceph_msg         *msg;
  
-       msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
-       if (!msg)
-               return -ENOMEM;
        arg.session = session;
        arg.ino = ceph_vino(inode).ino;
        arg.cid = 0;
        arg.inline_data = capsnap->inline_data;
        arg.flags = 0;
        arg.wake = false;
+       arg.encrypted = IS_ENCRYPTED(inode);
+       /* No fscrypt_auth changes from a capsnap.*/
+       arg.fscrypt_auth_len = 0;
+       msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg),
+                          GFP_NOFS, false);
+       if (!msg)
+               return -ENOMEM;
  
        encode_cap_msg(msg, &arg);
        ceph_con_send(&arg.session->s_con, msg);
@@@ -2900,10 -2960,9 +2960,9 @@@ int ceph_try_get_caps(struct inode *ino
   * due to a small max_size, make sure we check_max_size (and possibly
   * ask the mds) so we don't get hung up indefinitely.
   */
- int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got)
+ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
+                   int want, loff_t endoff, int *got)
  {
-       struct ceph_file_info *fi = filp->private_data;
-       struct inode *inode = file_inode(filp);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        int ret, _got, flags;
        if (ret < 0)
                return ret;
  
-       if ((fi->fmode & CEPH_FILE_MODE_WR) &&
+       if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
            fi->filp_gen != READ_ONCE(fsc->filp_gen))
                return -EBADF;
  
                                continue;
                }
  
-               if ((fi->fmode & CEPH_FILE_MODE_WR) &&
+               if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
                    fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
                        if (ret >= 0 && _got)
                                ceph_put_cap_refs(ci, _got);
        return 0;
  }
  
+ /*
+  * File-based entry point for taking cap references: pulls the inode and
+  * the per-open ceph_file_info out of @filp and hands everything to
+  * __ceph_get_caps(). Returns whatever __ceph_get_caps() returns.
+  */
+ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff,
+                 int *got)
+ {
+       return __ceph_get_caps(file_inode(filp), filp->private_data, need,
+                              want, endoff, got);
+ }
  /*
   * Take cap refs.  Caller must already know we hold at least one ref
   * on the caps in question or we don't know this is safe.
@@@ -3323,6 -3391,9 +3391,9 @@@ struct cap_extra_info 
        /* currently issued */
        int issued;
        struct timespec64 btime;
+       u8 *fscrypt_auth;
+       u32 fscrypt_auth_len;
+       u64 fscrypt_file_size;
  };
  
  /*
@@@ -3355,6 -3426,14 +3426,14 @@@ static void handle_cap_grant(struct ino
        bool deleted_inode = false;
        bool fill_inline = false;
  
+       /*
+        * If there is at least one crypto block then we'll trust
+        * fscrypt_file_size. If the real length of the file is 0, then
+        * ignore it (it has probably been truncated down to 0 by the MDS).
+        */
+       if (IS_ENCRYPTED(inode) && size)
+               size = extra_info->fscrypt_file_size;
        dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
             inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
        dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
                dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
                     from_kuid(&init_user_ns, inode->i_uid),
                     from_kgid(&init_user_ns, inode->i_gid));
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+               if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len ||
+                   memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth,
+                          ci->fscrypt_auth_len))
+                       pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
+                               __func__, ci->fscrypt_auth_len,
+                               extra_info->fscrypt_auth_len);
+ #endif
        }
  
        if ((newcaps & CEPH_CAP_LINK_SHARED) &&
@@@ -3837,7 -3924,8 +3924,8 @@@ static void handle_cap_flushsnap_ack(st
   */
  static bool handle_cap_trunc(struct inode *inode,
                             struct ceph_mds_caps *trunc,
-                            struct ceph_mds_session *session)
+                            struct ceph_mds_session *session,
+                            struct cap_extra_info *extra_info)
  {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
  
        issued |= implemented | dirty;
  
-       dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
-            inode, mds, seq, truncate_size, truncate_seq);
+       /*
+        * If there is at least one crypto block then we'll trust
+        * fscrypt_file_size. If the real length of the file is 0, then
+        * ignore it (it has probably been truncated down to 0 by the MDS).
+        */
+       if (IS_ENCRYPTED(inode) && size)
+               size = extra_info->fscrypt_file_size;
+       dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n",
+            __func__, inode, mds, seq, truncate_size, truncate_seq);
        queue_trunc = ceph_fill_file_size(inode, issued,
                                          truncate_seq, truncate_size, size);
        return queue_trunc;
@@@ -4075,6 -4171,52 +4171,52 @@@ retry
        *target_cap = cap;
  }
  
+ #ifdef CONFIG_FS_ENCRYPTION
+ /*
+  * Decode the fscrypt_auth and fscrypt_file fields of a cap message.
+  *
+  * Each field is a 32-bit length followed by that many bytes. A non-empty
+  * fscrypt_auth is copied into a freshly kmalloc'ed buffer stored in
+  * @extra->fscrypt_auth; this function never frees it, so the caller owns
+  * the buffer even when an error is returned. If fscrypt_file holds at
+  * least 8 bytes, the first 8 are decoded into @extra->fscrypt_file_size
+  * and any remainder is skipped.
+  *
+  * Returns 0 on success, -ENOMEM if the allocation fails, or -EIO if the
+  * message is too short.
+  */
+ static int parse_fscrypt_fields(void **p, void *end,
+                               struct cap_extra_info *extra)
+ {
+       u32 auth_len;
+       u32 file_len;
+       ceph_decode_32_safe(p, end, auth_len, bad);
+       extra->fscrypt_auth_len = auth_len;
+       if (auth_len) {
+               u8 *auth;
+               /* make sure the payload is really there before allocating */
+               ceph_decode_need(p, end, auth_len, bad);
+               auth = kmalloc(auth_len, GFP_KERNEL);
+               extra->fscrypt_auth = auth;
+               if (!auth)
+                       return -ENOMEM;
+               ceph_decode_copy_safe(p, end, auth, auth_len, bad);
+       }
+       ceph_decode_32_safe(p, end, file_len, bad);
+       if (file_len >= sizeof(u64)) {
+               ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad);
+               file_len -= sizeof(u64);
+       }
+       /* ignore whatever else the sender put in fscrypt_file */
+       ceph_decode_skip_n(p, end, file_len, bad);
+       return 0;
+ bad:
+       return -EIO;
+ }
+ #else
+ /*
+  * Encryption support is compiled out: step over the two length-prefixed
+  * fscrypt fields without looking at their contents.
+  *
+  * Returns 0 on success or -EIO if the message is too short.
+  */
+ static int parse_fscrypt_fields(void **p, void *end,
+                               struct cap_extra_info *extra)
+ {
+       int field;
+       /* two fields: fscrypt_auth, then fscrypt_file */
+       for (field = 0; field < 2; field++) {
+               u32 len;
+               ceph_decode_32_safe(p, end, len, bad);
+               if (len)
+                       ceph_decode_skip_n(p, end, len, bad);
+       }
+       return 0;
+ bad:
+       return -EIO;
+ }
+ #endif
  /*
   * Handle a caps message from the MDS.
   *
@@@ -4105,6 -4247,9 +4247,9 @@@ void ceph_handle_caps(struct ceph_mds_s
  
        dout("handle_caps from mds%d\n", session->s_mds);
  
+       if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+               return;
        /* decode */
        end = msg->front.iov_base + msg->front.iov_len;
        if (msg->front.iov_len < sizeof(*h))
                ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
        }
  
+       if (msg_version >= 12) {
+               if (parse_fscrypt_fields(&p, end, &extra_info))
+                       goto bad;
+       }
        /* lookup ino */
        inode = ceph_find_inode(mdsc->fsc->sb, vino);
        dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
             vino.snap, inode);
  
        mutex_lock(&session->s_mutex);
-       inc_session_sequence(session);
        dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
             (unsigned)seq);
  
                break;
  
        case CEPH_CAP_OP_TRUNC:
-               queue_trunc = handle_cap_trunc(inode, h, session);
+               queue_trunc = handle_cap_trunc(inode, h, session,
+                                               &extra_info);
                spin_unlock(&ci->i_ceph_lock);
                if (queue_trunc)
                        ceph_queue_vmtruncate(inode);
@@@ -4309,12 -4459,15 +4459,15 @@@ done
  done_unlocked:
        iput(inode);
  out:
+       ceph_dec_mds_stopping_blocker(mdsc);
        ceph_put_string(extra_info.pool_ns);
  
        /* Defer closing the sessions after s_mutex lock being released */
        if (close_sessions)
                ceph_mdsc_close_sessions(mdsc);
  
+       kfree(extra_info.fscrypt_auth);
        return;
  
  flush_cap_releases:
@@@ -4611,6 -4764,18 +4764,18 @@@ int ceph_encode_inode_release(void **p
        return ret;
  }
  
+ /**
+  * ceph_encode_dentry_release - encode a dentry release into an outgoing request
+  * @p: outgoing request buffer
+  * @dentry: dentry to release
+  * @dir: dir to release it from
+  * @mds: mds that we're speaking to
+  * @drop: caps being dropped
+  * @unless: unless we have these caps
+  *
+  * Encode a dentry release into an outgoing request buffer. Returns 1 if the
+  * thing was released, or a negative error code otherwise.
+  */
  int ceph_encode_dentry_release(void **p, struct dentry *dentry,
                               struct inode *dir,
                               int mds, int drop, int unless)
        if (ret && di->lease_session && di->lease_session->s_mds == mds) {
                dout("encode_dentry_release %p mds%d seq %d\n",
                     dentry, mds, (int)di->lease_seq);
-               rel->dname_len = cpu_to_le32(dentry->d_name.len);
-               memcpy(*p, dentry->d_name.name, dentry->d_name.len);
-               *p += dentry->d_name.len;
                rel->dname_seq = cpu_to_le32(di->lease_seq);
                __ceph_mdsc_drop_dentry_lease(dentry);
+               spin_unlock(&dentry->d_lock);
+               if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) {
+                       int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p);
+                       if (ret2 < 0)
+                               return ret2;
+                       rel->dname_len = cpu_to_le32(ret2);
+                       *p += ret2;
+               } else {
+                       rel->dname_len = cpu_to_le32(dentry->d_name.len);
+                       memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+                       *p += dentry->d_name.len;
+               }
+       } else {
+               spin_unlock(&dentry->d_lock);
        }
-       spin_unlock(&dentry->d_lock);
        return ret;
  }
  
diff --combined fs/ceph/inode.c
index fd05d68e2990878dabc75729b18b6586faa4c71b,ea6f966dacd5bfb409f69c8db4f181b43ec7c247..800ab79205137942c9b3c843218e5ea998b3802d
  #include <linux/random.h>
  #include <linux/sort.h>
  #include <linux/iversion.h>
+ #include <linux/fscrypt.h>
  
  #include "super.h"
  #include "mds_client.h"
  #include "cache.h"
+ #include "crypto.h"
  #include <linux/ceph/decode.h>
  
  /*
@@@ -33,6 -35,7 +35,7 @@@
   */
  
  static const struct inode_operations ceph_symlink_iops;
+ static const struct inode_operations ceph_encrypted_symlink_iops;
  
  static void ceph_inode_work(struct work_struct *work);
  
@@@ -52,17 -55,99 +55,99 @@@ static int ceph_set_ino_cb(struct inod
        return 0;
  }
  
- struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+ /**
+  * ceph_new_inode - preallocate an inode ahead of a create operation
+  * @dir: directory the new inode will be created in
+  * @dentry: dentry that may end up pointing at the new inode
+  * @mode: pointer to the mode for the new inode; handed to
+  *        ceph_pre_init_acls() for non-symlinks, which presumably may
+  *        adjust it
+  * @as_ctx: ACL/security context filled in for the new inode
+  *
+  * Allocates an inode on @dir's superblock, then populates @as_ctx with the
+  * ACL info (non-symlinks only), the security context and, unless @dir is a
+  * snapdir, the fscrypt context.
+  *
+  * Returns the new inode, or an ERR_PTR on failure (the inode is released
+  * with iput() in that case).
+  */
+ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
+                            umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
+ {
+       struct inode *newino = new_inode(dir->i_sb);
+       int rc;
+       if (!newino)
+               return ERR_PTR(-ENOMEM);
+       if (!S_ISLNK(*mode)) {
+               rc = ceph_pre_init_acls(dir, mode, as_ctx);
+               if (rc < 0)
+                       goto fail;
+       }
+       newino->i_state = 0;
+       newino->i_mode = *mode;
+       rc = ceph_security_init_secctx(dentry, *mode, as_ctx);
+       if (rc < 0)
+               goto fail;
+       /*
+        * Snapshots get their fscrypt context later, from handle_reply(),
+        * so there is nothing more to prepare here.
+        */
+       if (ceph_snap(dir) == CEPH_SNAPDIR)
+               return newino;
+       rc = ceph_fscrypt_prepare_context(dir, newino, as_ctx);
+       if (!rc)
+               return newino;
+ fail:
+       iput(newino);
+       return ERR_PTR(rc);
+ }
+ /*
+  * Hand the contents of @as_ctx over to @req: a pagelist, if present, is
+  * moved into req->r_pagelist (and cleared in @as_ctx so it has a single
+  * holder), then the fscrypt part is transferred by
+  * ceph_fscrypt_as_ctx_to_req().
+  */
+ void ceph_as_ctx_to_req(struct ceph_mds_request *req,
+                       struct ceph_acl_sec_ctx *as_ctx)
+ {
+       if (as_ctx->pagelist != NULL) {
+               req->r_pagelist = as_ctx->pagelist;
+               as_ctx->pagelist = NULL;
+       }
+       ceph_fscrypt_as_ctx_to_req(req, as_ctx);
+ }
+ /**
+  * ceph_get_inode - find or create/hash a new inode
+  * @sb: superblock to search and allocate in
+  * @vino: vino to search for
+  * @newino: optional new inode to insert if one isn't found (may be NULL)
+  *
+  * Search for or insert a new inode into the hash for the given vino, and
+  * return a reference to it. If @newino is non-NULL, its reference is consumed.
+  */
+ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
+                            struct inode *newino)
  {
        struct inode *inode;
  
        if (ceph_vino_is_reserved(vino))
                return ERR_PTR(-EREMOTEIO);
  
-       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
-                            ceph_set_ino_cb, &vino);
-       if (!inode)
+       if (newino) {
+               inode = inode_insert5(newino, (unsigned long)vino.ino,
+                                     ceph_ino_compare, ceph_set_ino_cb, &vino);
+               if (inode != newino)
+                       iput(newino);
+       } else {
+               inode = iget5_locked(sb, (unsigned long)vino.ino,
+                                    ceph_ino_compare, ceph_set_ino_cb, &vino);
+       }
+       if (!inode) {
+               dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
                return ERR_PTR(-ENOMEM);
+       }
  
        dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
             ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
@@@ -78,8 -163,9 +163,9 @@@ struct inode *ceph_get_snapdir(struct i
                .ino = ceph_ino(parent),
                .snap = CEPH_SNAPDIR,
        };
-       struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+       struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
        struct ceph_inode_info *ci = ceph_inode(inode);
+       int ret = -ENOTDIR;
  
        if (IS_ERR(inode))
                return inode;
        inode->i_uid = parent->i_uid;
        inode->i_gid = parent->i_gid;
        inode->i_mtime = parent->i_mtime;
 -      inode->i_ctime = parent->i_ctime;
 +      inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
        inode->i_atime = parent->i_atime;
        ci->i_rbytes = 0;
        ci->i_btime = ceph_inode(parent)->i_btime;
  
+ #ifdef CONFIG_FS_ENCRYPTION
+       /* if encrypted, just borrow fscrypt_auth from parent */
+       if (IS_ENCRYPTED(parent)) {
+               struct ceph_inode_info *pci = ceph_inode(parent);
+               ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
+                                          pci->fscrypt_auth_len,
+                                          GFP_KERNEL);
+               if (ci->fscrypt_auth) {
+                       inode->i_flags |= S_ENCRYPTED;
+                       ci->fscrypt_auth_len = pci->fscrypt_auth_len;
+               } else {
+                       dout("Failed to alloc snapdir fscrypt_auth\n");
+                       ret = -ENOMEM;
+                       goto err;
+               }
+       }
+ #endif
        if (inode->i_state & I_NEW) {
                inode->i_op = &ceph_snapdir_iops;
                inode->i_fop = &ceph_snapdir_fops;
@@@ -118,7 -222,7 +222,7 @@@ err
                discard_new_inode(inode);
        else
                iput(inode);
-       return ERR_PTR(-ENOTDIR);
+       return ERR_PTR(ret);
  }
  
  const struct inode_operations ceph_file_iops = {
@@@ -517,6 -621,7 +621,7 @@@ struct inode *ceph_alloc_inode(struct s
        ci->i_truncate_seq = 0;
        ci->i_truncate_size = 0;
        ci->i_truncate_pending = 0;
+       ci->i_truncate_pagecache_size = 0;
  
        ci->i_max_size = 0;
        ci->i_reported_size = 0;
        INIT_WORK(&ci->i_work, ceph_inode_work);
        ci->i_work_mask = 0;
        memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
+ #ifdef CONFIG_FS_ENCRYPTION
+       ci->fscrypt_auth = NULL;
+       ci->fscrypt_auth_len = 0;
+ #endif
        return &ci->netfs.inode;
  }
  
@@@ -555,6 -664,10 +664,10 @@@ void ceph_free_inode(struct inode *inod
        struct ceph_inode_info *ci = ceph_inode(inode);
  
        kfree(ci->i_symlink);
+ #ifdef CONFIG_FS_ENCRYPTION
+       kfree(ci->fscrypt_auth);
+ #endif
+       fscrypt_free_inode(inode);
        kmem_cache_free(ceph_inode_cachep, ci);
  }
  
@@@ -575,6 -688,7 +688,7 @@@ void ceph_evict_inode(struct inode *ino
        clear_inode(inode);
  
        ceph_fscache_unregister_inode_cookie(ci);
+       fscrypt_put_encryption_info(inode);
  
        __ceph_remove_caps(ci);
  
@@@ -650,7 -764,7 +764,7 @@@ int ceph_fill_file_size(struct inode *i
                        ceph_fscache_update(inode);
                ci->i_reported_size = size;
                if (truncate_seq != ci->i_truncate_seq) {
-                       dout("truncate_seq %u -> %u\n",
+                       dout("%s truncate_seq %u -> %u\n", __func__,
                             ci->i_truncate_seq, truncate_seq);
                        ci->i_truncate_seq = truncate_seq;
  
                        }
                }
        }
-       if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
-           ci->i_truncate_size != truncate_size) {
-               dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
-                    truncate_size);
+       /*
+        * It's possible that the new sizes of the two consecutive
+        * size truncations will be in the same fscrypt last block,
+        * and we need to truncate the corresponding page caches
+        * anyway.
+        */
+       if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
+               dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__,
+                    ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode));
                ci->i_truncate_size = truncate_size;
+               if (IS_ENCRYPTED(inode)) {
+                       dout("%s truncate_pagecache_size %lld -> %llu\n",
+                            __func__, ci->i_truncate_pagecache_size, size);
+                       ci->i_truncate_pagecache_size = size;
+               } else {
+                       ci->i_truncate_pagecache_size = truncate_size;
+               }
        }
        return queue_trunc;
  }
@@@ -688,7 -817,6 +817,7 @@@ void ceph_fill_file_time(struct inode *
                         struct timespec64 *mtime, struct timespec64 *atime)
  {
        struct ceph_inode_info *ci = ceph_inode(inode);
 +      struct timespec64 ictime = inode_get_ctime(inode);
        int warn = 0;
  
        if (issued & (CEPH_CAP_FILE_EXCL|
                      CEPH_CAP_AUTH_EXCL|
                      CEPH_CAP_XATTR_EXCL)) {
                if (ci->i_version == 0 ||
 -                  timespec64_compare(ctime, &inode->i_ctime) > 0) {
 +                  timespec64_compare(ctime, &ictime) > 0) {
                        dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
 -                           inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
 +                           ictime.tv_sec, ictime.tv_nsec,
                             ctime->tv_sec, ctime->tv_nsec);
 -                      inode->i_ctime = *ctime;
 +                      inode_set_ctime_to_ts(inode, *ctime);
                }
                if (ci->i_version == 0 ||
                    ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
        } else {
                /* we have no write|excl caps; whatever the MDS says is true */
                if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
 -                      inode->i_ctime = *ctime;
 +                      inode_set_ctime_to_ts(inode, *ctime);
                        inode->i_mtime = *mtime;
                        inode->i_atime = *atime;
                        ci->i_time_warp_seq = time_warp_seq;
                     inode, time_warp_seq, ci->i_time_warp_seq);
  }
  
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ /*
+  * Decode an encrypted symlink target received in its encoded form.
+  *
+  * @encsym: encoded symlink text (@enclen bytes, need not be NUL-terminated)
+  * @enclen: length of @encsym
+  * @decsym: on success, set to a newly kmalloc'ed buffer with the decoded
+  *          bytes followed by a NUL; the caller is responsible for freeing it
+  *
+  * Returns the decoded length (not counting the NUL) on success, -ENOMEM if
+  * the buffer cannot be allocated, or -EIO if ceph_base64_decode() rejects
+  * the input.
+  */
+ static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
+ {
+       int declen;
+       u8 *sym;
+       /*
+        * enclen + 1 bytes: assumes the decoded form is never longer than
+        * the encoded one, leaving room for the terminator.
+        */
+       sym = kmalloc(enclen + 1, GFP_NOFS);
+       if (!sym)
+               return -ENOMEM;
+       declen = ceph_base64_decode(encsym, enclen, sym);
+       if (declen < 0) {
+               pr_err("%s: can't decode symlink (%d). Content: %.*s\n",
+                      __func__, declen, enclen, encsym);
+               kfree(sym);
+               return -EIO;
+       }
+       /*
+        * Terminate right after the decoded data. Writing at declen + 1
+        * would leave sym[declen] uninitialised and overrun the buffer
+        * when enclen is 0.
+        */
+       sym[declen] = '\0';
+       *decsym = sym;
+       return declen;
+ }
+ #else
+ /* Encryption support is compiled out: encrypted symlinks cannot be decoded. */
+ static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym)
+ {
+       return -EOPNOTSUPP;
+ }
+ #endif
  /*
   * Populate an inode based on info from mds.  May be called on new or
   * existing inodes.
@@@ -857,15 -1013,20 +1014,20 @@@ int ceph_fill_inode(struct inode *inode
        issued |= __ceph_caps_dirty(ci);
        new_issued = ~issued & info_caps;
  
-       /* directories have fl_stripe_unit set to zero */
-       if (le32_to_cpu(info->layout.fl_stripe_unit))
-               inode->i_blkbits =
-                       fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
-       else
-               inode->i_blkbits = CEPH_BLOCK_SHIFT;
        __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
  
+ #ifdef CONFIG_FS_ENCRYPTION
+       if (iinfo->fscrypt_auth_len &&
+           ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+               kfree(ci->fscrypt_auth);
+               ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
+               ci->fscrypt_auth = iinfo->fscrypt_auth;
+               iinfo->fscrypt_auth = NULL;
+               iinfo->fscrypt_auth_len = 0;
+               inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+       }
+ #endif
        if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
            (issued & CEPH_CAP_AUTH_EXCL) == 0) {
                inode->i_mode = mode;
                ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
        }
  
+       /* directories have fl_stripe_unit set to zero */
+       if (IS_ENCRYPTED(inode))
+               inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+       else if (le32_to_cpu(info->layout.fl_stripe_unit))
+               inode->i_blkbits =
+                       fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+       else
+               inode->i_blkbits = CEPH_BLOCK_SHIFT;
        if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
            (issued & CEPH_CAP_LINK_EXCL) == 0)
                set_nlink(inode, le32_to_cpu(info->nlink));
  
        if (new_version ||
            (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+               u64 size = le64_to_cpu(info->size);
                s64 old_pool = ci->i_layout.pool_id;
                struct ceph_string *old_ns;
  
  
                pool_ns = old_ns;
  
+               if (IS_ENCRYPTED(inode) && size &&
+                   iinfo->fscrypt_file_len == sizeof(__le64)) {
+                       u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file);
+                       if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
+                               size = fsize;
+                       } else {
+                               pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
+                                       info->size, size);
+                       }
+               }
                queue_trunc = ceph_fill_file_size(inode, issued,
                                        le32_to_cpu(info->truncate_seq),
                                        le64_to_cpu(info->truncate_size),
-                                       le64_to_cpu(info->size));
+                                       size);
                /* only update max_size on auth cap */
                if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
                    ci->i_max_size != le64_to_cpu(info->max_size)) {
                inode->i_fop = &ceph_file_fops;
                break;
        case S_IFLNK:
-               inode->i_op = &ceph_symlink_iops;
                if (!ci->i_symlink) {
                        u32 symlen = iinfo->symlink_len;
                        char *sym;
  
                        spin_unlock(&ci->i_ceph_lock);
  
-                       if (symlen != i_size_read(inode)) {
-                               pr_err("%s %llx.%llx BAD symlink "
-                                       "size %lld\n", __func__,
-                                       ceph_vinop(inode),
-                                       i_size_read(inode));
+                       if (IS_ENCRYPTED(inode)) {
+                               if (symlen != i_size_read(inode))
+                                       pr_err("%s %llx.%llx BAD symlink size %lld\n",
+                                               __func__, ceph_vinop(inode),
+                                               i_size_read(inode));
+                               err = decode_encrypted_symlink(iinfo->symlink,
+                                                              symlen, (u8 **)&sym);
+                               if (err < 0) {
+                                       pr_err("%s decoding encrypted symlink failed: %d\n",
+                                               __func__, err);
+                                       goto out;
+                               }
+                               symlen = err;
                                i_size_write(inode, symlen);
                                inode->i_blocks = calc_inode_blocks(symlen);
-                       }
+                       } else {
+                               if (symlen != i_size_read(inode)) {
+                                       pr_err("%s %llx.%llx BAD symlink size %lld\n",
+                                               __func__, ceph_vinop(inode),
+                                               i_size_read(inode));
+                                       i_size_write(inode, symlen);
+                                       inode->i_blocks = calc_inode_blocks(symlen);
+                               }
  
-                       err = -ENOMEM;
-                       sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
-                       if (!sym)
-                               goto out;
+                               err = -ENOMEM;
+                               sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
+                               if (!sym)
+                                       goto out;
+                       }
  
                        spin_lock(&ci->i_ceph_lock);
                        if (!ci->i_symlink)
                        else
                                kfree(sym); /* lost a race */
                }
-               inode->i_link = ci->i_symlink;
+               if (IS_ENCRYPTED(inode)) {
+                       /*
+                        * Encrypted symlinks need to be decrypted before we can
+                        * cache their targets in i_link. Don't touch it here.
+                        */
+                       inode->i_op = &ceph_encrypted_symlink_iops;
+               } else {
+                       inode->i_link = ci->i_symlink;
+                       inode->i_op = &ceph_symlink_iops;
+               }
                break;
        case S_IFDIR:
                inode->i_op = &ceph_dir_iops;
@@@ -1310,8 -1519,15 +1520,15 @@@ int ceph_fill_trace(struct super_block 
                if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
                    test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
                    !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+                       bool is_nokey = false;
                        struct qstr dname;
                        struct dentry *dn, *parent;
+                       struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+                       struct ceph_fname fname = { .dir        = dir,
+                                                   .name       = rinfo->dname,
+                                                   .ctext      = rinfo->altname,
+                                                   .name_len   = rinfo->dname_len,
+                                                   .ctext_len  = rinfo->altname_len };
  
                        BUG_ON(!rinfo->head->is_target);
                        BUG_ON(req->r_dentry);
                        parent = d_find_any_alias(dir);
                        BUG_ON(!parent);
  
-                       dname.name = rinfo->dname;
-                       dname.len = rinfo->dname_len;
+                       err = ceph_fname_alloc_buffer(dir, &oname);
+                       if (err < 0) {
+                               dput(parent);
+                               goto done;
+                       }
+                       err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
+                       if (err < 0) {
+                               dput(parent);
+                               ceph_fname_free_buffer(dir, &oname);
+                               goto done;
+                       }
+                       dname.name = oname.name;
+                       dname.len = oname.len;
                        dname.hash = full_name_hash(parent, dname.name, dname.len);
                        tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
                        tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
@@@ -1335,9 -1563,15 +1564,15 @@@ retry_lookup
                                     dname.len, dname.name, dn);
                                if (!dn) {
                                        dput(parent);
+                                       ceph_fname_free_buffer(dir, &oname);
                                        err = -ENOMEM;
                                        goto done;
                                }
+                               if (is_nokey) {
+                                       spin_lock(&dn->d_lock);
+                                       dn->d_flags |= DCACHE_NOKEY_NAME;
+                                       spin_unlock(&dn->d_lock);
+                               }
                                err = 0;
                        } else if (d_really_is_positive(dn) &&
                                   (ceph_ino(d_inode(dn)) != tvino.ino ||
                                dput(dn);
                                goto retry_lookup;
                        }
+                       ceph_fname_free_buffer(dir, &oname);
  
                        req->r_dentry = dn;
                        dput(parent);
@@@ -1552,7 -1787,7 +1788,7 @@@ static int readdir_prepopulate_inodes_o
                vino.ino = le64_to_cpu(rde->inode.in->ino);
                vino.snap = le64_to_cpu(rde->inode.in->snapid);
  
-               in = ceph_get_inode(req->r_dentry->d_sb, vino);
+               in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
                if (IS_ERR(in)) {
                        err = PTR_ERR(in);
                        dout("new_inode badness got %d\n", err);
@@@ -1630,7 -1865,8 +1866,8 @@@ int ceph_readdir_prepopulate(struct cep
                             struct ceph_mds_session *session)
  {
        struct dentry *parent = req->r_dentry;
-       struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+       struct inode *inode = d_inode(parent);
+       struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct qstr dname;
        struct dentry *dn;
                tvino.snap = le64_to_cpu(rde->inode.in->snapid);
  
                if (rinfo->hash_order) {
-                       u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
-                                                rde->name, rde->name_len);
-                       hash = ceph_frag_value(hash);
+                       u32 hash = ceph_frag_value(rde->raw_hash);
                        if (hash != last_hash)
                                fpos_offset = 2;
                        last_hash = hash;
@@@ -1729,6 -1963,11 +1964,11 @@@ retry_lookup
                                err = -ENOMEM;
                                goto out;
                        }
+                       if (rde->is_nokey) {
+                               spin_lock(&dn->d_lock);
+                               dn->d_flags |= DCACHE_NOKEY_NAME;
+                               spin_unlock(&dn->d_lock);
+                       }
                } else if (d_really_is_positive(dn) &&
                           (ceph_ino(d_inode(dn)) != tvino.ino ||
                            ceph_snap(d_inode(dn)) != tvino.snap)) {
                if (d_really_is_positive(dn)) {
                        in = d_inode(dn);
                } else {
-                       in = ceph_get_inode(parent->d_sb, tvino);
+                       in = ceph_get_inode(parent->d_sb, tvino, NULL);
                        if (IS_ERR(in)) {
                                dout("new_inode badness\n");
                                d_drop(dn);
@@@ -1927,7 -2166,7 +2167,7 @@@ void __ceph_do_pending_vmtruncate(struc
  retry:
        spin_lock(&ci->i_ceph_lock);
        if (ci->i_truncate_pending == 0) {
-               dout("__do_pending_vmtruncate %p none pending\n", inode);
+               dout("%s %p none pending\n", __func__, inode);
                spin_unlock(&ci->i_ceph_lock);
                mutex_unlock(&ci->i_truncate_mutex);
                return;
         */
        if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
                spin_unlock(&ci->i_ceph_lock);
-               dout("__do_pending_vmtruncate %p flushing snaps first\n",
-                    inode);
+               dout("%s %p flushing snaps first\n", __func__, inode);
                filemap_write_and_wait_range(&inode->i_data, 0,
                                             inode->i_sb->s_maxbytes);
                goto retry;
        /* there should be no reader or writer */
        WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
  
-       to = ci->i_truncate_size;
+       to = ci->i_truncate_pagecache_size;
        wrbuffer_refs = ci->i_wrbuffer_ref;
-       dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+       dout("%s %p (%d) to %lld\n", __func__, inode,
             ci->i_truncate_pending, to);
        spin_unlock(&ci->i_ceph_lock);
  
        truncate_pagecache(inode, to);
  
        spin_lock(&ci->i_ceph_lock);
-       if (to == ci->i_truncate_size) {
+       if (to == ci->i_truncate_pagecache_size) {
                ci->i_truncate_pending = 0;
                finish = 1;
        }
@@@ -2000,6 -2238,32 +2239,32 @@@ static void ceph_inode_work(struct work
        iput(inode);
  }
  
+ /*
+  * ->get_link for encrypted symlinks.
+  *
+  * Passes the symlink bytes stored in the ceph inode (ci->i_symlink), with
+  * the inode size as their length, to fscrypt_get_symlink() and returns
+  * whatever that helper returns.  A NULL @dentry is refused with -ECHILD.
+  */
+ static const char *ceph_encrypted_get_link(struct dentry *dentry,
+                                          struct inode *inode,
+                                          struct delayed_call *done)
+ {
+       struct ceph_inode_info *cinfo;
+       if (dentry == NULL)
+               return ERR_PTR(-ECHILD);
+       cinfo = ceph_inode(inode);
+       return fscrypt_get_symlink(inode, cinfo->i_symlink,
+                                  i_size_read(inode), done);
+ }
+ /*
+  * ->getattr for encrypted symlinks: run the regular ceph_getattr() first
+  * and, only if it succeeds, hand the result to fscrypt_symlink_getattr().
+  * Returns the first non-zero status encountered, or the fscrypt helper's
+  * return value.
+  */
+ static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
+                                         const struct path *path,
+                                         struct kstat *stat, u32 request_mask,
+                                         unsigned int query_flags)
+ {
+       int err = ceph_getattr(idmap, path, stat, request_mask, query_flags);
+       return err ? err : fscrypt_symlink_getattr(path, stat);
+ }
  /*
   * symlinks
   */
@@@ -2010,20 -2274,173 +2275,173 @@@ static const struct inode_operations ce
        .listxattr = ceph_listxattr,
  };
  
- int __ceph_setattr(struct inode *inode, struct iattr *attr)
+ /*
+  * Inode operations for encrypted symlinks: the encryption-aware get_link
+  * and getattr handlers, plus the generic ceph setattr/listxattr handlers.
+  */
+ static const struct inode_operations ceph_encrypted_symlink_iops = {
+       .getattr = ceph_encrypted_symlink_getattr,
+       .setattr = ceph_setattr,
+       .listxattr = ceph_listxattr,
+       .get_link = ceph_encrypted_get_link,
+ };
+ /*
+  * Transfer the encrypted last block to the MDS so that the MDS can
+  * update it when the file is truncated to a smaller size.
+  *
+  * The block containing the new size (attr->ia_size) is read back, the
+  * bytes from the new size to the end of the page are zeroed, the block
+  * is re-encrypted in place, and a ceph_fscrypt_truncate_size_header
+  * followed by the block is attached to @req as req->r_pagelist.
+  *
+  * CEPH_CAP_FILE_RD references are held for the duration of the call and
+  * dropped before returning.
+  *
+  * We don't support a PAGE_SIZE that is smaller than the
+  * CEPH_FSCRYPT_BLOCK_SIZE.
+  *
+  * Returns 0 on success, otherwise the error from the step that failed
+  * (in which case req->r_pagelist is left unset).
+  */
+ static int fill_fscrypt_truncate(struct inode *inode,
+                                struct ceph_mds_request *req,
+                                struct iattr *attr)
+ {
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       /* offset of the new EOF inside its crypto block */
+       int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
+       /* file offset at which that crypto block starts */
+       loff_t pos, orig_pos = round_down(attr->ia_size,
+                                         CEPH_FSCRYPT_BLOCK_SIZE);
+       u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
+       struct ceph_pagelist *pagelist = NULL;
+       struct kvec iov = {0};
+       struct iov_iter iter;
+       struct page *page = NULL;
+       struct ceph_fscrypt_truncate_size_header header;
+       int retry_op = 0;
+       int len = CEPH_FSCRYPT_BLOCK_SIZE;
+       loff_t i_size = i_size_read(inode);
+       int got, ret, issued;
+       u64 objver;
+       ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
+       if (ret < 0)
+               return ret;
+       issued = __ceph_caps_issued(ci, NULL);
+       dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
+            i_size, attr->ia_size, ceph_cap_string(got),
+            ceph_cap_string(issued));
+       /* Try to writeback the dirty pagecaches */
+       if (issued & (CEPH_CAP_FILE_BUFFER)) {
+               /*
+                * Flush the whole crypto block that is about to be read
+                * back: the range end is computed from the block SIZE,
+                * not the block SHIFT.
+                */
+               loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
+               ret = filemap_write_and_wait_range(inode->i_mapping,
+                                                  orig_pos, lend);
+               if (ret < 0)
+                       goto out;
+       }
+       page = __page_cache_alloc(GFP_KERNEL);
+       if (page == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+       if (!pagelist) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       /* read the last block into the temporary page */
+       iov.iov_base = kmap_local_page(page);
+       iov.iov_len = len;
+       iov_iter_kvec(&iter, READ, &iov, 1, len);
+       pos = orig_pos;
+       ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
+       if (ret < 0)
+               goto out;
+       /* Insert the header first */
+       header.ver = 1;
+       header.compat = 1;
+       header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
+       /*
+        * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
+        * because in MDS it may need this to do the truncate.
+        */
+       header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
+       /*
+        * If we hit a hole here, we should just skip filling
+        * the fscrypt for the request, because once the fscrypt
+        * is enabled, the file will be split into many blocks
+        * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
+        * has a hole, the hole size should be multiple of block
+        * size.
+        *
+        * If the Rados object doesn't exist, it will be set to 0.
+        */
+       if (!objver) {
+               dout("%s hit hole, ppos %lld < size %lld\n", __func__,
+                    pos, i_size);
+               header.data_len = cpu_to_le32(8 + 8 + 4);
+               header.file_offset = 0;
+               ret = 0;
+       } else {
+               header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
+               header.file_offset = cpu_to_le64(orig_pos);
+               dout("%s encrypt block boff/bsize %d/%lu\n", __func__,
+                    boff, CEPH_FSCRYPT_BLOCK_SIZE);
+               /* truncate and zero out the extra contents for the last block */
+               memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
+               /* encrypt the last block */
+               ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
+                                                   CEPH_FSCRYPT_BLOCK_SIZE,
+                                                   0, block,
+                                                   GFP_KERNEL);
+               if (ret)
+                       goto out;
+       }
+       /* Insert the header */
+       ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
+       if (ret)
+               goto out;
+       /*
+        * NOTE(review): header.block_size is set unconditionally above, so
+        * this test is always true and the block is appended even in the
+        * hole case where data_len does not account for it - confirm that
+        * this is what the MDS expects.
+        */
+       if (header.block_size) {
+               /* Append the last block contents to pagelist */
+               ret = ceph_pagelist_append(pagelist, iov.iov_base,
+                                          CEPH_FSCRYPT_BLOCK_SIZE);
+               if (ret)
+                       goto out;
+       }
+       /* success: hand the pagelist over to the request */
+       req->r_pagelist = pagelist;
+ out:
+       dout("%s %p size dropping cap refs on %s\n", __func__,
+            inode, ceph_cap_string(got));
+       ceph_put_cap_refs(ci, got);
+       if (iov.iov_base)
+               kunmap_local(iov.iov_base);
+       if (page)
+               __free_pages(page, 0);
+       /* the pagelist is only kept when it was attached to the request */
+       if (ret && pagelist)
+               ceph_pagelist_release(pagelist);
+       return ret;
+ }
+ int __ceph_setattr(struct inode *inode, struct iattr *attr,
+                  struct ceph_iattr *cia)
  {
        struct ceph_inode_info *ci = ceph_inode(inode);
        unsigned int ia_valid = attr->ia_valid;
        struct ceph_mds_request *req;
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_cap_flush *prealloc_cf;
+       loff_t isize = i_size_read(inode);
        int issued;
        int release = 0, dirtied = 0;
        int mask = 0;
        int err = 0;
        int inode_dirty_flags = 0;
        bool lock_snap_rwsem = false;
+       bool fill_fscrypt;
+       int truncate_retry = 20; /* The RMW will take around 50ms */
  
+ retry:
        prealloc_cf = ceph_alloc_cap_flush();
        if (!prealloc_cf)
                return -ENOMEM;
                return PTR_ERR(req);
        }
  
+       fill_fscrypt = false;
        spin_lock(&ci->i_ceph_lock);
        issued = __ceph_caps_issued(ci, NULL);
  
        }
  
        dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+       if (cia && cia->fscrypt_auth) {
+               u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
+               if (len > sizeof(*cia->fscrypt_auth)) {
+                       err = -EINVAL;
+                       spin_unlock(&ci->i_ceph_lock);
+                       goto out;
+               }
+               dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
+                       ceph_vinop(inode), ci->fscrypt_auth_len, len);
+               /* It should never be re-set once set */
+               WARN_ON_ONCE(ci->fscrypt_auth);
+               if (issued & CEPH_CAP_AUTH_EXCL) {
+                       dirtied |= CEPH_CAP_AUTH_EXCL;
+                       kfree(ci->fscrypt_auth);
+                       ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
+                       ci->fscrypt_auth_len = len;
+               } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+                          ci->fscrypt_auth_len != len ||
+                          memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
+                       req->r_fscrypt_auth = cia->fscrypt_auth;
+                       mask |= CEPH_SETATTR_FSCRYPT_AUTH;
+                       release |= CEPH_CAP_AUTH_SHARED;
+               }
+               cia->fscrypt_auth = NULL;
+       }
+ #else
+       if (cia && cia->fscrypt_auth) {
+               err = -EINVAL;
+               spin_unlock(&ci->i_ceph_lock);
+               goto out;
+       }
+ #endif /* CONFIG_FS_ENCRYPTION */
  
        if (ia_valid & ATTR_UID) {
                dout("setattr %p uid %d -> %d\n", inode,
                }
        }
        if (ia_valid & ATTR_SIZE) {
-               loff_t isize = i_size_read(inode);
                dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
-               if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+               /*
+                * The RMW is needed only when the new size is smaller and
+                * not aligned to CEPH_FSCRYPT_BLOCK_SIZE.
+                */
+               if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
+                   (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
+                       mask |= CEPH_SETATTR_SIZE;
+                       release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+                                  CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+                       set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+                       mask |= CEPH_SETATTR_FSCRYPT_FILE;
+                       req->r_args.setattr.size =
+                               cpu_to_le64(round_up(attr->ia_size,
+                                                    CEPH_FSCRYPT_BLOCK_SIZE));
+                       req->r_args.setattr.old_size =
+                               cpu_to_le64(round_up(isize,
+                                                    CEPH_FSCRYPT_BLOCK_SIZE));
+                       req->r_fscrypt_file = attr->ia_size;
+                       fill_fscrypt = true;
+               } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
                        if (attr->ia_size > isize) {
                                i_size_write(inode, attr->ia_size);
                                inode->i_blocks = calc_inode_blocks(attr->ia_size);
                        }
                } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
                           attr->ia_size != isize) {
-                       req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
-                       req->r_args.setattr.old_size = cpu_to_le64(isize);
                        mask |= CEPH_SETATTR_SIZE;
                        release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
                                   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+                       if (IS_ENCRYPTED(inode) && attr->ia_size) {
+                               set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+                               mask |= CEPH_SETATTR_FSCRYPT_FILE;
+                               req->r_args.setattr.size =
+                                       cpu_to_le64(round_up(attr->ia_size,
+                                                            CEPH_FSCRYPT_BLOCK_SIZE));
+                               req->r_args.setattr.old_size =
+                                       cpu_to_le64(round_up(isize,
+                                                            CEPH_FSCRYPT_BLOCK_SIZE));
+                               req->r_fscrypt_file = attr->ia_size;
+                       } else {
+                               req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+                               req->r_args.setattr.old_size = cpu_to_le64(isize);
+                               req->r_fscrypt_file = 0;
+                       }
                }
        }
        if (ia_valid & ATTR_MTIME) {
                bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
                                         ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
                dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
 -                   inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
 +                   inode_get_ctime(inode).tv_sec,
 +                   inode_get_ctime(inode).tv_nsec,
                     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
                     only ? "ctime only" : "ignored");
                if (only) {
        if (dirtied) {
                inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
                                                           &prealloc_cf);
 -              inode->i_ctime = attr->ia_ctime;
 +              inode_set_ctime_to_ts(inode, attr->ia_ctime);
                inode_inc_iversion_raw(inode);
        }
  
        release &= issued;
        spin_unlock(&ci->i_ceph_lock);
-       if (lock_snap_rwsem)
+       if (lock_snap_rwsem) {
                up_read(&mdsc->snap_rwsem);
+               lock_snap_rwsem = false;
+       }
  
        if (inode_dirty_flags)
                __mark_inode_dirty(inode, inode_dirty_flags);
                req->r_args.setattr.mask = cpu_to_le32(mask);
                req->r_num_caps = 1;
                req->r_stamp = attr->ia_ctime;
+               if (fill_fscrypt) {
+                       err = fill_fscrypt_truncate(inode, req, attr);
+                       if (err)
+                               goto out;
+               }
+               /*
+                * The truncate request will return -EAGAIN when the
+                * last block has been updated just before the MDS
+                * successfully gets the xlock for the FILE lock. To
+                * avoid corrupting the file contents we need to retry
+                * it.
+                */
                err = ceph_mdsc_do_request(mdsc, NULL, req);
+               if (err == -EAGAIN && truncate_retry--) {
+                       dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
+                            inode, err, ceph_cap_string(dirtied), mask);
+                       ceph_mdsc_put_request(req);
+                       ceph_free_cap_flush(prealloc_cf);
+                       goto retry;
+               }
        }
+ out:
        dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
             ceph_cap_string(dirtied), mask);
  
@@@ -2242,6 -2749,10 +2751,10 @@@ int ceph_setattr(struct mnt_idmap *idma
        if (ceph_inode_is_shutdown(inode))
                return -ESTALE;
  
+       err = fscrypt_prepare_setattr(dentry, attr);
+       if (err)
+               return err;
        err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
        if (err != 0)
                return err;
            ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
                return -EDQUOT;
  
-       err = __ceph_setattr(inode, attr);
+       err = __ceph_setattr(inode, attr, NULL);
  
        if (err >= 0 && (attr->ia_valid & ATTR_MODE))
                err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
@@@ -2467,7 -2978,7 +2980,7 @@@ int ceph_getattr(struct mnt_idmap *idma
                        return err;
        }
  
 -      generic_fillattr(&nop_mnt_idmap, inode, stat);
 +      generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        stat->ino = ceph_present_inode(inode);
  
        /*
                        stat->nlink = 1 + 1 + ci->i_subdirs;
        }
  
-       stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
        stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
+       if (IS_ENCRYPTED(inode))
+               stat->attributes |= STATX_ATTR_ENCRYPTED;
+       stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC |
+                                 STATX_ATTR_ENCRYPTED);
        stat->result_mask = request_mask & valid_mask;
        return err;
  }
diff --combined fs/ceph/snap.c
index c9920ade15f5f497a8acdb2378ec99b5f2960186,7ddc6bad77ef3f6d04432a1a3608ebdabe10fcec..813f21add992c144678de60faae62b341dff4d38
@@@ -660,7 -660,7 +660,7 @@@ int __ceph_finish_cap_snap(struct ceph_
        capsnap->size = i_size_read(inode);
        capsnap->mtime = inode->i_mtime;
        capsnap->atime = inode->i_atime;
 -      capsnap->ctime = inode->i_ctime;
 +      capsnap->ctime = inode_get_ctime(inode);
        capsnap->btime = ci->i_btime;
        capsnap->change_attr = inode_peek_iversion_raw(inode);
        capsnap->time_warp_seq = ci->i_time_warp_seq;
@@@ -1015,6 -1015,9 +1015,9 @@@ void ceph_handle_snap(struct ceph_mds_c
        int locked_rwsem = 0;
        bool close_sessions = false;
  
+       if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+               return;
        /* decode */
        if (msg->front.iov_len < sizeof(*h))
                goto bad;
        dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
             mds, ceph_snap_op_name(op), split, trace_len);
  
-       mutex_lock(&session->s_mutex);
-       inc_session_sequence(session);
-       mutex_unlock(&session->s_mutex);
        down_write(&mdsc->snap_rwsem);
        locked_rwsem = 1;
  
@@@ -1151,6 -1150,7 +1150,7 @@@ skip_inode
        up_write(&mdsc->snap_rwsem);
  
        flush_snaps(mdsc);
+       ceph_dec_mds_stopping_blocker(mdsc);
        return;
  
  bad:
@@@ -1160,6 -1160,8 +1160,8 @@@ out
        if (locked_rwsem)
                up_write(&mdsc->snap_rwsem);
  
+       ceph_dec_mds_stopping_blocker(mdsc);
        if (close_sessions)
                ceph_mdsc_close_sessions(mdsc);
        return;
diff --combined fs/ceph/xattr.c
index 1cbd84cc82a83724193a4e8d0a33ab2f7b73cc92,6cd82404202acedb841c2c8ef57ea5c7770c9ee3..0deae4a0f5f169b4738019487ca4a25624d3f014
@@@ -352,6 -352,24 +352,24 @@@ static ssize_t ceph_vxattrcb_auth_mds(s
        return ret;
  }
  
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ static bool ceph_vxattrcb_fscrypt_auth_exists(struct ceph_inode_info *ci)
+ {
+       return ci->fscrypt_auth_len;
+ }
+ static ssize_t ceph_vxattrcb_fscrypt_auth(struct ceph_inode_info *ci,
+                                         char *val, size_t size)
+ {
+       if (size) {
+               if (size < ci->fscrypt_auth_len)
+                       return -ERANGE;
+               memcpy(val, ci->fscrypt_auth, ci->fscrypt_auth_len);
+       }
+       return ci->fscrypt_auth_len;
+ }
+ #endif /* CONFIG_FS_ENCRYPTION */
  #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
  #define CEPH_XATTR_NAME2(_type, _name, _name2)        \
        XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
@@@ -500,6 -518,15 +518,15 @@@ static struct ceph_vxattr ceph_common_v
                .exists_cb = NULL,
                .flags = VXATTR_FLAG_READONLY,
        },
+ #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+       {
+               .name = "ceph.fscrypt.auth",
+               .name_size = sizeof("ceph.fscrypt.auth"),
+               .getxattr_cb = ceph_vxattrcb_fscrypt_auth,
+               .exists_cb = ceph_vxattrcb_fscrypt_auth_exists,
+               .flags = VXATTR_FLAG_READONLY,
+       },
+ #endif /* CONFIG_FS_ENCRYPTION */
        { .name = NULL, 0 }     /* Required table terminator */
  };
  
@@@ -1238,7 -1265,7 +1265,7 @@@ retry
                dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
                                               &prealloc_cf);
                ci->i_xattrs.dirty = true;
 -              inode->i_ctime = current_time(inode);
 +              inode_set_ctime_current(inode);
        }
  
        spin_unlock(&ci->i_ceph_lock);
@@@ -1407,6 -1434,9 +1434,9 @@@ void ceph_release_acl_sec_ctx(struct ce
  #endif
  #ifdef CONFIG_CEPH_FS_SECURITY_LABEL
        security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen);
+ #endif
+ #ifdef CONFIG_FS_ENCRYPTION
+       kfree(as_ctx->fscrypt_auth);
  #endif
        if (as_ctx->pagelist)
                ceph_pagelist_release(as_ctx->pagelist);
This page took 0.189372 seconds and 4 git commands to generate.