]> Git Repo - linux.git/blob - fs/bcachefs/fs.c
filelock: Remove locks reliably when fcntl/close race is detected
[linux.git] / fs / bcachefs / fs.c
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "errcode.h"
12 #include "extents.h"
13 #include "fs.h"
14 #include "fs-common.h"
15 #include "fs-io.h"
16 #include "fs-ioctl.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
20 #include "fsck.h"
21 #include "inode.h"
22 #include "io_read.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "quota.h"
26 #include "snapshot.h"
27 #include "super.h"
28 #include "xattr.h"
29
30 #include <linux/aio.h>
31 #include <linux/backing-dev.h>
32 #include <linux/exportfs.h>
33 #include <linux/fiemap.h>
34 #include <linux/module.h>
35 #include <linux/pagemap.h>
36 #include <linux/posix_acl.h>
37 #include <linux/random.h>
38 #include <linux/seq_file.h>
39 #include <linux/statfs.h>
40 #include <linux/string.h>
41 #include <linux/xattr.h>
42
43 static struct kmem_cache *bch2_inode_cache;
44
45 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
46                                 struct bch_inode_info *,
47                                 struct bch_inode_unpacked *,
48                                 struct bch_subvolume *);
49
50 void bch2_inode_update_after_write(struct btree_trans *trans,
51                                    struct bch_inode_info *inode,
52                                    struct bch_inode_unpacked *bi,
53                                    unsigned fields)
54 {
55         struct bch_fs *c = trans->c;
56
57         BUG_ON(bi->bi_inum != inode->v.i_ino);
58
59         bch2_assert_pos_locked(trans, BTREE_ID_inodes,
60                                POS(0, bi->bi_inum),
61                                c->opts.inodes_use_key_cache);
62
63         set_nlink(&inode->v, bch2_inode_nlink_get(bi));
64         i_uid_write(&inode->v, bi->bi_uid);
65         i_gid_write(&inode->v, bi->bi_gid);
66         inode->v.i_mode = bi->bi_mode;
67
68         if (fields & ATTR_ATIME)
69                 inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
70         if (fields & ATTR_MTIME)
71                 inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
72         if (fields & ATTR_CTIME)
73                 inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
74
75         inode->ei_inode         = *bi;
76
77         bch2_inode_flags_to_vfs(inode);
78 }
79
80 int __must_check bch2_write_inode(struct bch_fs *c,
81                                   struct bch_inode_info *inode,
82                                   inode_set_fn set,
83                                   void *p, unsigned fields)
84 {
85         struct btree_trans *trans = bch2_trans_get(c);
86         struct btree_iter iter = { NULL };
87         struct bch_inode_unpacked inode_u;
88         int ret;
89 retry:
90         bch2_trans_begin(trans);
91
92         ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
93                                 BTREE_ITER_intent) ?:
94                 (set ? set(trans, inode, &inode_u, p) : 0) ?:
95                 bch2_inode_write(trans, &iter, &inode_u) ?:
96                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
97
98         /*
99          * the btree node lock protects inode->ei_inode, not ei_update_lock;
100          * this is important for inode updates via bchfs_write_index_update
101          */
102         if (!ret)
103                 bch2_inode_update_after_write(trans, inode, &inode_u, fields);
104
105         bch2_trans_iter_exit(trans, &iter);
106
107         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
108                 goto retry;
109
110         bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
111                              "%s: inode %u:%llu not found when updating",
112                              bch2_err_str(ret),
113                              inode_inum(inode).subvol,
114                              inode_inum(inode).inum);
115
116         bch2_trans_put(trans);
117         return ret < 0 ? ret : 0;
118 }
119
120 int bch2_fs_quota_transfer(struct bch_fs *c,
121                            struct bch_inode_info *inode,
122                            struct bch_qid new_qid,
123                            unsigned qtypes,
124                            enum quota_acct_mode mode)
125 {
126         unsigned i;
127         int ret;
128
129         qtypes &= enabled_qtypes(c);
130
131         for (i = 0; i < QTYP_NR; i++)
132                 if (new_qid.q[i] == inode->ei_qid.q[i])
133                         qtypes &= ~(1U << i);
134
135         if (!qtypes)
136                 return 0;
137
138         mutex_lock(&inode->ei_quota_lock);
139
140         ret = bch2_quota_transfer(c, qtypes, new_qid,
141                                   inode->ei_qid,
142                                   inode->v.i_blocks +
143                                   inode->ei_quota_reserved,
144                                   mode);
145         if (!ret)
146                 for (i = 0; i < QTYP_NR; i++)
147                         if (qtypes & (1 << i))
148                                 inode->ei_qid.q[i] = new_qid.q[i];
149
150         mutex_unlock(&inode->ei_quota_lock);
151
152         return ret;
153 }
154
155 static int bch2_iget5_test(struct inode *vinode, void *p)
156 {
157         struct bch_inode_info *inode = to_bch_ei(vinode);
158         subvol_inum *inum = p;
159
160         return inode->ei_subvol == inum->subvol &&
161                 inode->ei_inode.bi_inum == inum->inum;
162 }
163
164 static int bch2_iget5_set(struct inode *vinode, void *p)
165 {
166         struct bch_inode_info *inode = to_bch_ei(vinode);
167         subvol_inum *inum = p;
168
169         inode->v.i_ino          = inum->inum;
170         inode->ei_subvol        = inum->subvol;
171         inode->ei_inode.bi_inum = inum->inum;
172         return 0;
173 }
174
175 static unsigned bch2_inode_hash(subvol_inum inum)
176 {
177         return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
178 }
179
180 static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
181 {
182         subvol_inum inum = inode_inum(inode);
183         struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
184                                       bch2_inode_hash(inum),
185                                       bch2_iget5_test,
186                                       bch2_iget5_set,
187                                       &inum));
188         BUG_ON(!old);
189
190         if (unlikely(old != inode)) {
191                 /*
192                  * bcachefs doesn't use I_NEW; we have no use for it since we
193                  * only insert fully created inodes in the inode hash table. But
194                  * discard_new_inode() expects it to be set...
195                  */
196                 inode->v.i_flags |= I_NEW;
197                 discard_new_inode(&inode->v);
198                 inode = old;
199         } else {
200                 mutex_lock(&c->vfs_inodes_lock);
201                 list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
202                 mutex_unlock(&c->vfs_inodes_lock);
203                 /*
204                  * Again, I_NEW makes no sense for bcachefs. This is only needed
205                  * for clearing I_NEW, but since the inode was already fully
206                  * created and initialized we didn't actually want
207                  * inode_insert5() to set it for us.
208                  */
209                 unlock_new_inode(&inode->v);
210         }
211
212         return inode;
213 }
214
/*
 * Evaluate @_do with the task allocation flags @_flags (PF_MEMALLOC_*) set,
 * and yield its value.
 *
 * The value returned by memalloc_flags_save() is handed back to its matching
 * memalloc_flags_restore() afterwards, rather than to the restore helper of a
 * different flag, so the pairing does not depend on how that other helper
 * happens to be implemented.
 */
#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = (_do);						\
	memalloc_flags_restore(_saved_flags);					\
	_ret;									\
})
222
/*
 * Must never be called: it unconditionally BUG()s.  Inodes in this file are
 * allocated explicitly through __bch2_new_inode() instead.
 */
static struct inode *bch2_alloc_inode(struct super_block *sb)
{
	BUG();
}
227
228 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
229 {
230         struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
231         if (!inode)
232                 return NULL;
233
234         inode_init_once(&inode->v);
235         mutex_init(&inode->ei_update_lock);
236         two_state_lock_init(&inode->ei_pagecache_lock);
237         INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
238         inode->ei_flags = 0;
239         mutex_init(&inode->ei_quota_lock);
240         memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
241         inode->v.i_state = 0;
242
243         if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
244                 kmem_cache_free(bch2_inode_cache, inode);
245                 return NULL;
246         }
247
248         return inode;
249 }
250
251 /*
252  * Allocate a new inode, dropping/retaking btree locks if necessary:
253  */
254 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
255 {
256         struct bch_inode_info *inode =
257                 memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
258                                   __bch2_new_inode(trans->c));
259
260         if (unlikely(!inode)) {
261                 int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
262                 if (ret && inode) {
263                         __destroy_inode(&inode->v);
264                         kmem_cache_free(bch2_inode_cache, inode);
265                 }
266                 if (ret)
267                         return ERR_PTR(ret);
268         }
269
270         return inode;
271 }
272
273 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
274 {
275         struct bch_inode_info *inode =
276                 to_bch_ei(ilookup5_nowait(c->vfs_sb,
277                                           bch2_inode_hash(inum),
278                                           bch2_iget5_test,
279                                           &inum));
280         if (inode)
281                 return &inode->v;
282
283         struct btree_trans *trans = bch2_trans_get(c);
284
285         struct bch_inode_unpacked inode_u;
286         struct bch_subvolume subvol;
287         int ret = lockrestart_do(trans,
288                 bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
289                 bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
290                 PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
291         if (!ret) {
292                 bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
293                 inode = bch2_inode_insert(c, inode);
294         }
295         bch2_trans_put(trans);
296
297         return ret ? ERR_PTR(ret) : &inode->v;
298 }
299
300 struct bch_inode_info *
301 __bch2_create(struct mnt_idmap *idmap,
302               struct bch_inode_info *dir, struct dentry *dentry,
303               umode_t mode, dev_t rdev, subvol_inum snapshot_src,
304               unsigned flags)
305 {
306         struct bch_fs *c = dir->v.i_sb->s_fs_info;
307         struct btree_trans *trans;
308         struct bch_inode_unpacked dir_u;
309         struct bch_inode_info *inode;
310         struct bch_inode_unpacked inode_u;
311         struct posix_acl *default_acl = NULL, *acl = NULL;
312         subvol_inum inum;
313         struct bch_subvolume subvol;
314         u64 journal_seq = 0;
315         int ret;
316
317         /*
318          * preallocate acls + vfs inode before btree transaction, so that
319          * nothing can fail after the transaction succeeds:
320          */
321 #ifdef CONFIG_BCACHEFS_POSIX_ACL
322         ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
323         if (ret)
324                 return ERR_PTR(ret);
325 #endif
326         inode = __bch2_new_inode(c);
327         if (unlikely(!inode)) {
328                 inode = ERR_PTR(-ENOMEM);
329                 goto err;
330         }
331
332         bch2_inode_init_early(c, &inode_u);
333
334         if (!(flags & BCH_CREATE_TMPFILE))
335                 mutex_lock(&dir->ei_update_lock);
336
337         trans = bch2_trans_get(c);
338 retry:
339         bch2_trans_begin(trans);
340
341         ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
342                 bch2_create_trans(trans,
343                                   inode_inum(dir), &dir_u, &inode_u,
344                                   !(flags & BCH_CREATE_TMPFILE)
345                                   ? &dentry->d_name : NULL,
346                                   from_kuid(i_user_ns(&dir->v), current_fsuid()),
347                                   from_kgid(i_user_ns(&dir->v), current_fsgid()),
348                                   mode, rdev,
349                                   default_acl, acl, snapshot_src, flags) ?:
350                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
351                                 KEY_TYPE_QUOTA_PREALLOC);
352         if (unlikely(ret))
353                 goto err_before_quota;
354
355         inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
356         inum.inum = inode_u.bi_inum;
357
358         ret   = bch2_subvolume_get(trans, inum.subvol, true,
359                                    BTREE_ITER_with_updates, &subvol) ?:
360                 bch2_trans_commit(trans, NULL, &journal_seq, 0);
361         if (unlikely(ret)) {
362                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
363                                 KEY_TYPE_QUOTA_WARN);
364 err_before_quota:
365                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
366                         goto retry;
367                 goto err_trans;
368         }
369
370         if (!(flags & BCH_CREATE_TMPFILE)) {
371                 bch2_inode_update_after_write(trans, dir, &dir_u,
372                                               ATTR_MTIME|ATTR_CTIME);
373                 mutex_unlock(&dir->ei_update_lock);
374         }
375
376         bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
377
378         set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
379         set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
380
381         /*
382          * we must insert the new inode into the inode cache before calling
383          * bch2_trans_exit() and dropping locks, else we could race with another
384          * thread pulling the inode in and modifying it:
385          */
386         inode = bch2_inode_insert(c, inode);
387         bch2_trans_put(trans);
388 err:
389         posix_acl_release(default_acl);
390         posix_acl_release(acl);
391         return inode;
392 err_trans:
393         if (!(flags & BCH_CREATE_TMPFILE))
394                 mutex_unlock(&dir->ei_update_lock);
395
396         bch2_trans_put(trans);
397         make_bad_inode(&inode->v);
398         iput(&inode->v);
399         inode = ERR_PTR(ret);
400         goto err;
401 }
402
403 /* methods */
404
405 static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
406                         subvol_inum dir, struct bch_hash_info *dir_hash_info,
407                         const struct qstr *name)
408 {
409         struct bch_fs *c = trans->c;
410         struct btree_iter dirent_iter = {};
411         subvol_inum inum = {};
412         struct printbuf buf = PRINTBUF;
413
414         struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
415                                              dir_hash_info, dir, name, 0);
416         int ret = bkey_err(k);
417         if (ret)
418                 return ERR_PTR(ret);
419
420         ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
421         if (ret > 0)
422                 ret = -ENOENT;
423         if (ret)
424                 goto err;
425
426         struct bch_inode_info *inode =
427                 to_bch_ei(ilookup5_nowait(c->vfs_sb,
428                                           bch2_inode_hash(inum),
429                                           bch2_iget5_test,
430                                           &inum));
431         if (inode)
432                 goto out;
433
434         struct bch_subvolume subvol;
435         struct bch_inode_unpacked inode_u;
436         ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
437                 bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
438                 PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
439
440         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
441                                 c, "dirent to missing inode:\n  %s",
442                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
443         if (ret)
444                 goto err;
445
446         /* regular files may have hardlinks: */
447         if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
448                                     !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
449                                     c,
450                                     "dirent points to inode that does not point back:\n  %s",
451                                     (bch2_bkey_val_to_text(&buf, c, k),
452                                      prt_printf(&buf, "\n  "),
453                                      bch2_inode_unpacked_to_text(&buf, &inode_u),
454                                      buf.buf))) {
455                 ret = -ENOENT;
456                 goto err;
457         }
458
459         bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
460         inode = bch2_inode_insert(c, inode);
461 out:
462         bch2_trans_iter_exit(trans, &dirent_iter);
463         printbuf_exit(&buf);
464         return inode;
465 err:
466         inode = ERR_PTR(ret);
467         goto out;
468 }
469
470 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
471                                   unsigned int flags)
472 {
473         struct bch_fs *c = vdir->i_sb->s_fs_info;
474         struct bch_inode_info *dir = to_bch_ei(vdir);
475         struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
476
477         struct bch_inode_info *inode;
478         bch2_trans_do(c, NULL, NULL, 0,
479                 PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
480                                                           &hash, &dentry->d_name)));
481         if (IS_ERR(inode))
482                 inode = NULL;
483
484         return d_splice_alias(&inode->v, dentry);
485 }
486
487 static int bch2_mknod(struct mnt_idmap *idmap,
488                       struct inode *vdir, struct dentry *dentry,
489                       umode_t mode, dev_t rdev)
490 {
491         struct bch_inode_info *inode =
492                 __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
493                               (subvol_inum) { 0 }, 0);
494
495         if (IS_ERR(inode))
496                 return bch2_err_class(PTR_ERR(inode));
497
498         d_instantiate(dentry, &inode->v);
499         return 0;
500 }
501
502 static int bch2_create(struct mnt_idmap *idmap,
503                        struct inode *vdir, struct dentry *dentry,
504                        umode_t mode, bool excl)
505 {
506         return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
507 }
508
509 static int __bch2_link(struct bch_fs *c,
510                        struct bch_inode_info *inode,
511                        struct bch_inode_info *dir,
512                        struct dentry *dentry)
513 {
514         struct btree_trans *trans = bch2_trans_get(c);
515         struct bch_inode_unpacked dir_u, inode_u;
516         int ret;
517
518         mutex_lock(&inode->ei_update_lock);
519
520         ret = commit_do(trans, NULL, NULL, 0,
521                         bch2_link_trans(trans,
522                                         inode_inum(dir),   &dir_u,
523                                         inode_inum(inode), &inode_u,
524                                         &dentry->d_name));
525
526         if (likely(!ret)) {
527                 bch2_inode_update_after_write(trans, dir, &dir_u,
528                                               ATTR_MTIME|ATTR_CTIME);
529                 bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
530         }
531
532         bch2_trans_put(trans);
533         mutex_unlock(&inode->ei_update_lock);
534         return ret;
535 }
536
537 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
538                      struct dentry *dentry)
539 {
540         struct bch_fs *c = vdir->i_sb->s_fs_info;
541         struct bch_inode_info *dir = to_bch_ei(vdir);
542         struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
543         int ret;
544
545         lockdep_assert_held(&inode->v.i_rwsem);
546
547         ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
548                 bch2_subvol_is_ro(c, inode->ei_subvol) ?:
549                 __bch2_link(c, inode, dir, dentry);
550         if (unlikely(ret))
551                 return bch2_err_class(ret);
552
553         ihold(&inode->v);
554         d_instantiate(dentry, &inode->v);
555         return 0;
556 }
557
558 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
559                   bool deleting_snapshot)
560 {
561         struct bch_fs *c = vdir->i_sb->s_fs_info;
562         struct bch_inode_info *dir = to_bch_ei(vdir);
563         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
564         struct bch_inode_unpacked dir_u, inode_u;
565         struct btree_trans *trans = bch2_trans_get(c);
566         int ret;
567
568         bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
569
570         ret = commit_do(trans, NULL, NULL,
571                         BCH_TRANS_COMMIT_no_enospc,
572                 bch2_unlink_trans(trans,
573                                   inode_inum(dir), &dir_u,
574                                   &inode_u, &dentry->d_name,
575                                   deleting_snapshot));
576         if (unlikely(ret))
577                 goto err;
578
579         bch2_inode_update_after_write(trans, dir, &dir_u,
580                                       ATTR_MTIME|ATTR_CTIME);
581         bch2_inode_update_after_write(trans, inode, &inode_u,
582                                       ATTR_MTIME);
583
584         if (inode_u.bi_subvol) {
585                 /*
586                  * Subvolume deletion is asynchronous, but we still want to tell
587                  * the VFS that it's been deleted here:
588                  */
589                 set_nlink(&inode->v, 0);
590         }
591 err:
592         bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
593         bch2_trans_put(trans);
594
595         return ret;
596 }
597
598 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
599 {
600         struct bch_inode_info *dir= to_bch_ei(vdir);
601         struct bch_fs *c = dir->v.i_sb->s_fs_info;
602
603         int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
604                 __bch2_unlink(vdir, dentry, false);
605         return bch2_err_class(ret);
606 }
607
608 static int bch2_symlink(struct mnt_idmap *idmap,
609                         struct inode *vdir, struct dentry *dentry,
610                         const char *symname)
611 {
612         struct bch_fs *c = vdir->i_sb->s_fs_info;
613         struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
614         int ret;
615
616         inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
617                               (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
618         if (IS_ERR(inode))
619                 return bch2_err_class(PTR_ERR(inode));
620
621         inode_lock(&inode->v);
622         ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
623         inode_unlock(&inode->v);
624
625         if (unlikely(ret))
626                 goto err;
627
628         ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
629         if (unlikely(ret))
630                 goto err;
631
632         ret = __bch2_link(c, inode, dir, dentry);
633         if (unlikely(ret))
634                 goto err;
635
636         d_instantiate(dentry, &inode->v);
637         return 0;
638 err:
639         iput(&inode->v);
640         return bch2_err_class(ret);
641 }
642
643 static int bch2_mkdir(struct mnt_idmap *idmap,
644                       struct inode *vdir, struct dentry *dentry, umode_t mode)
645 {
646         return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
647 }
648
649 static int bch2_rename2(struct mnt_idmap *idmap,
650                         struct inode *src_vdir, struct dentry *src_dentry,
651                         struct inode *dst_vdir, struct dentry *dst_dentry,
652                         unsigned flags)
653 {
654         struct bch_fs *c = src_vdir->i_sb->s_fs_info;
655         struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
656         struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
657         struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
658         struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
659         struct bch_inode_unpacked dst_dir_u, src_dir_u;
660         struct bch_inode_unpacked src_inode_u, dst_inode_u;
661         struct btree_trans *trans;
662         enum bch_rename_mode mode = flags & RENAME_EXCHANGE
663                 ? BCH_RENAME_EXCHANGE
664                 : dst_dentry->d_inode
665                 ? BCH_RENAME_OVERWRITE : BCH_RENAME;
666         int ret;
667
668         if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
669                 return -EINVAL;
670
671         if (mode == BCH_RENAME_OVERWRITE) {
672                 ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
673                                                    0, LLONG_MAX);
674                 if (ret)
675                         return ret;
676         }
677
678         trans = bch2_trans_get(c);
679
680         bch2_lock_inodes(INODE_UPDATE_LOCK,
681                          src_dir,
682                          dst_dir,
683                          src_inode,
684                          dst_inode);
685
686         ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
687                 bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
688         if (ret)
689                 goto err;
690
691         if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
692                 ret = bch2_fs_quota_transfer(c, src_inode,
693                                              dst_dir->ei_qid,
694                                              1 << QTYP_PRJ,
695                                              KEY_TYPE_QUOTA_PREALLOC);
696                 if (ret)
697                         goto err;
698         }
699
700         if (mode == BCH_RENAME_EXCHANGE &&
701             inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
702                 ret = bch2_fs_quota_transfer(c, dst_inode,
703                                              src_dir->ei_qid,
704                                              1 << QTYP_PRJ,
705                                              KEY_TYPE_QUOTA_PREALLOC);
706                 if (ret)
707                         goto err;
708         }
709
710         ret = commit_do(trans, NULL, NULL, 0,
711                         bch2_rename_trans(trans,
712                                           inode_inum(src_dir), &src_dir_u,
713                                           inode_inum(dst_dir), &dst_dir_u,
714                                           &src_inode_u,
715                                           &dst_inode_u,
716                                           &src_dentry->d_name,
717                                           &dst_dentry->d_name,
718                                           mode));
719         if (unlikely(ret))
720                 goto err;
721
722         BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
723         BUG_ON(dst_inode &&
724                dst_inode->v.i_ino != dst_inode_u.bi_inum);
725
726         bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
727                                       ATTR_MTIME|ATTR_CTIME);
728
729         if (src_dir != dst_dir)
730                 bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
731                                               ATTR_MTIME|ATTR_CTIME);
732
733         bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
734                                       ATTR_CTIME);
735
736         if (dst_inode)
737                 bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
738                                               ATTR_CTIME);
739 err:
740         bch2_trans_put(trans);
741
742         bch2_fs_quota_transfer(c, src_inode,
743                                bch_qid(&src_inode->ei_inode),
744                                1 << QTYP_PRJ,
745                                KEY_TYPE_QUOTA_NOCHECK);
746         if (dst_inode)
747                 bch2_fs_quota_transfer(c, dst_inode,
748                                        bch_qid(&dst_inode->ei_inode),
749                                        1 << QTYP_PRJ,
750                                        KEY_TYPE_QUOTA_NOCHECK);
751
752         bch2_unlock_inodes(INODE_UPDATE_LOCK,
753                            src_dir,
754                            dst_dir,
755                            src_inode,
756                            dst_inode);
757
758         return bch2_err_class(ret);
759 }
760
761 static void bch2_setattr_copy(struct mnt_idmap *idmap,
762                               struct bch_inode_info *inode,
763                               struct bch_inode_unpacked *bi,
764                               struct iattr *attr)
765 {
766         struct bch_fs *c = inode->v.i_sb->s_fs_info;
767         unsigned int ia_valid = attr->ia_valid;
768
769         if (ia_valid & ATTR_UID)
770                 bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
771         if (ia_valid & ATTR_GID)
772                 bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
773
774         if (ia_valid & ATTR_SIZE)
775                 bi->bi_size = attr->ia_size;
776
777         if (ia_valid & ATTR_ATIME)
778                 bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
779         if (ia_valid & ATTR_MTIME)
780                 bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
781         if (ia_valid & ATTR_CTIME)
782                 bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
783
784         if (ia_valid & ATTR_MODE) {
785                 umode_t mode = attr->ia_mode;
786                 kgid_t gid = ia_valid & ATTR_GID
787                         ? attr->ia_gid
788                         : inode->v.i_gid;
789
790                 if (!in_group_p(gid) &&
791                     !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
792                         mode &= ~S_ISGID;
793                 bi->bi_mode = mode;
794         }
795 }
796
797 int bch2_setattr_nonsize(struct mnt_idmap *idmap,
798                          struct bch_inode_info *inode,
799                          struct iattr *attr)
800 {
801         struct bch_fs *c = inode->v.i_sb->s_fs_info;
802         struct bch_qid qid;
803         struct btree_trans *trans;
804         struct btree_iter inode_iter = { NULL };
805         struct bch_inode_unpacked inode_u;
806         struct posix_acl *acl = NULL;
807         int ret;
808
809         mutex_lock(&inode->ei_update_lock);
810
811         qid = inode->ei_qid;
812
813         if (attr->ia_valid & ATTR_UID)
814                 qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
815
816         if (attr->ia_valid & ATTR_GID)
817                 qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
818
819         ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
820                                      KEY_TYPE_QUOTA_PREALLOC);
821         if (ret)
822                 goto err;
823
824         trans = bch2_trans_get(c);
825 retry:
826         bch2_trans_begin(trans);
827         kfree(acl);
828         acl = NULL;
829
830         ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
831                               BTREE_ITER_intent);
832         if (ret)
833                 goto btree_err;
834
835         bch2_setattr_copy(idmap, inode, &inode_u, attr);
836
837         if (attr->ia_valid & ATTR_MODE) {
838                 ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
839                                      inode_u.bi_mode, &acl);
840                 if (ret)
841                         goto btree_err;
842         }
843
844         ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
845                 bch2_trans_commit(trans, NULL, NULL,
846                                   BCH_TRANS_COMMIT_no_enospc);
847 btree_err:
848         bch2_trans_iter_exit(trans, &inode_iter);
849
850         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
851                 goto retry;
852         if (unlikely(ret))
853                 goto err_trans;
854
855         bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
856
857         if (acl)
858                 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
859 err_trans:
860         bch2_trans_put(trans);
861 err:
862         mutex_unlock(&inode->ei_update_lock);
863
864         return bch2_err_class(ret);
865 }
866
867 static int bch2_getattr(struct mnt_idmap *idmap,
868                         const struct path *path, struct kstat *stat,
869                         u32 request_mask, unsigned query_flags)
870 {
871         struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
872         struct bch_fs *c = inode->v.i_sb->s_fs_info;
873
874         stat->dev       = inode->v.i_sb->s_dev;
875         stat->ino       = inode->v.i_ino;
876         stat->mode      = inode->v.i_mode;
877         stat->nlink     = inode->v.i_nlink;
878         stat->uid       = inode->v.i_uid;
879         stat->gid       = inode->v.i_gid;
880         stat->rdev      = inode->v.i_rdev;
881         stat->size      = i_size_read(&inode->v);
882         stat->atime     = inode_get_atime(&inode->v);
883         stat->mtime     = inode_get_mtime(&inode->v);
884         stat->ctime     = inode_get_ctime(&inode->v);
885         stat->blksize   = block_bytes(c);
886         stat->blocks    = inode->v.i_blocks;
887
888         stat->subvol    = inode->ei_subvol;
889         stat->result_mask |= STATX_SUBVOL;
890
891         if (request_mask & STATX_BTIME) {
892                 stat->result_mask |= STATX_BTIME;
893                 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
894         }
895
896         if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
897                 stat->attributes |= STATX_ATTR_IMMUTABLE;
898         stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;
899
900         if (inode->ei_inode.bi_flags & BCH_INODE_append)
901                 stat->attributes |= STATX_ATTR_APPEND;
902         stat->attributes_mask    |= STATX_ATTR_APPEND;
903
904         if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
905                 stat->attributes |= STATX_ATTR_NODUMP;
906         stat->attributes_mask    |= STATX_ATTR_NODUMP;
907
908         return 0;
909 }
910
911 static int bch2_setattr(struct mnt_idmap *idmap,
912                         struct dentry *dentry, struct iattr *iattr)
913 {
914         struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
915         struct bch_fs *c = inode->v.i_sb->s_fs_info;
916         int ret;
917
918         lockdep_assert_held(&inode->v.i_rwsem);
919
920         ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
921                 setattr_prepare(idmap, dentry, iattr);
922         if (ret)
923                 return ret;
924
925         return iattr->ia_valid & ATTR_SIZE
926                 ? bchfs_truncate(idmap, inode, iattr)
927                 : bch2_setattr_nonsize(idmap, inode, iattr);
928 }
929
930 static int bch2_tmpfile(struct mnt_idmap *idmap,
931                         struct inode *vdir, struct file *file, umode_t mode)
932 {
933         struct bch_inode_info *inode =
934                 __bch2_create(idmap, to_bch_ei(vdir),
935                               file->f_path.dentry, mode, 0,
936                               (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
937
938         if (IS_ERR(inode))
939                 return bch2_err_class(PTR_ERR(inode));
940
941         d_mark_tmpfile(file, &inode->v);
942         d_instantiate(file->f_path.dentry, &inode->v);
943         return finish_open_simple(file, 0);
944 }
945
946 static int bch2_fill_extent(struct bch_fs *c,
947                             struct fiemap_extent_info *info,
948                             struct bkey_s_c k, unsigned flags)
949 {
950         if (bkey_extent_is_direct_data(k.k)) {
951                 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
952                 const union bch_extent_entry *entry;
953                 struct extent_ptr_decoded p;
954                 int ret;
955
956                 if (k.k->type == KEY_TYPE_reflink_v)
957                         flags |= FIEMAP_EXTENT_SHARED;
958
959                 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
960                         int flags2 = 0;
961                         u64 offset = p.ptr.offset;
962
963                         if (p.ptr.unwritten)
964                                 flags2 |= FIEMAP_EXTENT_UNWRITTEN;
965
966                         if (p.crc.compression_type)
967                                 flags2 |= FIEMAP_EXTENT_ENCODED;
968                         else
969                                 offset += p.crc.offset;
970
971                         if ((offset & (block_sectors(c) - 1)) ||
972                             (k.k->size & (block_sectors(c) - 1)))
973                                 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
974
975                         ret = fiemap_fill_next_extent(info,
976                                                 bkey_start_offset(k.k) << 9,
977                                                 offset << 9,
978                                                 k.k->size << 9, flags|flags2);
979                         if (ret)
980                                 return ret;
981                 }
982
983                 return 0;
984         } else if (bkey_extent_is_inline_data(k.k)) {
985                 return fiemap_fill_next_extent(info,
986                                                bkey_start_offset(k.k) << 9,
987                                                0, k.k->size << 9,
988                                                flags|
989                                                FIEMAP_EXTENT_DATA_INLINE);
990         } else if (k.k->type == KEY_TYPE_reservation) {
991                 return fiemap_fill_next_extent(info,
992                                                bkey_start_offset(k.k) << 9,
993                                                0, k.k->size << 9,
994                                                flags|
995                                                FIEMAP_EXTENT_DELALLOC|
996                                                FIEMAP_EXTENT_UNWRITTEN);
997         } else {
998                 BUG();
999         }
1000 }
1001
1002 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
1003                        u64 start, u64 len)
1004 {
1005         struct bch_fs *c = vinode->i_sb->s_fs_info;
1006         struct bch_inode_info *ei = to_bch_ei(vinode);
1007         struct btree_trans *trans;
1008         struct btree_iter iter;
1009         struct bkey_s_c k;
1010         struct bkey_buf cur, prev;
1011         unsigned offset_into_extent, sectors;
1012         bool have_extent = false;
1013         u32 snapshot;
1014         int ret = 0;
1015
1016         ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
1017         if (ret)
1018                 return ret;
1019
1020         struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
1021         if (start + len < start)
1022                 return -EINVAL;
1023
1024         start >>= 9;
1025
1026         bch2_bkey_buf_init(&cur);
1027         bch2_bkey_buf_init(&prev);
1028         trans = bch2_trans_get(c);
1029 retry:
1030         bch2_trans_begin(trans);
1031
1032         ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
1033         if (ret)
1034                 goto err;
1035
1036         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
1037                              SPOS(ei->v.i_ino, start, snapshot), 0);
1038
1039         while (!(ret = btree_trans_too_many_iters(trans)) &&
1040                (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
1041                !(ret = bkey_err(k))) {
1042                 enum btree_id data_btree = BTREE_ID_extents;
1043
1044                 if (!bkey_extent_is_data(k.k) &&
1045                     k.k->type != KEY_TYPE_reservation) {
1046                         bch2_btree_iter_advance(&iter);
1047                         continue;
1048                 }
1049
1050                 offset_into_extent      = iter.pos.offset -
1051                         bkey_start_offset(k.k);
1052                 sectors                 = k.k->size - offset_into_extent;
1053
1054                 bch2_bkey_buf_reassemble(&cur, c, k);
1055
1056                 ret = bch2_read_indirect_extent(trans, &data_btree,
1057                                         &offset_into_extent, &cur);
1058                 if (ret)
1059                         break;
1060
1061                 k = bkey_i_to_s_c(cur.k);
1062                 bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
1063
1064                 sectors = min(sectors, k.k->size - offset_into_extent);
1065
1066                 bch2_cut_front(POS(k.k->p.inode,
1067                                    bkey_start_offset(k.k) +
1068                                    offset_into_extent),
1069                                cur.k);
1070                 bch2_key_resize(&cur.k->k, sectors);
1071                 cur.k->k.p = iter.pos;
1072                 cur.k->k.p.offset += cur.k->k.size;
1073
1074                 if (have_extent) {
1075                         bch2_trans_unlock(trans);
1076                         ret = bch2_fill_extent(c, info,
1077                                         bkey_i_to_s_c(prev.k), 0);
1078                         if (ret)
1079                                 break;
1080                 }
1081
1082                 bkey_copy(prev.k, cur.k);
1083                 have_extent = true;
1084
1085                 bch2_btree_iter_set_pos(&iter,
1086                         POS(iter.pos.inode, iter.pos.offset + sectors));
1087
1088                 ret = bch2_trans_relock(trans);
1089                 if (ret)
1090                         break;
1091         }
1092         start = iter.pos.offset;
1093         bch2_trans_iter_exit(trans, &iter);
1094 err:
1095         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1096                 goto retry;
1097
1098         if (!ret && have_extent) {
1099                 bch2_trans_unlock(trans);
1100                 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
1101                                        FIEMAP_EXTENT_LAST);
1102         }
1103
1104         bch2_trans_put(trans);
1105         bch2_bkey_buf_exit(&cur, c);
1106         bch2_bkey_buf_exit(&prev, c);
1107         return ret < 0 ? ret : 0;
1108 }
1109
1110 static const struct vm_operations_struct bch_vm_ops = {
1111         .fault          = bch2_page_fault,
1112         .map_pages      = filemap_map_pages,
1113         .page_mkwrite   = bch2_page_mkwrite,
1114 };
1115
1116 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1117 {
1118         file_accessed(file);
1119
1120         vma->vm_ops = &bch_vm_ops;
1121         return 0;
1122 }
1123
1124 /* Directories: */
1125
1126 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1127 {
1128         return generic_file_llseek_size(file, offset, whence,
1129                                         S64_MAX, S64_MAX);
1130 }
1131
1132 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1133 {
1134         struct bch_inode_info *inode = file_bch_inode(file);
1135         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1136
1137         if (!dir_emit_dots(file, ctx))
1138                 return 0;
1139
1140         int ret = bch2_readdir(c, inode_inum(inode), ctx);
1141
1142         bch_err_fn(c, ret);
1143         return bch2_err_class(ret);
1144 }
1145
1146 static int bch2_open(struct inode *vinode, struct file *file)
1147 {
1148         if (file->f_flags & (O_WRONLY|O_RDWR)) {
1149                 struct bch_inode_info *inode = to_bch_ei(vinode);
1150                 struct bch_fs *c = inode->v.i_sb->s_fs_info;
1151
1152                 int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
1153                 if (ret)
1154                         return ret;
1155         }
1156
1157         file->f_mode |= FMODE_CAN_ODIRECT;
1158
1159         return generic_file_open(vinode, file);
1160 }
1161
1162 static const struct file_operations bch_file_operations = {
1163         .open           = bch2_open,
1164         .llseek         = bch2_llseek,
1165         .read_iter      = bch2_read_iter,
1166         .write_iter     = bch2_write_iter,
1167         .mmap           = bch2_mmap,
1168         .get_unmapped_area = thp_get_unmapped_area,
1169         .fsync          = bch2_fsync,
1170         .splice_read    = filemap_splice_read,
1171         .splice_write   = iter_file_splice_write,
1172         .fallocate      = bch2_fallocate_dispatch,
1173         .unlocked_ioctl = bch2_fs_file_ioctl,
1174 #ifdef CONFIG_COMPAT
1175         .compat_ioctl   = bch2_compat_fs_ioctl,
1176 #endif
1177         .remap_file_range = bch2_remap_file_range,
1178 };
1179
1180 static const struct inode_operations bch_file_inode_operations = {
1181         .getattr        = bch2_getattr,
1182         .setattr        = bch2_setattr,
1183         .fiemap         = bch2_fiemap,
1184         .listxattr      = bch2_xattr_list,
1185 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1186         .get_acl        = bch2_get_acl,
1187         .set_acl        = bch2_set_acl,
1188 #endif
1189 };
1190
1191 static const struct inode_operations bch_dir_inode_operations = {
1192         .lookup         = bch2_lookup,
1193         .create         = bch2_create,
1194         .link           = bch2_link,
1195         .unlink         = bch2_unlink,
1196         .symlink        = bch2_symlink,
1197         .mkdir          = bch2_mkdir,
1198         .rmdir          = bch2_unlink,
1199         .mknod          = bch2_mknod,
1200         .rename         = bch2_rename2,
1201         .getattr        = bch2_getattr,
1202         .setattr        = bch2_setattr,
1203         .tmpfile        = bch2_tmpfile,
1204         .listxattr      = bch2_xattr_list,
1205 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1206         .get_acl        = bch2_get_acl,
1207         .set_acl        = bch2_set_acl,
1208 #endif
1209 };
1210
1211 static const struct file_operations bch_dir_file_operations = {
1212         .llseek         = bch2_dir_llseek,
1213         .read           = generic_read_dir,
1214         .iterate_shared = bch2_vfs_readdir,
1215         .fsync          = bch2_fsync,
1216         .unlocked_ioctl = bch2_fs_file_ioctl,
1217 #ifdef CONFIG_COMPAT
1218         .compat_ioctl   = bch2_compat_fs_ioctl,
1219 #endif
1220 };
1221
1222 static const struct inode_operations bch_symlink_inode_operations = {
1223         .get_link       = page_get_link,
1224         .getattr        = bch2_getattr,
1225         .setattr        = bch2_setattr,
1226         .listxattr      = bch2_xattr_list,
1227 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1228         .get_acl        = bch2_get_acl,
1229         .set_acl        = bch2_set_acl,
1230 #endif
1231 };
1232
1233 static const struct inode_operations bch_special_inode_operations = {
1234         .getattr        = bch2_getattr,
1235         .setattr        = bch2_setattr,
1236         .listxattr      = bch2_xattr_list,
1237 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1238         .get_acl        = bch2_get_acl,
1239         .set_acl        = bch2_set_acl,
1240 #endif
1241 };
1242
1243 static const struct address_space_operations bch_address_space_operations = {
1244         .read_folio     = bch2_read_folio,
1245         .writepages     = bch2_writepages,
1246         .readahead      = bch2_readahead,
1247         .dirty_folio    = filemap_dirty_folio,
1248         .write_begin    = bch2_write_begin,
1249         .write_end      = bch2_write_end,
1250         .invalidate_folio = bch2_invalidate_folio,
1251         .release_folio  = bch2_release_folio,
1252 #ifdef CONFIG_MIGRATION
1253         .migrate_folio  = filemap_migrate_folio,
1254 #endif
1255         .error_remove_folio = generic_error_remove_folio,
1256 };
1257
1258 struct bcachefs_fid {
1259         u64             inum;
1260         u32             subvol;
1261         u32             gen;
1262 } __packed;
1263
1264 struct bcachefs_fid_with_parent {
1265         struct bcachefs_fid     fid;
1266         struct bcachefs_fid     dir;
1267 } __packed;
1268
1269 static int bcachefs_fid_valid(int fh_len, int fh_type)
1270 {
1271         switch (fh_type) {
1272         case FILEID_BCACHEFS_WITHOUT_PARENT:
1273                 return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1274         case FILEID_BCACHEFS_WITH_PARENT:
1275                 return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1276         default:
1277                 return false;
1278         }
1279 }
1280
1281 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1282 {
1283         return (struct bcachefs_fid) {
1284                 .inum   = inode->ei_inode.bi_inum,
1285                 .subvol = inode->ei_subvol,
1286                 .gen    = inode->ei_inode.bi_generation,
1287         };
1288 }
1289
1290 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1291                           struct inode *vdir)
1292 {
1293         struct bch_inode_info *inode    = to_bch_ei(vinode);
1294         struct bch_inode_info *dir      = to_bch_ei(vdir);
1295         int min_len;
1296
1297         if (!S_ISDIR(inode->v.i_mode) && dir) {
1298                 struct bcachefs_fid_with_parent *fid = (void *) fh;
1299
1300                 min_len = sizeof(*fid) / sizeof(u32);
1301                 if (*len < min_len) {
1302                         *len = min_len;
1303                         return FILEID_INVALID;
1304                 }
1305
1306                 fid->fid = bch2_inode_to_fid(inode);
1307                 fid->dir = bch2_inode_to_fid(dir);
1308
1309                 *len = min_len;
1310                 return FILEID_BCACHEFS_WITH_PARENT;
1311         } else {
1312                 struct bcachefs_fid *fid = (void *) fh;
1313
1314                 min_len = sizeof(*fid) / sizeof(u32);
1315                 if (*len < min_len) {
1316                         *len = min_len;
1317                         return FILEID_INVALID;
1318                 }
1319                 *fid = bch2_inode_to_fid(inode);
1320
1321                 *len = min_len;
1322                 return FILEID_BCACHEFS_WITHOUT_PARENT;
1323         }
1324 }
1325
1326 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1327                                         struct bcachefs_fid fid)
1328 {
1329         struct bch_fs *c = sb->s_fs_info;
1330         struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1331                                     .subvol = fid.subvol,
1332                                     .inum = fid.inum,
1333         });
1334         if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1335                 iput(vinode);
1336                 vinode = ERR_PTR(-ESTALE);
1337         }
1338         return vinode;
1339 }
1340
1341 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1342                 int fh_len, int fh_type)
1343 {
1344         struct bcachefs_fid *fid = (void *) _fid;
1345
1346         if (!bcachefs_fid_valid(fh_len, fh_type))
1347                 return NULL;
1348
1349         return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1350 }
1351
1352 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1353                 int fh_len, int fh_type)
1354 {
1355         struct bcachefs_fid_with_parent *fid = (void *) _fid;
1356
1357         if (!bcachefs_fid_valid(fh_len, fh_type) ||
1358             fh_type != FILEID_BCACHEFS_WITH_PARENT)
1359                 return NULL;
1360
1361         return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1362 }
1363
1364 static struct dentry *bch2_get_parent(struct dentry *child)
1365 {
1366         struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1367         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1368         subvol_inum parent_inum = {
1369                 .subvol = inode->ei_inode.bi_parent_subvol ?:
1370                         inode->ei_subvol,
1371                 .inum = inode->ei_inode.bi_dir,
1372         };
1373
1374         return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1375 }
1376
1377 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1378 {
1379         struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
1380         struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
1381         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1382         struct btree_trans *trans;
1383         struct btree_iter iter1;
1384         struct btree_iter iter2;
1385         struct bkey_s_c k;
1386         struct bkey_s_c_dirent d;
1387         struct bch_inode_unpacked inode_u;
1388         subvol_inum target;
1389         u32 snapshot;
1390         struct qstr dirent_name;
1391         unsigned name_len = 0;
1392         int ret;
1393
1394         if (!S_ISDIR(dir->v.i_mode))
1395                 return -EINVAL;
1396
1397         trans = bch2_trans_get(c);
1398
1399         bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
1400                              POS(dir->ei_inode.bi_inum, 0), 0);
1401         bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
1402                              POS(dir->ei_inode.bi_inum, 0), 0);
1403 retry:
1404         bch2_trans_begin(trans);
1405
1406         ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
1407         if (ret)
1408                 goto err;
1409
1410         bch2_btree_iter_set_snapshot(&iter1, snapshot);
1411         bch2_btree_iter_set_snapshot(&iter2, snapshot);
1412
1413         ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
1414         if (ret)
1415                 goto err;
1416
1417         if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1418                 bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1419
1420                 k = bch2_btree_iter_peek_slot(&iter1);
1421                 ret = bkey_err(k);
1422                 if (ret)
1423                         goto err;
1424
1425                 if (k.k->type != KEY_TYPE_dirent) {
1426                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1427                         goto err;
1428                 }
1429
1430                 d = bkey_s_c_to_dirent(k);
1431                 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1432                 if (ret > 0)
1433                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1434                 if (ret)
1435                         goto err;
1436
1437                 if (target.subvol       == inode->ei_subvol &&
1438                     target.inum         == inode->ei_inode.bi_inum)
1439                         goto found;
1440         } else {
1441                 /*
1442                  * File with multiple hardlinks and our backref is to the wrong
1443                  * directory - linear search:
1444                  */
1445                 for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1446                         if (k.k->p.inode > dir->ei_inode.bi_inum)
1447                                 break;
1448
1449                         if (k.k->type != KEY_TYPE_dirent)
1450                                 continue;
1451
1452                         d = bkey_s_c_to_dirent(k);
1453                         ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1454                         if (ret < 0)
1455                                 break;
1456                         if (ret)
1457                                 continue;
1458
1459                         if (target.subvol       == inode->ei_subvol &&
1460                             target.inum         == inode->ei_inode.bi_inum)
1461                                 goto found;
1462                 }
1463         }
1464
1465         ret = -ENOENT;
1466         goto err;
1467 found:
1468         dirent_name = bch2_dirent_get_name(d);
1469
1470         name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
1471         memcpy(name, dirent_name.name, name_len);
1472         name[name_len] = '\0';
1473 err:
1474         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1475                 goto retry;
1476
1477         bch2_trans_iter_exit(trans, &iter1);
1478         bch2_trans_iter_exit(trans, &iter2);
1479         bch2_trans_put(trans);
1480
1481         return ret;
1482 }
1483
1484 static const struct export_operations bch_export_ops = {
1485         .encode_fh      = bch2_encode_fh,
1486         .fh_to_dentry   = bch2_fh_to_dentry,
1487         .fh_to_parent   = bch2_fh_to_parent,
1488         .get_parent     = bch2_get_parent,
1489         .get_name       = bch2_get_name,
1490 };
1491
1492 static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
1493                                 struct bch_inode_info *inode,
1494                                 struct bch_inode_unpacked *bi,
1495                                 struct bch_subvolume *subvol)
1496 {
1497         bch2_iget5_set(&inode->v, &inum);
1498         bch2_inode_update_after_write(trans, inode, bi, ~0);
1499
1500         inode->v.i_blocks       = bi->bi_sectors;
1501         inode->v.i_ino          = bi->bi_inum;
1502         inode->v.i_rdev         = bi->bi_dev;
1503         inode->v.i_generation   = bi->bi_generation;
1504         inode->v.i_size         = bi->bi_size;
1505
1506         inode->ei_flags         = 0;
1507         inode->ei_quota_reserved = 0;
1508         inode->ei_qid           = bch_qid(bi);
1509         inode->ei_subvol        = inum.subvol;
1510
1511         if (BCH_SUBVOLUME_SNAP(subvol))
1512                 set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1513
1514         inode->v.i_mapping->a_ops = &bch_address_space_operations;
1515
1516         switch (inode->v.i_mode & S_IFMT) {
1517         case S_IFREG:
1518                 inode->v.i_op   = &bch_file_inode_operations;
1519                 inode->v.i_fop  = &bch_file_operations;
1520                 break;
1521         case S_IFDIR:
1522                 inode->v.i_op   = &bch_dir_inode_operations;
1523                 inode->v.i_fop  = &bch_dir_file_operations;
1524                 break;
1525         case S_IFLNK:
1526                 inode_nohighmem(&inode->v);
1527                 inode->v.i_op   = &bch_symlink_inode_operations;
1528                 break;
1529         default:
1530                 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1531                 inode->v.i_op   = &bch_special_inode_operations;
1532                 break;
1533         }
1534
1535         mapping_set_large_folios(inode->v.i_mapping);
1536 }
1537
1538 static void bch2_free_inode(struct inode *vinode)
1539 {
1540         kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
1541 }
1542
1543 static int inode_update_times_fn(struct btree_trans *trans,
1544                                  struct bch_inode_info *inode,
1545                                  struct bch_inode_unpacked *bi,
1546                                  void *p)
1547 {
1548         struct bch_fs *c = inode->v.i_sb->s_fs_info;
1549
1550         bi->bi_atime    = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
1551         bi->bi_mtime    = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
1552         bi->bi_ctime    = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
1553
1554         return 0;
1555 }
1556
1557 static int bch2_vfs_write_inode(struct inode *vinode,
1558                                 struct writeback_control *wbc)
1559 {
1560         struct bch_fs *c = vinode->i_sb->s_fs_info;
1561         struct bch_inode_info *inode = to_bch_ei(vinode);
1562         int ret;
1563
1564         mutex_lock(&inode->ei_update_lock);
1565         ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1566                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1567         mutex_unlock(&inode->ei_update_lock);
1568
1569         return bch2_err_class(ret);
1570 }
1571
1572 static void bch2_evict_inode(struct inode *vinode)
1573 {
1574         struct bch_fs *c = vinode->i_sb->s_fs_info;
1575         struct bch_inode_info *inode = to_bch_ei(vinode);
1576
1577         truncate_inode_pages_final(&inode->v.i_data);
1578
1579         clear_inode(&inode->v);
1580
1581         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1582
1583         if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1584                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1585                                 KEY_TYPE_QUOTA_WARN);
1586                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1587                                 KEY_TYPE_QUOTA_WARN);
1588                 bch2_inode_rm(c, inode_inum(inode));
1589         }
1590
1591         mutex_lock(&c->vfs_inodes_lock);
1592         list_del_init(&inode->ei_vfs_inode_list);
1593         mutex_unlock(&c->vfs_inodes_lock);
1594 }
1595
1596 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
1597 {
1598         struct bch_inode_info *inode;
1599         DARRAY(struct bch_inode_info *) grabbed;
1600         bool clean_pass = false, this_pass_clean;
1601
1602         /*
1603          * Initially, we scan for inodes without I_DONTCACHE, then mark them to
1604          * be pruned with d_mark_dontcache().
1605          *
1606          * Once we've had a clean pass where we didn't find any inodes without
1607          * I_DONTCACHE, we wait for them to be freed:
1608          */
1609
1610         darray_init(&grabbed);
1611         darray_make_room(&grabbed, 1024);
1612 again:
1613         cond_resched();
1614         this_pass_clean = true;
1615
1616         mutex_lock(&c->vfs_inodes_lock);
1617         list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
1618                 if (!snapshot_list_has_id(s, inode->ei_subvol))
1619                         continue;
1620
1621                 if (!(inode->v.i_state & I_DONTCACHE) &&
1622                     !(inode->v.i_state & I_FREEING) &&
1623                     igrab(&inode->v)) {
1624                         this_pass_clean = false;
1625
1626                         if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
1627                                 iput(&inode->v);
1628                                 break;
1629                         }
1630                 } else if (clean_pass && this_pass_clean) {
1631                         wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
1632                         DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
1633
1634                         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
1635                         mutex_unlock(&c->vfs_inodes_lock);
1636
1637                         schedule();
1638                         finish_wait(wq, &wait.wq_entry);
1639                         goto again;
1640                 }
1641         }
1642         mutex_unlock(&c->vfs_inodes_lock);
1643
1644         darray_for_each(grabbed, i) {
1645                 inode = *i;
1646                 d_mark_dontcache(&inode->v);
1647                 d_prune_aliases(&inode->v);
1648                 iput(&inode->v);
1649         }
1650         grabbed.nr = 0;
1651
1652         if (!clean_pass || !this_pass_clean) {
1653                 clean_pass = this_pass_clean;
1654                 goto again;
1655         }
1656
1657         darray_exit(&grabbed);
1658 }
1659
1660 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1661 {
1662         struct super_block *sb = dentry->d_sb;
1663         struct bch_fs *c = sb->s_fs_info;
1664         struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1665         unsigned shift = sb->s_blocksize_bits - 9;
1666         /*
1667          * this assumes inodes take up 64 bytes, which is a decent average
1668          * number:
1669          */
1670         u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1671
1672         buf->f_type     = BCACHEFS_STATFS_MAGIC;
1673         buf->f_bsize    = sb->s_blocksize;
1674         buf->f_blocks   = usage.capacity >> shift;
1675         buf->f_bfree    = usage.free >> shift;
1676         buf->f_bavail   = avail_factor(usage.free) >> shift;
1677
1678         buf->f_files    = usage.nr_inodes + avail_inodes;
1679         buf->f_ffree    = avail_inodes;
1680
1681         buf->f_fsid     = uuid_to_fsid(c->sb.user_uuid.b);
1682         buf->f_namelen  = BCH_NAME_MAX;
1683
1684         return 0;
1685 }
1686
1687 static int bch2_sync_fs(struct super_block *sb, int wait)
1688 {
1689         struct bch_fs *c = sb->s_fs_info;
1690         int ret;
1691
1692         if (c->opts.journal_flush_disabled)
1693                 return 0;
1694
1695         if (!wait) {
1696                 bch2_journal_flush_async(&c->journal, NULL);
1697                 return 0;
1698         }
1699
1700         ret = bch2_journal_flush(&c->journal);
1701         return bch2_err_class(ret);
1702 }
1703
1704 static struct bch_fs *bch2_path_to_fs(const char *path)
1705 {
1706         struct bch_fs *c;
1707         dev_t dev;
1708         int ret;
1709
1710         ret = lookup_bdev(path, &dev);
1711         if (ret)
1712                 return ERR_PTR(ret);
1713
1714         c = bch2_dev_to_fs(dev);
1715         if (c)
1716                 closure_put(&c->cl);
1717         return c ?: ERR_PTR(-ENOENT);
1718 }
1719
1720 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1721 {
1722         struct bch_fs *c = sb->s_fs_info;
1723         struct bch_opts opts = bch2_opts_empty();
1724         int ret;
1725
1726         ret = bch2_parse_mount_opts(c, &opts, data);
1727         if (ret)
1728                 goto err;
1729
1730         opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1731
1732         if (opts.read_only != c->opts.read_only) {
1733                 down_write(&c->state_lock);
1734
1735                 if (opts.read_only) {
1736                         bch2_fs_read_only(c);
1737
1738                         sb->s_flags |= SB_RDONLY;
1739                 } else {
1740                         ret = bch2_fs_read_write(c);
1741                         if (ret) {
1742                                 bch_err(c, "error going rw: %i", ret);
1743                                 up_write(&c->state_lock);
1744                                 ret = -EINVAL;
1745                                 goto err;
1746                         }
1747
1748                         sb->s_flags &= ~SB_RDONLY;
1749                 }
1750
1751                 c->opts.read_only = opts.read_only;
1752
1753                 up_write(&c->state_lock);
1754         }
1755
1756         if (opt_defined(opts, errors))
1757                 c->opts.errors = opts.errors;
1758 err:
1759         return bch2_err_class(ret);
1760 }
1761
1762 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1763 {
1764         struct bch_fs *c = root->d_sb->s_fs_info;
1765         bool first = true;
1766
1767         for_each_online_member(c, ca) {
1768                 if (!first)
1769                         seq_putc(seq, ':');
1770                 first = false;
1771                 seq_puts(seq, ca->disk_sb.sb_name);
1772         }
1773
1774         return 0;
1775 }
1776
1777 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1778 {
1779         struct bch_fs *c = root->d_sb->s_fs_info;
1780         enum bch_opt_id i;
1781         struct printbuf buf = PRINTBUF;
1782         int ret = 0;
1783
1784         for (i = 0; i < bch2_opts_nr; i++) {
1785                 const struct bch_option *opt = &bch2_opt_table[i];
1786                 u64 v = bch2_opt_get_by_id(&c->opts, i);
1787
1788                 if (!(opt->flags & OPT_MOUNT))
1789                         continue;
1790
1791                 if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1792                         continue;
1793
1794                 printbuf_reset(&buf);
1795                 bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
1796                                  OPT_SHOW_MOUNT_STYLE);
1797                 seq_putc(seq, ',');
1798                 seq_puts(seq, buf.buf);
1799         }
1800
1801         if (buf.allocation_failure)
1802                 ret = -ENOMEM;
1803         printbuf_exit(&buf);
1804         return ret;
1805 }
1806
1807 static void bch2_put_super(struct super_block *sb)
1808 {
1809         struct bch_fs *c = sb->s_fs_info;
1810
1811         __bch2_fs_stop(c);
1812 }
1813
1814 /*
1815  * bcachefs doesn't currently integrate intwrite freeze protection but the
1816  * internal write references serve the same purpose. Therefore reuse the
1817  * read-only transition code to perform the quiesce. The caveat is that we don't
1818  * currently have the ability to block tasks that want a write reference while
1819  * the superblock is frozen. This is fine for now, but we should either add
1820  * blocking support or find a way to integrate sb_start_intwrite() and friends.
1821  */
1822 static int bch2_freeze(struct super_block *sb)
1823 {
1824         struct bch_fs *c = sb->s_fs_info;
1825
1826         down_write(&c->state_lock);
1827         bch2_fs_read_only(c);
1828         up_write(&c->state_lock);
1829         return 0;
1830 }
1831
1832 static int bch2_unfreeze(struct super_block *sb)
1833 {
1834         struct bch_fs *c = sb->s_fs_info;
1835         int ret;
1836
1837         if (test_bit(BCH_FS_emergency_ro, &c->flags))
1838                 return 0;
1839
1840         down_write(&c->state_lock);
1841         ret = bch2_fs_read_write(c);
1842         up_write(&c->state_lock);
1843         return ret;
1844 }
1845
1846 static const struct super_operations bch_super_operations = {
1847         .alloc_inode    = bch2_alloc_inode,
1848         .free_inode     = bch2_free_inode,
1849         .write_inode    = bch2_vfs_write_inode,
1850         .evict_inode    = bch2_evict_inode,
1851         .sync_fs        = bch2_sync_fs,
1852         .statfs         = bch2_statfs,
1853         .show_devname   = bch2_show_devname,
1854         .show_options   = bch2_show_options,
1855         .remount_fs     = bch2_remount,
1856         .put_super      = bch2_put_super,
1857         .freeze_fs      = bch2_freeze,
1858         .unfreeze_fs    = bch2_unfreeze,
1859 };
1860
1861 static int bch2_set_super(struct super_block *s, void *data)
1862 {
1863         s->s_fs_info = data;
1864         return 0;
1865 }
1866
1867 static int bch2_noset_super(struct super_block *s, void *data)
1868 {
1869         return -EBUSY;
1870 }
1871
1872 typedef DARRAY(struct bch_fs *) darray_fs;
1873
1874 static int bch2_test_super(struct super_block *s, void *data)
1875 {
1876         struct bch_fs *c = s->s_fs_info;
1877         darray_fs *d = data;
1878
1879         if (!c)
1880                 return false;
1881
1882         darray_for_each(*d, i)
1883                 if (c != *i)
1884                         return false;
1885         return true;
1886 }
1887
1888 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1889                                  int flags, const char *dev_name, void *data)
1890 {
1891         struct bch_fs *c;
1892         struct super_block *sb;
1893         struct inode *vinode;
1894         struct bch_opts opts = bch2_opts_empty();
1895         int ret;
1896
1897         opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
1898
1899         ret = bch2_parse_mount_opts(NULL, &opts, data);
1900         if (ret) {
1901                 ret = bch2_err_class(ret);
1902                 return ERR_PTR(ret);
1903         }
1904
1905         if (!dev_name || strlen(dev_name) == 0)
1906                 return ERR_PTR(-EINVAL);
1907
1908         darray_str devs;
1909         ret = bch2_split_devs(dev_name, &devs);
1910         if (ret)
1911                 return ERR_PTR(ret);
1912
1913         darray_fs devs_to_fs = {};
1914         darray_for_each(devs, i) {
1915                 ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
1916                 if (ret) {
1917                         sb = ERR_PTR(ret);
1918                         goto got_sb;
1919                 }
1920         }
1921
1922         sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
1923         if (!IS_ERR(sb))
1924                 goto got_sb;
1925
1926         c = bch2_fs_open(devs.data, devs.nr, opts);
1927         if (IS_ERR(c)) {
1928                 sb = ERR_CAST(c);
1929                 goto got_sb;
1930         }
1931
1932         /* Some options can't be parsed until after the fs is started: */
1933         ret = bch2_parse_mount_opts(c, &opts, data);
1934         if (ret) {
1935                 bch2_fs_stop(c);
1936                 sb = ERR_PTR(ret);
1937                 goto got_sb;
1938         }
1939
1940         bch2_opts_apply(&c->opts, opts);
1941
1942         sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
1943         if (IS_ERR(sb))
1944                 bch2_fs_stop(c);
1945 got_sb:
1946         darray_exit(&devs_to_fs);
1947         bch2_darray_str_exit(&devs);
1948
1949         if (IS_ERR(sb)) {
1950                 ret = PTR_ERR(sb);
1951                 goto err;
1952         }
1953
1954         c = sb->s_fs_info;
1955
1956         if (sb->s_root) {
1957                 if ((flags ^ sb->s_flags) & SB_RDONLY) {
1958                         ret = -EBUSY;
1959                         goto err_put_super;
1960                 }
1961                 goto out;
1962         }
1963
1964         sb->s_blocksize         = block_bytes(c);
1965         sb->s_blocksize_bits    = ilog2(block_bytes(c));
1966         sb->s_maxbytes          = MAX_LFS_FILESIZE;
1967         sb->s_op                = &bch_super_operations;
1968         sb->s_export_op         = &bch_export_ops;
1969 #ifdef CONFIG_BCACHEFS_QUOTA
1970         sb->s_qcop              = &bch2_quotactl_operations;
1971         sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1972 #endif
1973         sb->s_xattr             = bch2_xattr_handlers;
1974         sb->s_magic             = BCACHEFS_STATFS_MAGIC;
1975         sb->s_time_gran         = c->sb.nsec_per_time_unit;
1976         sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
1977         sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
1978         sb->s_uuid              = c->sb.user_uuid;
1979         sb->s_shrink->seeks     = 0;
1980         c->vfs_sb               = sb;
1981         strscpy(sb->s_id, c->name, sizeof(sb->s_id));
1982
1983         ret = super_setup_bdi(sb);
1984         if (ret)
1985                 goto err_put_super;
1986
1987         sb->s_bdi->ra_pages             = VM_READAHEAD_PAGES;
1988
1989         for_each_online_member(c, ca) {
1990                 struct block_device *bdev = ca->disk_sb.bdev;
1991
1992                 /* XXX: create an anonymous device for multi device filesystems */
1993                 sb->s_bdev      = bdev;
1994                 sb->s_dev       = bdev->bd_dev;
1995                 percpu_ref_put(&ca->io_ref);
1996                 break;
1997         }
1998
1999         c->dev = sb->s_dev;
2000
2001 #ifdef CONFIG_BCACHEFS_POSIX_ACL
2002         if (c->opts.acl)
2003                 sb->s_flags     |= SB_POSIXACL;
2004 #endif
2005
2006         sb->s_shrink->seeks = 0;
2007
2008         vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
2009         ret = PTR_ERR_OR_ZERO(vinode);
2010         bch_err_msg(c, ret, "mounting: error getting root inode");
2011         if (ret)
2012                 goto err_put_super;
2013
2014         sb->s_root = d_make_root(vinode);
2015         if (!sb->s_root) {
2016                 bch_err(c, "error mounting: error allocating root dentry");
2017                 ret = -ENOMEM;
2018                 goto err_put_super;
2019         }
2020
2021         sb->s_flags |= SB_ACTIVE;
2022 out:
2023         return dget(sb->s_root);
2024
2025 err_put_super:
2026         __bch2_fs_stop(c);
2027         deactivate_locked_super(sb);
2028 err:
2029         /*
2030          * On an inconsistency error in recovery we might see an -EROFS derived
2031          * errorcode (from the journal), but we don't want to return that to
2032          * userspace as that causes util-linux to retry the mount RO - which is
2033          * confusing:
2034          */
2035         if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
2036                 ret = -EIO;
2037         return ERR_PTR(bch2_err_class(ret));
2038 }
2039
2040 static void bch2_kill_sb(struct super_block *sb)
2041 {
2042         struct bch_fs *c = sb->s_fs_info;
2043
2044         generic_shutdown_super(sb);
2045         bch2_fs_free(c);
2046 }
2047
2048 static struct file_system_type bcache_fs_type = {
2049         .owner          = THIS_MODULE,
2050         .name           = "bcachefs",
2051         .mount          = bch2_mount,
2052         .kill_sb        = bch2_kill_sb,
2053         .fs_flags       = FS_REQUIRES_DEV,
2054 };
2055
2056 MODULE_ALIAS_FS("bcachefs");
2057
2058 void bch2_vfs_exit(void)
2059 {
2060         unregister_filesystem(&bcache_fs_type);
2061         kmem_cache_destroy(bch2_inode_cache);
2062 }
2063
2064 int __init bch2_vfs_init(void)
2065 {
2066         int ret = -ENOMEM;
2067
2068         bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
2069         if (!bch2_inode_cache)
2070                 goto err;
2071
2072         ret = register_filesystem(&bcache_fs_type);
2073         if (ret)
2074                 goto err;
2075
2076         return 0;
2077 err:
2078         bch2_vfs_exit();
2079         return ret;
2080 }
2081
2082 #endif /* NO_BCACHEFS_FS */
This page took 0.148778 seconds and 4 git commands to generate.