]> Git Repo - linux.git/blob - fs/ceph/super.c
ceph: fix memory leaks in __ceph_sync_read()
[linux.git] / fs / ceph / super.c
1 // SPDX-License-Identifier: GPL-2.0-only
2
3 #include <linux/ceph/ceph_debug.h>
4
5 #include <linux/backing-dev.h>
6 #include <linux/ctype.h>
7 #include <linux/fs.h>
8 #include <linux/inet.h>
9 #include <linux/in6.h>
10 #include <linux/module.h>
11 #include <linux/mount.h>
12 #include <linux/fs_context.h>
13 #include <linux/fs_parser.h>
14 #include <linux/sched.h>
15 #include <linux/seq_file.h>
16 #include <linux/slab.h>
17 #include <linux/statfs.h>
18 #include <linux/string.h>
19
20 #include "super.h"
21 #include "mds_client.h"
22 #include "cache.h"
23 #include "crypto.h"
24
25 #include <linux/ceph/ceph_features.h>
26 #include <linux/ceph/decode.h>
27 #include <linux/ceph/mon_client.h>
28 #include <linux/ceph/auth.h>
29 #include <linux/ceph/debugfs.h>
30
31 #include <uapi/linux/magic.h>
32
33 static DEFINE_SPINLOCK(ceph_fsc_lock);
34 static LIST_HEAD(ceph_fsc_list);
35
36 /*
37  * Ceph superblock operations
38  *
39  * Handle the basics of mounting, unmounting.
40  */
41
42 /*
43  * super ops
44  */
45 static void ceph_put_super(struct super_block *s)
46 {
47         struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
48
49         doutc(fsc->client, "begin\n");
50         ceph_fscrypt_free_dummy_policy(fsc);
51         ceph_mdsc_close_sessions(fsc->mdsc);
52         doutc(fsc->client, "done\n");
53 }
54
55 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
56 {
57         struct ceph_fs_client *fsc = ceph_inode_to_fs_client(d_inode(dentry));
58         struct ceph_mon_client *monc = &fsc->client->monc;
59         struct ceph_statfs st;
60         int i, err;
61         u64 data_pool;
62
63         doutc(fsc->client, "begin\n");
64         if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
65                 data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];
66         } else {
67                 data_pool = CEPH_NOPOOL;
68         }
69
70         err = ceph_monc_do_statfs(monc, data_pool, &st);
71         if (err < 0)
72                 return err;
73
74         /* fill in kstatfs */
75         buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
76
77         /*
78          * Express utilization in terms of large blocks to avoid
79          * overflow on 32-bit machines.
80          */
81         buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
82
83         /*
84          * By default use root quota for stats; fallback to overall filesystem
85          * usage if using 'noquotadf' mount option or if the root dir doesn't
86          * have max_bytes quota set.
87          */
88         if (ceph_test_mount_opt(fsc, NOQUOTADF) ||
89             !ceph_quota_update_statfs(fsc, buf)) {
90                 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
91                 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
92                 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
93         }
94
95         /*
96          * NOTE: for the time being, we make bsize == frsize to humor
97          * not-yet-ancient versions of glibc that are broken.
98          * Someday, we will probably want to report a real block
99          * size...  whatever that may mean for a network file system!
100          */
101         buf->f_bsize = buf->f_frsize;
102
103         buf->f_files = le64_to_cpu(st.num_objects);
104         buf->f_ffree = -1;
105         buf->f_namelen = NAME_MAX;
106
107         /* Must convert the fsid, for consistent values across arches */
108         buf->f_fsid.val[0] = 0;
109         mutex_lock(&monc->mutex);
110         for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i)
111                 buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]);
112         mutex_unlock(&monc->mutex);
113
114         /* fold the fs_cluster_id into the upper bits */
115         buf->f_fsid.val[1] = monc->fs_cluster_id;
116
117         doutc(fsc->client, "done\n");
118         return 0;
119 }
120
121 static int ceph_sync_fs(struct super_block *sb, int wait)
122 {
123         struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
124         struct ceph_client *cl = fsc->client;
125
126         if (!wait) {
127                 doutc(cl, "(non-blocking)\n");
128                 ceph_flush_dirty_caps(fsc->mdsc);
129                 ceph_flush_cap_releases(fsc->mdsc);
130                 doutc(cl, "(non-blocking) done\n");
131                 return 0;
132         }
133
134         doutc(cl, "(blocking)\n");
135         ceph_osdc_sync(&fsc->client->osdc);
136         ceph_mdsc_sync(fsc->mdsc);
137         doutc(cl, "(blocking) done\n");
138         return 0;
139 }
140
141 /*
142  * mount options
143  */
/*
 * Tokens for the entries of ceph_mount_parameters[], grouped by the
 * kind of argument each option takes.
 */
enum {
	/* integer arguments */
	Opt_wsize = 0,
	Opt_rsize,
	Opt_rasize,
	Opt_caps_wanted_delay_min,
	Opt_caps_wanted_delay_max,
	Opt_caps_max,
	Opt_readdir_max_entries,
	Opt_readdir_max_bytes,
	Opt_congestion_kb,

	/* string arguments */
	Opt_snapdirname,
	Opt_mds_namespace,
	Opt_recover_session,
	Opt_source,
	Opt_mon_addr,
	Opt_test_dummy_encryption,

	/* remaining options */
	Opt_dirstat,
	Opt_rbytes,
	Opt_asyncreaddir,
	Opt_dcache,
	Opt_ino32,
	Opt_fscache,
	Opt_poolperm,
	Opt_require_active_mds,
	Opt_acl,
	Opt_quotadf,
	Opt_copyfrom,
	Opt_wsync,
	Opt_pagecache,
	Opt_sparseread,
};
177
/* Values accepted by the "recover_session" mount option. */
enum ceph_recover_session_mode {
	ceph_recover_session_no = 0,	/* recover_session=no */
	ceph_recover_session_clean,	/* recover_session=clean */
};
182
183 static const struct constant_table ceph_param_recover[] = {
184         { "no",         ceph_recover_session_no },
185         { "clean",      ceph_recover_session_clean },
186         {}
187 };
188
189 static const struct fs_parameter_spec ceph_mount_parameters[] = {
190         fsparam_flag_no ("acl",                         Opt_acl),
191         fsparam_flag_no ("asyncreaddir",                Opt_asyncreaddir),
192         fsparam_s32     ("caps_max",                    Opt_caps_max),
193         fsparam_u32     ("caps_wanted_delay_max",       Opt_caps_wanted_delay_max),
194         fsparam_u32     ("caps_wanted_delay_min",       Opt_caps_wanted_delay_min),
195         fsparam_u32     ("write_congestion_kb",         Opt_congestion_kb),
196         fsparam_flag_no ("copyfrom",                    Opt_copyfrom),
197         fsparam_flag_no ("dcache",                      Opt_dcache),
198         fsparam_flag_no ("dirstat",                     Opt_dirstat),
199         fsparam_flag_no ("fsc",                         Opt_fscache), // fsc|nofsc
200         fsparam_string  ("fsc",                         Opt_fscache), // fsc=...
201         fsparam_flag_no ("ino32",                       Opt_ino32),
202         fsparam_string  ("mds_namespace",               Opt_mds_namespace),
203         fsparam_string  ("mon_addr",                    Opt_mon_addr),
204         fsparam_flag_no ("poolperm",                    Opt_poolperm),
205         fsparam_flag_no ("quotadf",                     Opt_quotadf),
206         fsparam_u32     ("rasize",                      Opt_rasize),
207         fsparam_flag_no ("rbytes",                      Opt_rbytes),
208         fsparam_u32     ("readdir_max_bytes",           Opt_readdir_max_bytes),
209         fsparam_u32     ("readdir_max_entries",         Opt_readdir_max_entries),
210         fsparam_enum    ("recover_session",             Opt_recover_session, ceph_param_recover),
211         fsparam_flag_no ("require_active_mds",          Opt_require_active_mds),
212         fsparam_u32     ("rsize",                       Opt_rsize),
213         fsparam_string  ("snapdirname",                 Opt_snapdirname),
214         fsparam_string  ("source",                      Opt_source),
215         fsparam_flag    ("test_dummy_encryption",       Opt_test_dummy_encryption),
216         fsparam_string  ("test_dummy_encryption",       Opt_test_dummy_encryption),
217         fsparam_u32     ("wsize",                       Opt_wsize),
218         fsparam_flag_no ("wsync",                       Opt_wsync),
219         fsparam_flag_no ("pagecache",                   Opt_pagecache),
220         fsparam_flag_no ("sparseread",                  Opt_sparseread),
221         {}
222 };
223
/*
 * Parsing state reached through fc->fs_private while mount parameters
 * are being processed.
 */
struct ceph_parse_opts_ctx {
	/* options passed to ceph_parse_param() and ceph_parse_mon_ips() */
	struct ceph_options *copts;
	/* filesystem mount options filled in by the parsers in this file */
	struct ceph_mount_options *opts;
};
228
/*
 * Canonicalize @path in place: collapse every run of '/' into a single
 * '/', then drop a trailing '/' unless it is the only character left.
 *
 * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/".
 */
static void canonicalize_path(char *path)
{
	const char *src;
	char *dst = path;

	for (src = path; *src != '\0'; src++) {
		/* skip a '/' that directly follows one we already kept */
		if (*src == '/' && dst > path && dst[-1] == '/')
			continue;
		*dst++ = *src;
	}

	/* strip the trailing '/', but never turn "/" into "" */
	if (dst - path > 1 && dst[-1] == '/')
		dst--;
	*dst = '\0';
}
248
249 /*
250  * Check if the mds namespace in ceph_mount_options matches
251  * the passed in namespace string. First time match (when
252  * ->mds_namespace is NULL) is treated specially, since
253  * ->mds_namespace needs to be initialized by the caller.
254  */
255 static int namespace_equals(struct ceph_mount_options *fsopt,
256                             const char *namespace, size_t len)
257 {
258         return !(fsopt->mds_namespace &&
259                  (strlen(fsopt->mds_namespace) != len ||
260                   strncmp(fsopt->mds_namespace, namespace, len)));
261 }
262
263 static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end,
264                                  struct fs_context *fc)
265 {
266         int r;
267         struct ceph_parse_opts_ctx *pctx = fc->fs_private;
268         struct ceph_mount_options *fsopt = pctx->opts;
269
270         if (*dev_name_end != ':')
271                 return invalfc(fc, "separator ':' missing in source");
272
273         r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name,
274                                pctx->copts, fc->log.log, ',');
275         if (r)
276                 return r;
277
278         fsopt->new_dev_syntax = false;
279         return 0;
280 }
281
282 static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end,
283                                  struct fs_context *fc)
284 {
285         size_t len;
286         struct ceph_fsid fsid;
287         struct ceph_parse_opts_ctx *pctx = fc->fs_private;
288         struct ceph_options *opts = pctx->copts;
289         struct ceph_mount_options *fsopt = pctx->opts;
290         const char *name_start = dev_name;
291         const char *fsid_start, *fs_name_start;
292
293         if (*dev_name_end != '=') {
294                 dout("separator '=' missing in source");
295                 return -EINVAL;
296         }
297
298         fsid_start = strchr(dev_name, '@');
299         if (!fsid_start)
300                 return invalfc(fc, "missing cluster fsid");
301         len = fsid_start - name_start;
302         kfree(opts->name);
303         opts->name = kstrndup(name_start, len, GFP_KERNEL);
304         if (!opts->name)
305                 return -ENOMEM;
306         dout("using %s entity name", opts->name);
307
308         ++fsid_start; /* start of cluster fsid */
309         fs_name_start = strchr(fsid_start, '.');
310         if (!fs_name_start)
311                 return invalfc(fc, "missing file system name");
312
313         if (ceph_parse_fsid(fsid_start, &fsid))
314                 return invalfc(fc, "Invalid FSID");
315
316         ++fs_name_start; /* start of file system name */
317         len = dev_name_end - fs_name_start;
318
319         if (!namespace_equals(fsopt, fs_name_start, len))
320                 return invalfc(fc, "Mismatching mds_namespace");
321         kfree(fsopt->mds_namespace);
322         fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL);
323         if (!fsopt->mds_namespace)
324                 return -ENOMEM;
325         dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace);
326
327         fsopt->new_dev_syntax = true;
328         return 0;
329 }
330
331 /*
332  * Parse the source parameter for new device format. Distinguish the device
333  * spec from the path. Try parsing new device format and fallback to old
334  * format if needed.
335  *
336  * New device syntax looks like:
337  *     <device_spec>=/<path>
338  * where
339  *     <device_spec> is name@fsid.fsname
340  *     <path> is optional, but if present must begin with '/'
341  * (monitor addresses are passed via mount option)
342  *
343  * Old device syntax is:
344  *     <server_spec>[,<server_spec>...]:[<path>]
345  * where
346  *     <server_spec> is <ip>[:<port>]
347  *     <path> is optional, but if present must begin with '/'
348  */
349 static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
350 {
351         struct ceph_parse_opts_ctx *pctx = fc->fs_private;
352         struct ceph_mount_options *fsopt = pctx->opts;
353         char *dev_name = param->string, *dev_name_end;
354         int ret;
355
356         dout("'%s'\n", dev_name);
357         if (!dev_name || !*dev_name)
358                 return invalfc(fc, "Empty source");
359
360         dev_name_end = strchr(dev_name, '/');
361         if (dev_name_end) {
362                 /*
363                  * The server_path will include the whole chars from userland
364                  * including the leading '/'.
365                  */
366                 kfree(fsopt->server_path);
367                 fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
368                 if (!fsopt->server_path)
369                         return -ENOMEM;
370
371                 canonicalize_path(fsopt->server_path);
372         } else {
373                 dev_name_end = dev_name + strlen(dev_name);
374         }
375
376         dev_name_end--;         /* back up to separator */
377         if (dev_name_end < dev_name)
378                 return invalfc(fc, "Path missing in source");
379
380         dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
381         if (fsopt->server_path)
382                 dout("server path '%s'\n", fsopt->server_path);
383
384         dout("trying new device syntax");
385         ret = ceph_parse_new_source(dev_name, dev_name_end, fc);
386         if (ret) {
387                 if (ret != -EINVAL)
388                         return ret;
389                 dout("trying old device syntax");
390                 ret = ceph_parse_old_source(dev_name, dev_name_end, fc);
391                 if (ret)
392                         return ret;
393         }
394
395         fc->source = param->string;
396         param->string = NULL;
397         return 0;
398 }
399
400 static int ceph_parse_mon_addr(struct fs_parameter *param,
401                                struct fs_context *fc)
402 {
403         struct ceph_parse_opts_ctx *pctx = fc->fs_private;
404         struct ceph_mount_options *fsopt = pctx->opts;
405
406         kfree(fsopt->mon_addr);
407         fsopt->mon_addr = param->string;
408         param->string = NULL;
409
410         return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr),
411                                   pctx->copts, fc->log.log, '/');
412 }
413
414 static int ceph_parse_mount_param(struct fs_context *fc,
415                                   struct fs_parameter *param)
416 {
417         struct ceph_parse_opts_ctx *pctx = fc->fs_private;
418         struct ceph_mount_options *fsopt = pctx->opts;
419         struct fs_parse_result result;
420         unsigned int mode;
421         int token, ret;
422
423         ret = ceph_parse_param(param, pctx->copts, fc->log.log);
424         if (ret != -ENOPARAM)
425                 return ret;
426
427         token = fs_parse(fc, ceph_mount_parameters, param, &result);
428         dout("%s: fs_parse '%s' token %d\n",__func__, param->key, token);
429         if (token < 0)
430                 return token;
431
432         switch (token) {
433         case Opt_snapdirname:
434                 kfree(fsopt->snapdir_name);
435                 fsopt->snapdir_name = param->string;
436                 param->string = NULL;
437                 break;
438         case Opt_mds_namespace:
439                 if (!namespace_equals(fsopt, param->string, strlen(param->string)))
440                         return invalfc(fc, "Mismatching mds_namespace");
441                 kfree(fsopt->mds_namespace);
442                 fsopt->mds_namespace = param->string;
443                 param->string = NULL;
444                 break;
445         case Opt_recover_session:
446                 mode = result.uint_32;
447                 if (mode == ceph_recover_session_no)
448                         fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
449                 else if (mode == ceph_recover_session_clean)
450                         fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
451                 else
452                         BUG();
453                 break;
454         case Opt_source:
455                 if (fc->source)
456                         return invalfc(fc, "Multiple sources specified");
457                 return ceph_parse_source(param, fc);
458         case Opt_mon_addr:
459                 return ceph_parse_mon_addr(param, fc);
460         case Opt_wsize:
461                 if (result.uint_32 < PAGE_SIZE ||
462                     result.uint_32 > CEPH_MAX_WRITE_SIZE)
463                         goto out_of_range;
464                 fsopt->wsize = ALIGN(result.uint_32, PAGE_SIZE);
465                 break;
466         case Opt_rsize:
467                 if (result.uint_32 < PAGE_SIZE ||
468                     result.uint_32 > CEPH_MAX_READ_SIZE)
469                         goto out_of_range;
470                 fsopt->rsize = ALIGN(result.uint_32, PAGE_SIZE);
471                 break;
472         case Opt_rasize:
473                 fsopt->rasize = ALIGN(result.uint_32, PAGE_SIZE);
474                 break;
475         case Opt_caps_wanted_delay_min:
476                 if (result.uint_32 < 1)
477                         goto out_of_range;
478                 fsopt->caps_wanted_delay_min = result.uint_32;
479                 break;
480         case Opt_caps_wanted_delay_max:
481                 if (result.uint_32 < 1)
482                         goto out_of_range;
483                 fsopt->caps_wanted_delay_max = result.uint_32;
484                 break;
485         case Opt_caps_max:
486                 if (result.int_32 < 0)
487                         goto out_of_range;
488                 fsopt->caps_max = result.int_32;
489                 break;
490         case Opt_readdir_max_entries:
491                 if (result.uint_32 < 1)
492                         goto out_of_range;
493                 fsopt->max_readdir = result.uint_32;
494                 break;
495         case Opt_readdir_max_bytes:
496                 if (result.uint_32 < PAGE_SIZE && result.uint_32 != 0)
497                         goto out_of_range;
498                 fsopt->max_readdir_bytes = result.uint_32;
499                 break;
500         case Opt_congestion_kb:
501                 if (result.uint_32 < 1024) /* at least 1M */
502                         goto out_of_range;
503                 fsopt->congestion_kb = result.uint_32;
504                 break;
505         case Opt_dirstat:
506                 if (!result.negated)
507                         fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
508                 else
509                         fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
510                 break;
511         case Opt_rbytes:
512                 if (!result.negated)
513                         fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
514                 else
515                         fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
516                 break;
517         case Opt_asyncreaddir:
518                 if (!result.negated)
519                         fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
520                 else
521                         fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
522                 break;
523         case Opt_dcache:
524                 if (!result.negated)
525                         fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
526                 else
527                         fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
528                 break;
529         case Opt_ino32:
530                 if (!result.negated)
531                         fsopt->flags |= CEPH_MOUNT_OPT_INO32;
532                 else
533                         fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
534                 break;
535
536         case Opt_fscache:
537 #ifdef CONFIG_CEPH_FSCACHE
538                 kfree(fsopt->fscache_uniq);
539                 fsopt->fscache_uniq = NULL;
540                 if (result.negated) {
541                         fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
542                 } else {
543                         fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
544                         fsopt->fscache_uniq = param->string;
545                         param->string = NULL;
546                 }
547                 break;
548 #else
549                 return invalfc(fc, "fscache support is disabled");
550 #endif
551         case Opt_poolperm:
552                 if (!result.negated)
553                         fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
554                 else
555                         fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
556                 break;
557         case Opt_require_active_mds:
558                 if (!result.negated)
559                         fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT;
560                 else
561                         fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT;
562                 break;
563         case Opt_quotadf:
564                 if (!result.negated)
565                         fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF;
566                 else
567                         fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
568                 break;
569         case Opt_copyfrom:
570                 if (!result.negated)
571                         fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM;
572                 else
573                         fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM;
574                 break;
575         case Opt_acl:
576                 if (!result.negated) {
577 #ifdef CONFIG_CEPH_FS_POSIX_ACL
578                         fc->sb_flags |= SB_POSIXACL;
579 #else
580                         return invalfc(fc, "POSIX ACL support is disabled");
581 #endif
582                 } else {
583                         fc->sb_flags &= ~SB_POSIXACL;
584                 }
585                 break;
586         case Opt_wsync:
587                 if (!result.negated)
588                         fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS;
589                 else
590                         fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
591                 break;
592         case Opt_pagecache:
593                 if (result.negated)
594                         fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
595                 else
596                         fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
597                 break;
598         case Opt_sparseread:
599                 if (result.negated)
600                         fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD;
601                 else
602                         fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD;
603                 break;
604         case Opt_test_dummy_encryption:
605 #ifdef CONFIG_FS_ENCRYPTION
606                 fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy);
607                 ret = fscrypt_parse_test_dummy_encryption(param,
608                                                 &fsopt->dummy_enc_policy);
609                 if (ret == -EINVAL) {
610                         warnfc(fc, "Value of option \"%s\" is unrecognized",
611                                param->key);
612                 } else if (ret == -EEXIST) {
613                         warnfc(fc, "Conflicting test_dummy_encryption options");
614                         ret = -EINVAL;
615                 }
616 #else
617                 warnfc(fc,
618                        "FS encryption not supported: test_dummy_encryption mount option ignored");
619 #endif
620                 break;
621         default:
622                 BUG();
623         }
624         return 0;
625
626 out_of_range:
627         return invalfc(fc, "%s out of range", param->key);
628 }
629
630 static void destroy_mount_options(struct ceph_mount_options *args)
631 {
632         dout("destroy_mount_options %p\n", args);
633         if (!args)
634                 return;
635
636         kfree(args->snapdir_name);
637         kfree(args->mds_namespace);
638         kfree(args->server_path);
639         kfree(args->fscache_uniq);
640         kfree(args->mon_addr);
641         fscrypt_free_dummy_policy(&args->dummy_enc_policy);
642         kfree(args);
643 }
644
/*
 * strcmp() that tolerates NULL arguments.
 *
 * Two NULLs compare equal.  If only @s2 is NULL the result is -1, if
 * only @s1 is NULL the result is 1.  Otherwise the result is that of
 * strcmp(@s1, @s2).
 */
static int strcmp_null(const char *s1, const char *s2)
{
	if (s1 && s2)
		return strcmp(s1, s2);
	if (s1)
		return -1;
	return s2 ? 1 : 0;
}
655
656 static int compare_mount_options(struct ceph_mount_options *new_fsopt,
657                                  struct ceph_options *new_opt,
658                                  struct ceph_fs_client *fsc)
659 {
660         struct ceph_mount_options *fsopt1 = new_fsopt;
661         struct ceph_mount_options *fsopt2 = fsc->mount_options;
662         int ofs = offsetof(struct ceph_mount_options, snapdir_name);
663         int ret;
664
665         ret = memcmp(fsopt1, fsopt2, ofs);
666         if (ret)
667                 return ret;
668
669         ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
670         if (ret)
671                 return ret;
672
673         ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
674         if (ret)
675                 return ret;
676
677         ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
678         if (ret)
679                 return ret;
680
681         ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
682         if (ret)
683                 return ret;
684
685         ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr);
686         if (ret)
687                 return ret;
688
689         return ceph_compare_options(new_opt, fsc->client);
690 }
691
692 /**
693  * ceph_show_options - Show mount options in /proc/mounts
694  * @m: seq_file to write to
695  * @root: root of that (sub)tree
696  */
697 static int ceph_show_options(struct seq_file *m, struct dentry *root)
698 {
699         struct ceph_fs_client *fsc = ceph_sb_to_fs_client(root->d_sb);
700         struct ceph_mount_options *fsopt = fsc->mount_options;
701         size_t pos;
702         int ret;
703
704         /* a comma between MNT/MS and client options */
705         seq_putc(m, ',');
706         pos = m->count;
707
708         ret = ceph_print_client_options(m, fsc->client, false);
709         if (ret)
710                 return ret;
711
712         /* retract our comma if no client options */
713         if (m->count == pos)
714                 m->count--;
715
716         if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
717                 seq_puts(m, ",dirstat");
718         if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
719                 seq_puts(m, ",rbytes");
720         if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
721                 seq_puts(m, ",noasyncreaddir");
722         if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
723                 seq_puts(m, ",nodcache");
724         if (fsopt->flags & CEPH_MOUNT_OPT_INO32)
725                 seq_puts(m, ",ino32");
726         if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
727                 seq_show_option(m, "fsc", fsopt->fscache_uniq);
728         }
729         if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
730                 seq_puts(m, ",nopoolperm");
731         if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF)
732                 seq_puts(m, ",noquotadf");
733
734 #ifdef CONFIG_CEPH_FS_POSIX_ACL
735         if (root->d_sb->s_flags & SB_POSIXACL)
736                 seq_puts(m, ",acl");
737         else
738                 seq_puts(m, ",noacl");
739 #endif
740
741         if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
742                 seq_puts(m, ",copyfrom");
743
744         /* dump mds_namespace when old device syntax is in use */
745         if (fsopt->mds_namespace && !fsopt->new_dev_syntax)
746                 seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
747
748         if (fsopt->mon_addr)
749                 seq_printf(m, ",mon_addr=%s", fsopt->mon_addr);
750
751         if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
752                 seq_show_option(m, "recover_session", "clean");
753
754         if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
755                 seq_puts(m, ",wsync");
756         if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
757                 seq_puts(m, ",nopagecache");
758         if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD)
759                 seq_puts(m, ",sparseread");
760
761         fscrypt_show_test_dummy_encryption(m, ',', root->d_sb);
762
763         if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
764                 seq_printf(m, ",wsize=%u", fsopt->wsize);
765         if (fsopt->rsize != CEPH_MAX_READ_SIZE)
766                 seq_printf(m, ",rsize=%u", fsopt->rsize);
767         if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
768                 seq_printf(m, ",rasize=%u", fsopt->rasize);
769         if (fsopt->congestion_kb != default_congestion_kb())
770                 seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb);
771         if (fsopt->caps_max)
772                 seq_printf(m, ",caps_max=%d", fsopt->caps_max);
773         if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
774                 seq_printf(m, ",caps_wanted_delay_min=%u",
775                          fsopt->caps_wanted_delay_min);
776         if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
777                 seq_printf(m, ",caps_wanted_delay_max=%u",
778                            fsopt->caps_wanted_delay_max);
779         if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
780                 seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir);
781         if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
782                 seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes);
783         if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
784                 seq_show_option(m, "snapdirname", fsopt->snapdir_name);
785
786         return 0;
787 }
788
789 /*
790  * handle any mon messages the standard library doesn't understand.
791  * return error if we don't either.
792  */
793 static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
794 {
795         struct ceph_fs_client *fsc = client->private;
796         int type = le16_to_cpu(msg->hdr.type);
797
798         switch (type) {
799         case CEPH_MSG_MDS_MAP:
800                 ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
801                 return 0;
802         case CEPH_MSG_FS_MAP_USER:
803                 ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
804                 return 0;
805         default:
806                 return -1;
807         }
808 }
809
810 /*
811  * create a new fs client
812  *
813  * Success or not, this function consumes @fsopt and @opt.
814  */
815 static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
816                                         struct ceph_options *opt)
817 {
818         struct ceph_fs_client *fsc;
819         int err;
820
821         fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
822         if (!fsc) {
823                 err = -ENOMEM;
824                 goto fail;
825         }
826
827         fsc->client = ceph_create_client(opt, fsc);
828         if (IS_ERR(fsc->client)) {
829                 err = PTR_ERR(fsc->client);
830                 goto fail;
831         }
832         opt = NULL; /* fsc->client now owns this */
833
834         fsc->client->extra_mon_dispatch = extra_mon_dispatch;
835         ceph_set_opt(fsc->client, ABORT_ON_FULL);
836
837         if (!fsopt->mds_namespace) {
838                 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
839                                    0, true);
840         } else {
841                 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
842                                    0, false);
843         }
844
845         fsc->mount_options = fsopt;
846
847         fsc->sb = NULL;
848         fsc->mount_state = CEPH_MOUNT_MOUNTING;
849         fsc->filp_gen = 1;
850         fsc->have_copy_from2 = true;
851
852         atomic_long_set(&fsc->writeback_count, 0);
853         fsc->write_congested = false;
854
855         err = -ENOMEM;
856         /*
857          * The number of concurrent works can be high but they don't need
858          * to be processed in parallel, limit concurrency.
859          */
860         fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0);
861         if (!fsc->inode_wq)
862                 goto fail_client;
863         fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
864         if (!fsc->cap_wq)
865                 goto fail_inode_wq;
866
867         hash_init(fsc->async_unlink_conflict);
868         spin_lock_init(&fsc->async_unlink_conflict_lock);
869
870         spin_lock(&ceph_fsc_lock);
871         list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
872         spin_unlock(&ceph_fsc_lock);
873
874         return fsc;
875
876 fail_inode_wq:
877         destroy_workqueue(fsc->inode_wq);
878 fail_client:
879         ceph_destroy_client(fsc->client);
880 fail:
881         kfree(fsc);
882         if (opt)
883                 ceph_destroy_options(opt);
884         destroy_mount_options(fsopt);
885         return ERR_PTR(err);
886 }
887
888 static void flush_fs_workqueues(struct ceph_fs_client *fsc)
889 {
890         flush_workqueue(fsc->inode_wq);
891         flush_workqueue(fsc->cap_wq);
892 }
893
894 static void destroy_fs_client(struct ceph_fs_client *fsc)
895 {
896         doutc(fsc->client, "%p\n", fsc);
897
898         spin_lock(&ceph_fsc_lock);
899         list_del(&fsc->metric_wakeup);
900         spin_unlock(&ceph_fsc_lock);
901
902         ceph_mdsc_destroy(fsc);
903         destroy_workqueue(fsc->inode_wq);
904         destroy_workqueue(fsc->cap_wq);
905
906         destroy_mount_options(fsc->mount_options);
907
908         ceph_destroy_client(fsc->client);
909
910         kfree(fsc);
911         dout("%s: %p done\n", __func__, fsc);
912 }
913
914 /*
915  * caches
916  */
917 struct kmem_cache *ceph_inode_cachep;
918 struct kmem_cache *ceph_cap_cachep;
919 struct kmem_cache *ceph_cap_snap_cachep;
920 struct kmem_cache *ceph_cap_flush_cachep;
921 struct kmem_cache *ceph_dentry_cachep;
922 struct kmem_cache *ceph_file_cachep;
923 struct kmem_cache *ceph_dir_file_cachep;
924 struct kmem_cache *ceph_mds_request_cachep;
925 mempool_t *ceph_wb_pagevec_pool;
926
927 static void ceph_inode_init_once(void *foo)
928 {
929         struct ceph_inode_info *ci = foo;
930         inode_init_once(&ci->netfs.inode);
931 }
932
933 static int __init init_caches(void)
934 {
935         int error = -ENOMEM;
936
937         ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
938                                       sizeof(struct ceph_inode_info),
939                                       __alignof__(struct ceph_inode_info),
940                                       SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
941                                       ceph_inode_init_once);
942         if (!ceph_inode_cachep)
943                 return -ENOMEM;
944
945         ceph_cap_cachep = KMEM_CACHE(ceph_cap, 0);
946         if (!ceph_cap_cachep)
947                 goto bad_cap;
948         ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, 0);
949         if (!ceph_cap_snap_cachep)
950                 goto bad_cap_snap;
951         ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
952                                            SLAB_RECLAIM_ACCOUNT);
953         if (!ceph_cap_flush_cachep)
954                 goto bad_cap_flush;
955
956         ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
957                                         SLAB_RECLAIM_ACCOUNT);
958         if (!ceph_dentry_cachep)
959                 goto bad_dentry;
960
961         ceph_file_cachep = KMEM_CACHE(ceph_file_info, 0);
962         if (!ceph_file_cachep)
963                 goto bad_file;
964
965         ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, 0);
966         if (!ceph_dir_file_cachep)
967                 goto bad_dir_file;
968
969         ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, 0);
970         if (!ceph_mds_request_cachep)
971                 goto bad_mds_req;
972
973         ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10,
974             (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *));
975         if (!ceph_wb_pagevec_pool)
976                 goto bad_pagevec_pool;
977
978         return 0;
979
980 bad_pagevec_pool:
981         kmem_cache_destroy(ceph_mds_request_cachep);
982 bad_mds_req:
983         kmem_cache_destroy(ceph_dir_file_cachep);
984 bad_dir_file:
985         kmem_cache_destroy(ceph_file_cachep);
986 bad_file:
987         kmem_cache_destroy(ceph_dentry_cachep);
988 bad_dentry:
989         kmem_cache_destroy(ceph_cap_flush_cachep);
990 bad_cap_flush:
991         kmem_cache_destroy(ceph_cap_snap_cachep);
992 bad_cap_snap:
993         kmem_cache_destroy(ceph_cap_cachep);
994 bad_cap:
995         kmem_cache_destroy(ceph_inode_cachep);
996         return error;
997 }
998
999 static void destroy_caches(void)
1000 {
1001         /*
1002          * Make sure all delayed rcu free inodes are flushed before we
1003          * destroy cache.
1004          */
1005         rcu_barrier();
1006
1007         kmem_cache_destroy(ceph_inode_cachep);
1008         kmem_cache_destroy(ceph_cap_cachep);
1009         kmem_cache_destroy(ceph_cap_snap_cachep);
1010         kmem_cache_destroy(ceph_cap_flush_cachep);
1011         kmem_cache_destroy(ceph_dentry_cachep);
1012         kmem_cache_destroy(ceph_file_cachep);
1013         kmem_cache_destroy(ceph_dir_file_cachep);
1014         kmem_cache_destroy(ceph_mds_request_cachep);
1015         mempool_destroy(ceph_wb_pagevec_pool);
1016 }
1017
1018 static void __ceph_umount_begin(struct ceph_fs_client *fsc)
1019 {
1020         ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
1021         ceph_mdsc_force_umount(fsc->mdsc);
1022         fsc->filp_gen++; // invalidate open files
1023 }
1024
1025 /*
1026  * ceph_umount_begin - initiate forced umount.  Tear down the
1027  * mount, skipping steps that may hang while waiting for server(s).
1028  */
1029 void ceph_umount_begin(struct super_block *sb)
1030 {
1031         struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
1032
1033         doutc(fsc->client, "starting forced umount\n");
1034         if (!fsc)
1035                 return;
1036         fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
1037         __ceph_umount_begin(fsc);
1038 }
1039
1040 static const struct super_operations ceph_super_ops = {
1041         .alloc_inode    = ceph_alloc_inode,
1042         .free_inode     = ceph_free_inode,
1043         .write_inode    = ceph_write_inode,
1044         .drop_inode     = generic_delete_inode,
1045         .evict_inode    = ceph_evict_inode,
1046         .sync_fs        = ceph_sync_fs,
1047         .put_super      = ceph_put_super,
1048         .show_options   = ceph_show_options,
1049         .statfs         = ceph_statfs,
1050         .umount_begin   = ceph_umount_begin,
1051 };
1052
1053 /*
1054  * Bootstrap mount by opening the root directory.  Note the mount
1055  * @started time from caller, and time out if this takes too long.
1056  */
1057 static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
1058                                        const char *path,
1059                                        unsigned long started)
1060 {
1061         struct ceph_client *cl = fsc->client;
1062         struct ceph_mds_client *mdsc = fsc->mdsc;
1063         struct ceph_mds_request *req = NULL;
1064         int err;
1065         struct dentry *root;
1066
1067         /* open dir */
1068         doutc(cl, "opening '%s'\n", path);
1069         req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1070         if (IS_ERR(req))
1071                 return ERR_CAST(req);
1072         req->r_path1 = kstrdup(path, GFP_NOFS);
1073         if (!req->r_path1) {
1074                 root = ERR_PTR(-ENOMEM);
1075                 goto out;
1076         }
1077
1078         req->r_ino1.ino = CEPH_INO_ROOT;
1079         req->r_ino1.snap = CEPH_NOSNAP;
1080         req->r_started = started;
1081         req->r_timeout = fsc->client->options->mount_timeout;
1082         req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
1083         req->r_num_caps = 2;
1084         err = ceph_mdsc_do_request(mdsc, NULL, req);
1085         if (err == 0) {
1086                 struct inode *inode = req->r_target_inode;
1087                 req->r_target_inode = NULL;
1088                 doutc(cl, "success\n");
1089                 root = d_make_root(inode);
1090                 if (!root) {
1091                         root = ERR_PTR(-ENOMEM);
1092                         goto out;
1093                 }
1094                 doutc(cl, "success, root dentry is %p\n", root);
1095         } else {
1096                 root = ERR_PTR(err);
1097         }
1098 out:
1099         ceph_mdsc_put_request(req);
1100         return root;
1101 }
1102
#ifdef CONFIG_FS_ENCRYPTION
/*
 * Move the test_dummy_encryption policy parsed into @fsopt over to the
 * fs client attached to @sb.
 *
 * Returns 0 if nothing needs doing or the policy was applied, -EINVAL
 * (with a message logged on @fc) if the request is rejected.
 */
static int ceph_apply_test_dummy_encryption(struct super_block *sb,
					    struct fs_context *fc,
					    struct ceph_mount_options *fsopt)
{
	struct ceph_fs_client *fsc = sb->s_fs_info;
	struct fscrypt_dummy_policy *wanted = &fsopt->dummy_enc_policy;
	struct fscrypt_dummy_policy *active = &fsc->fsc_dummy_enc_policy;

	/* option not given: nothing to apply */
	if (!fscrypt_is_dummy_policy_set(wanted))
		return 0;

	/* No changing encryption context on remount. */
	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
	    !fscrypt_is_dummy_policy_set(active)) {
		if (fscrypt_dummy_policies_equal(wanted, active))
			return 0;
		errorfc(fc, "Can't set test_dummy_encryption on remount");
		return -EINVAL;
	}

	/* Also make sure fsopt doesn't contain a conflicting value. */
	if (fscrypt_is_dummy_policy_set(active)) {
		if (fscrypt_dummy_policies_equal(wanted, active))
			return 0;
		errorfc(fc, "Conflicting test_dummy_encryption options");
		return -EINVAL;
	}

	/* hand the policy to the client and clear the copy in fsopt */
	*active = *wanted;
	memset(wanted, 0, sizeof(*wanted));

	warnfc(fc, "test_dummy_encryption mode enabled");
	return 0;
}
#else
/* Without CONFIG_FS_ENCRYPTION there is never a policy to apply. */
static int ceph_apply_test_dummy_encryption(struct super_block *sb,
					    struct fs_context *fc,
					    struct ceph_mount_options *fsopt)
{
	return 0;
}
#endif
1146
1147 /*
1148  * mount: join the ceph cluster, and open root directory.
1149  */
1150 static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
1151                                       struct fs_context *fc)
1152 {
1153         struct ceph_client *cl = fsc->client;
1154         int err;
1155         unsigned long started = jiffies;  /* note the start time */
1156         struct dentry *root;
1157
1158         doutc(cl, "mount start %p\n", fsc);
1159         mutex_lock(&fsc->client->mount_mutex);
1160
1161         if (!fsc->sb->s_root) {
1162                 const char *path = fsc->mount_options->server_path ?
1163                                      fsc->mount_options->server_path + 1 : "";
1164
1165                 err = __ceph_open_session(fsc->client, started);
1166                 if (err < 0)
1167                         goto out;
1168
1169                 /* setup fscache */
1170                 if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) {
1171                         err = ceph_fscache_register_fs(fsc, fc);
1172                         if (err < 0)
1173                                 goto out;
1174                 }
1175
1176                 err = ceph_apply_test_dummy_encryption(fsc->sb, fc,
1177                                                        fsc->mount_options);
1178                 if (err)
1179                         goto out;
1180
1181                 doutc(cl, "mount opening path '%s'\n", path);
1182
1183                 ceph_fs_debugfs_init(fsc);
1184
1185                 root = open_root_dentry(fsc, path, started);
1186                 if (IS_ERR(root)) {
1187                         err = PTR_ERR(root);
1188                         goto out;
1189                 }
1190                 fsc->sb->s_root = dget(root);
1191         } else {
1192                 root = dget(fsc->sb->s_root);
1193         }
1194
1195         fsc->mount_state = CEPH_MOUNT_MOUNTED;
1196         doutc(cl, "mount success\n");
1197         mutex_unlock(&fsc->client->mount_mutex);
1198         return root;
1199
1200 out:
1201         mutex_unlock(&fsc->client->mount_mutex);
1202         ceph_fscrypt_free_dummy_policy(fsc);
1203         return ERR_PTR(err);
1204 }
1205
1206 static int ceph_set_super(struct super_block *s, struct fs_context *fc)
1207 {
1208         struct ceph_fs_client *fsc = s->s_fs_info;
1209         struct ceph_client *cl = fsc->client;
1210         int ret;
1211
1212         doutc(cl, "%p\n", s);
1213
1214         s->s_maxbytes = MAX_LFS_FILESIZE;
1215
1216         s->s_xattr = ceph_xattr_handlers;
1217         fsc->sb = s;
1218         fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */
1219
1220         s->s_op = &ceph_super_ops;
1221         s->s_d_op = &ceph_dentry_ops;
1222         s->s_export_op = &ceph_export_ops;
1223
1224         s->s_time_gran = 1;
1225         s->s_time_min = 0;
1226         s->s_time_max = U32_MAX;
1227         s->s_flags |= SB_NODIRATIME | SB_NOATIME;
1228
1229         ceph_fscrypt_set_ops(s);
1230
1231         ret = set_anon_super_fc(s, fc);
1232         if (ret != 0)
1233                 fsc->sb = NULL;
1234         return ret;
1235 }
1236
1237 /*
1238  * share superblock if same fs AND options
1239  */
1240 static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
1241 {
1242         struct ceph_fs_client *new = fc->s_fs_info;
1243         struct ceph_mount_options *fsopt = new->mount_options;
1244         struct ceph_options *opt = new->client->options;
1245         struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
1246         struct ceph_client *cl = fsc->client;
1247
1248         doutc(cl, "%p\n", sb);
1249
1250         if (compare_mount_options(fsopt, opt, fsc)) {
1251                 doutc(cl, "monitor(s)/mount options don't match\n");
1252                 return 0;
1253         }
1254         if ((opt->flags & CEPH_OPT_FSID) &&
1255             ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
1256                 doutc(cl, "fsid doesn't match\n");
1257                 return 0;
1258         }
1259         if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) {
1260                 doutc(cl, "flags differ\n");
1261                 return 0;
1262         }
1263
1264         if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
1265                 doutc(cl, "client is blocklisted (and CLEANRECOVER is not set)\n");
1266                 return 0;
1267         }
1268
1269         if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
1270                 doutc(cl, "client has been forcibly unmounted\n");
1271                 return 0;
1272         }
1273
1274         return 1;
1275 }
1276
1277 /*
1278  * construct our own bdi so we can control readahead, etc.
1279  */
1280 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
1281
1282 static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
1283 {
1284         int err;
1285
1286         err = super_setup_bdi_name(sb, "ceph-%ld",
1287                                    atomic_long_inc_return(&bdi_seq));
1288         if (err)
1289                 return err;
1290
1291         /* set ra_pages based on rasize mount option? */
1292         sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT;
1293
1294         /* set io_pages based on max osd read size */
1295         sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT;
1296
1297         return 0;
1298 }
1299
1300 static int ceph_get_tree(struct fs_context *fc)
1301 {
1302         struct ceph_parse_opts_ctx *pctx = fc->fs_private;
1303         struct ceph_mount_options *fsopt = pctx->opts;
1304         struct super_block *sb;
1305         struct ceph_fs_client *fsc;
1306         struct dentry *res;
1307         int (*compare_super)(struct super_block *, struct fs_context *) =
1308                 ceph_compare_super;
1309         int err;
1310
1311         dout("ceph_get_tree\n");
1312
1313         if (!fc->source)
1314                 return invalfc(fc, "No source");
1315         if (fsopt->new_dev_syntax && !fsopt->mon_addr)
1316                 return invalfc(fc, "No monitor address");
1317
1318         /* create client (which we may/may not use) */
1319         fsc = create_fs_client(pctx->opts, pctx->copts);
1320         pctx->opts = NULL;
1321         pctx->copts = NULL;
1322         if (IS_ERR(fsc)) {
1323                 err = PTR_ERR(fsc);
1324                 goto out_final;
1325         }
1326
1327         err = ceph_mdsc_init(fsc);
1328         if (err < 0)
1329                 goto out;
1330
1331         if (ceph_test_opt(fsc->client, NOSHARE))
1332                 compare_super = NULL;
1333
1334         fc->s_fs_info = fsc;
1335         sb = sget_fc(fc, compare_super, ceph_set_super);
1336         fc->s_fs_info = NULL;
1337         if (IS_ERR(sb)) {
1338                 err = PTR_ERR(sb);
1339                 goto out;
1340         }
1341
1342         if (ceph_sb_to_fs_client(sb) != fsc) {
1343                 destroy_fs_client(fsc);
1344                 fsc = ceph_sb_to_fs_client(sb);
1345                 dout("get_sb got existing client %p\n", fsc);
1346         } else {
1347                 dout("get_sb using new client %p\n", fsc);
1348                 err = ceph_setup_bdi(sb, fsc);
1349                 if (err < 0)
1350                         goto out_splat;
1351         }
1352
1353         res = ceph_real_mount(fsc, fc);
1354         if (IS_ERR(res)) {
1355                 err = PTR_ERR(res);
1356                 goto out_splat;
1357         }
1358
1359         doutc(fsc->client, "root %p inode %p ino %llx.%llx\n", res,
1360                     d_inode(res), ceph_vinop(d_inode(res)));
1361         fc->root = fsc->sb->s_root;
1362         return 0;
1363
1364 out_splat:
1365         if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
1366                 pr_info("No mds server is up or the cluster is laggy\n");
1367                 err = -EHOSTUNREACH;
1368         }
1369
1370         ceph_mdsc_close_sessions(fsc->mdsc);
1371         deactivate_locked_super(sb);
1372         goto out_final;
1373
1374 out:
1375         destroy_fs_client(fsc);
1376 out_final:
1377         dout("ceph_get_tree fail %d\n", err);
1378         return err;
1379 }
1380
1381 static void ceph_free_fc(struct fs_context *fc)
1382 {
1383         struct ceph_parse_opts_ctx *pctx = fc->fs_private;
1384
1385         if (pctx) {
1386                 destroy_mount_options(pctx->opts);
1387                 ceph_destroy_options(pctx->copts);
1388                 kfree(pctx);
1389         }
1390 }
1391
1392 static int ceph_reconfigure_fc(struct fs_context *fc)
1393 {
1394         int err;
1395         struct ceph_parse_opts_ctx *pctx = fc->fs_private;
1396         struct ceph_mount_options *fsopt = pctx->opts;
1397         struct super_block *sb = fc->root->d_sb;
1398         struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
1399
1400         err = ceph_apply_test_dummy_encryption(sb, fc, fsopt);
1401         if (err)
1402                 return err;
1403
1404         if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1405                 ceph_set_mount_opt(fsc, ASYNC_DIROPS);
1406         else
1407                 ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
1408
1409         if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD)
1410                 ceph_set_mount_opt(fsc, SPARSEREAD);
1411         else
1412                 ceph_clear_mount_opt(fsc, SPARSEREAD);
1413
1414         if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) {
1415                 kfree(fsc->mount_options->mon_addr);
1416                 fsc->mount_options->mon_addr = fsopt->mon_addr;
1417                 fsopt->mon_addr = NULL;
1418                 pr_notice_client(fsc->client,
1419                         "monitor addresses recorded, but not used for reconnection");
1420         }
1421
1422         sync_filesystem(sb);
1423         return 0;
1424 }
1425
1426 static const struct fs_context_operations ceph_context_ops = {
1427         .free           = ceph_free_fc,
1428         .parse_param    = ceph_parse_mount_param,
1429         .get_tree       = ceph_get_tree,
1430         .reconfigure    = ceph_reconfigure_fc,
1431 };
1432
1433 /*
1434  * Set up the filesystem mount context.
1435  */
1436 static int ceph_init_fs_context(struct fs_context *fc)
1437 {
1438         struct ceph_parse_opts_ctx *pctx;
1439         struct ceph_mount_options *fsopt;
1440
1441         pctx = kzalloc(sizeof(*pctx), GFP_KERNEL);
1442         if (!pctx)
1443                 return -ENOMEM;
1444
1445         pctx->copts = ceph_alloc_options();
1446         if (!pctx->copts)
1447                 goto nomem;
1448
1449         pctx->opts = kzalloc(sizeof(*pctx->opts), GFP_KERNEL);
1450         if (!pctx->opts)
1451                 goto nomem;
1452
1453         fsopt = pctx->opts;
1454         fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
1455
1456         fsopt->wsize = CEPH_MAX_WRITE_SIZE;
1457         fsopt->rsize = CEPH_MAX_READ_SIZE;
1458         fsopt->rasize = CEPH_RASIZE_DEFAULT;
1459         fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
1460         if (!fsopt->snapdir_name)
1461                 goto nomem;
1462
1463         fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
1464         fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
1465         fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
1466         fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
1467         fsopt->congestion_kb = default_congestion_kb();
1468
1469 #ifdef CONFIG_CEPH_FS_POSIX_ACL
1470         fc->sb_flags |= SB_POSIXACL;
1471 #endif
1472
1473         fc->fs_private = pctx;
1474         fc->ops = &ceph_context_ops;
1475         return 0;
1476
1477 nomem:
1478         destroy_mount_options(pctx->opts);
1479         ceph_destroy_options(pctx->copts);
1480         kfree(pctx);
1481         return -ENOMEM;
1482 }
1483
1484 /*
1485  * Return true if it successfully increases the blocker counter,
1486  * or false if the mdsc is in stopping and flushed state.
1487  */
1488 static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc)
1489 {
1490         spin_lock(&mdsc->stopping_lock);
1491         if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) {
1492                 spin_unlock(&mdsc->stopping_lock);
1493                 return false;
1494         }
1495         atomic_inc(&mdsc->stopping_blockers);
1496         spin_unlock(&mdsc->stopping_lock);
1497         return true;
1498 }
1499
1500 static void __dec_stopping_blocker(struct ceph_mds_client *mdsc)
1501 {
1502         spin_lock(&mdsc->stopping_lock);
1503         if (!atomic_dec_return(&mdsc->stopping_blockers) &&
1504             mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING)
1505                 complete_all(&mdsc->stopping_waiter);
1506         spin_unlock(&mdsc->stopping_lock);
1507 }
1508
1509 /* For metadata IO requests */
1510 bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc,
1511                                    struct ceph_mds_session *session)
1512 {
1513         mutex_lock(&session->s_mutex);
1514         inc_session_sequence(session);
1515         mutex_unlock(&session->s_mutex);
1516
1517         return __inc_stopping_blocker(mdsc);
1518 }
1519
/* For metadata IO requests: release a blocker taken earlier. */
void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc)
{
	__dec_stopping_blocker(mdsc);
}
1524
1525 /* For data IO requests */
1526 bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc)
1527 {
1528         return __inc_stopping_blocker(mdsc);
1529 }
1530
/* For data IO requests: release a blocker taken earlier. */
void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc)
{
	__dec_stopping_blocker(mdsc);
}
1535
1536 static void ceph_kill_sb(struct super_block *s)
1537 {
1538         struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
1539         struct ceph_client *cl = fsc->client;
1540         struct ceph_mds_client *mdsc = fsc->mdsc;
1541         bool wait;
1542
1543         doutc(cl, "%p\n", s);
1544
1545         ceph_mdsc_pre_umount(mdsc);
1546         flush_fs_workqueues(fsc);
1547
1548         /*
1549          * Though the kill_anon_super() will finally trigger the
1550          * sync_filesystem() anyway, we still need to do it here and
1551          * then bump the stage of shutdown. This will allow us to
1552          * drop any further message, which will increase the inodes'
1553          * i_count reference counters but makes no sense any more,
1554          * from MDSs.
1555          *
1556          * Without this when evicting the inodes it may fail in the
1557          * kill_anon_super(), which will trigger a warning when
1558          * destroying the fscrypt keyring and then possibly trigger
1559          * a further crash in ceph module when the iput() tries to
1560          * evict the inodes later.
1561          */
1562         sync_filesystem(s);
1563
1564         spin_lock(&mdsc->stopping_lock);
1565         mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING;
1566         wait = !!atomic_read(&mdsc->stopping_blockers);
1567         spin_unlock(&mdsc->stopping_lock);
1568
1569         if (wait && atomic_read(&mdsc->stopping_blockers)) {
1570                 long timeleft = wait_for_completion_killable_timeout(
1571                                         &mdsc->stopping_waiter,
1572                                         fsc->client->options->mount_timeout);
1573                 if (!timeleft) /* timed out */
1574                         pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
1575                 else if (timeleft < 0) /* killed */
1576                         pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
1577         }
1578
1579         mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
1580         kill_anon_super(s);
1581
1582         fsc->client->extra_mon_dispatch = NULL;
1583         ceph_fs_debugfs_cleanup(fsc);
1584
1585         ceph_fscache_unregister_fs(fsc);
1586
1587         destroy_fs_client(fsc);
1588 }
1589
1590 static struct file_system_type ceph_fs_type = {
1591         .owner          = THIS_MODULE,
1592         .name           = "ceph",
1593         .init_fs_context = ceph_init_fs_context,
1594         .kill_sb        = ceph_kill_sb,
1595         .fs_flags       = FS_RENAME_DOES_D_MOVE | FS_ALLOW_IDMAP,
1596 };
1597 MODULE_ALIAS_FS("ceph");
1598
1599 int ceph_force_reconnect(struct super_block *sb)
1600 {
1601         struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
1602         int err = 0;
1603
1604         fsc->mount_state = CEPH_MOUNT_RECOVER;
1605         __ceph_umount_begin(fsc);
1606
1607         /* Make sure all page caches get invalidated.
1608          * see remove_session_caps_cb() */
1609         flush_workqueue(fsc->inode_wq);
1610
1611         /* In case that we were blocklisted. This also reset
1612          * all mon/osd connections */
1613         ceph_reset_client_addr(fsc->client);
1614
1615         ceph_osdc_clear_abort_err(&fsc->client->osdc);
1616
1617         fsc->blocklisted = false;
1618         fsc->mount_state = CEPH_MOUNT_MOUNTED;
1619
1620         if (sb->s_root) {
1621                 err = __ceph_do_getattr(d_inode(sb->s_root), NULL,
1622                                         CEPH_STAT_CAP_INODE, true);
1623         }
1624         return err;
1625 }
1626
1627 static int __init init_ceph(void)
1628 {
1629         int ret = init_caches();
1630         if (ret)
1631                 goto out;
1632
1633         ceph_flock_init();
1634         ret = register_filesystem(&ceph_fs_type);
1635         if (ret)
1636                 goto out_caches;
1637
1638         pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1639
1640         return 0;
1641
1642 out_caches:
1643         destroy_caches();
1644 out:
1645         return ret;
1646 }
1647
1648 static void __exit exit_ceph(void)
1649 {
1650         dout("exit_ceph\n");
1651         unregister_filesystem(&ceph_fs_type);
1652         destroy_caches();
1653 }
1654
1655 static int param_set_metrics(const char *val, const struct kernel_param *kp)
1656 {
1657         struct ceph_fs_client *fsc;
1658         int ret;
1659
1660         ret = param_set_bool(val, kp);
1661         if (ret) {
1662                 pr_err("Failed to parse sending metrics switch value '%s'\n",
1663                        val);
1664                 return ret;
1665         } else if (!disable_send_metrics) {
1666                 // wake up all the mds clients
1667                 spin_lock(&ceph_fsc_lock);
1668                 list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) {
1669                         metric_schedule_delayed(&fsc->mdsc->metric);
1670                 }
1671                 spin_unlock(&ceph_fsc_lock);
1672         }
1673
1674         return 0;
1675 }
1676
1677 static const struct kernel_param_ops param_ops_metrics = {
1678         .set = param_set_metrics,
1679         .get = param_get_bool,
1680 };
1681
1682 bool disable_send_metrics = false;
1683 module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644);
1684 MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
1685
1686 /* for both v1 and v2 syntax */
1687 static bool mount_support = true;
1688 static const struct kernel_param_ops param_ops_mount_syntax = {
1689         .get = param_get_bool,
1690 };
1691 module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444);
1692 module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444);
1693
1694 bool enable_unsafe_idmap = false;
1695 module_param(enable_unsafe_idmap, bool, 0644);
1696 MODULE_PARM_DESC(enable_unsafe_idmap,
1697                  "Allow to use idmapped mounts with MDS without CEPHFS_FEATURE_HAS_OWNER_UIDGID");
1698
1699 module_init(init_ceph);
1700 module_exit(exit_ceph);
1701
1702 MODULE_AUTHOR("Sage Weil <[email protected]>");
1703 MODULE_AUTHOR("Yehuda Sadeh <[email protected]>");
1704 MODULE_AUTHOR("Patience Warnick <[email protected]>");
1705 MODULE_DESCRIPTION("Ceph filesystem for Linux");
1706 MODULE_LICENSE("GPL");
This page took 0.125869 seconds and 4 git commands to generate.