]>
Commit | Line | Data |
---|---|---|
9cfcceea RP |
1 | Shared Subtrees |
2 | --------------- | |
3 | ||
4 | Contents: | |
5 | 1) Overview | |
6 | 2) Features | |
7 | 3) smount command | |
8 | 4) Use-case | |
9 | 5) Detailed semantics | |
10 | 6) Quiz | |
11 | 7) FAQ | |
12 | 8) Implementation | |
13 | ||
14 | ||
15 | 1) Overview | |
16 | ----------- | |
17 | ||
18 | Consider the following situation: | |
19 | ||
20 | A process wants to clone its own namespace, but still wants to access the CD | |
21 | that got mounted recently. Shared subtree semantics provide the necessary | |
22 | mechanism to accomplish the above. | |
23 | ||
24 | It provides the necessary building blocks for features like per-user-namespace | |
25 | and versioned filesystem. | |
26 | ||
27 | 2) Features | |
28 | ----------- | |
29 | ||
30 | Shared subtree provides four different flavors of mounts; struct vfsmount to be | |
31 | precise | |
32 | ||
33 | a. shared mount | |
34 | b. slave mount | |
35 | c. private mount | |
36 | d. unbindable mount | |
37 | ||
38 | ||
39 | 2a) A shared mount can be replicated to as many mountpoints and all the | |
40 | replicas continue to be exactly the same. | |
41 | ||
42 | Here is an example: | |
43 | ||
44 | Lets say /mnt has a mount that is shared. | |
45 | mount --make-shared /mnt | |
46 | ||
47 | note: mount command does not yet support the --make-shared flag. | |
48 | I have included a small C program which does the same by executing | |
49 | 'smount /mnt shared' | |
50 | ||
51 | #mount --bind /mnt /tmp | |
52 | The above command replicates the mount at /mnt to the mountpoint /tmp | |
53 | and the contents of both the mounts remain identical. | |
54 | ||
55 | #ls /mnt | |
56 | a b c | |
57 | ||
58 | #ls /tmp | |
59 | a b c | |
60 | ||
61 | Now lets say we mount a device at /tmp/a | |
62 | #mount /dev/sd0 /tmp/a | |
63 | ||
64 | #ls /tmp/a | |
65 | t1 t2 t3 | |
66 | ||
67 | #ls /mnt/a | |
68 | t1 t2 t3 | |
69 | ||
70 | Note that the mount has propagated to the mount at /mnt as well. | |
71 | ||
72 | And the same is true even when /dev/sd0 is mounted on /mnt/a. The | |
73 | contents will be visible under /tmp/a too. | |
74 | ||
75 | ||
76 | 2b) A slave mount is like a shared mount except that mount and umount events | |
77 | only propagate towards it. | |
78 | ||
79 | All slave mounts have a master mount which is shared. | |
80 | ||
81 | Here is an example: | |
82 | ||
83 | Lets say /mnt has a mount which is shared. | |
84 | #mount --make-shared /mnt | |
85 | ||
86 | Lets bind mount /mnt to /tmp | |
87 | #mount --bind /mnt /tmp | |
88 | ||
89 | the new mount at /tmp becomes a shared mount and it is a replica of | |
90 | the mount at /mnt. | |
91 | ||
92 | Now lets make the mount at /tmp; a slave of /mnt | |
93 | #mount --make-slave /tmp | |
94 | [or smount /tmp slave] | |
95 | ||
96 | lets mount /dev/sd0 on /mnt/a | |
97 | #mount /dev/sd0 /mnt/a | |
98 | ||
99 | #ls /mnt/a | |
100 | t1 t2 t3 | |
101 | ||
102 | #ls /tmp/a | |
103 | t1 t2 t3 | |
104 | ||
105 | Note the mount event has propagated to the mount at /tmp | |
106 | ||
107 | However lets see what happens if we mount something on the mount at /tmp | |
108 | ||
109 | #mount /dev/sd1 /tmp/b | |
110 | ||
111 | #ls /tmp/b | |
112 | s1 s2 s3 | |
113 | ||
114 | #ls /mnt/b | |
115 | ||
116 | Note how the mount event has not propagated to the mount at | |
117 | /mnt | |
118 | ||
119 | ||
120 | 2c) A private mount does not forward or receive propagation. | |
121 | ||
122 | This is the mount we are familiar with. It's the default type. | |
123 | ||
124 | ||
125 | 2d) An unbindable mount is an unbindable private mount | |
126 | ||
127 | Let's say we have a mount at /mnt and we make it unbindable | |
128 | ||
129 | #mount --make-unbindable /mnt | |
130 | [ smount /mnt unbindable ] | |
131 | ||
132 | Lets try to bind mount this mount somewhere else. | |
133 | # mount --bind /mnt /tmp | |
134 | mount: wrong fs type, bad option, bad superblock on /mnt, | |
135 | or too many mounted file systems | |
136 | ||
137 | Binding an unbindable mount is an invalid operation. | |
138 | ||
139 | ||
140 | 3) smount command | |
141 | ||
142 | Currently the mount command is not aware of shared subtree features. | |
143 | Work is in progress to add the support in mount ( util-linux package ). | |
144 | Till then use the following program. | |
145 | ||
146 | ------------------------------------------------------------------------ | |
147 | // | |
148 | //this code was developed by Miklos Szeredi <[email protected]> | |
149 | //and modified by Ram Pai <[email protected]> | |
150 | // sample usage: | |
151 | // smount /tmp shared | |
152 | // | |
153 | #include <stdio.h> | |
154 | #include <stdlib.h> | |
155 | #include <unistd.h> | |
2de206d8 | 156 | #include <string.h> |
9cfcceea RP |
157 | #include <sys/mount.h> |
158 | #include <sys/fsuid.h> | |
159 | ||
160 | #ifndef MS_REC | |
161 | #define MS_REC 0x4000 /* 16384: Recursive loopback */ | |
162 | #endif | |
163 | ||
164 | #ifndef MS_SHARED | |
165 | #define MS_SHARED 1<<20 /* Shared */ | |
166 | #endif | |
167 | ||
168 | #ifndef MS_PRIVATE | |
169 | #define MS_PRIVATE 1<<18 /* Private */ | |
170 | #endif | |
171 | ||
172 | #ifndef MS_SLAVE | |
173 | #define MS_SLAVE 1<<19 /* Slave */ | |
174 | #endif | |
175 | ||
176 | #ifndef MS_UNBINDABLE | |
177 | #define MS_UNBINDABLE 1<<17 /* Unbindable */ | |
178 | #endif | |
179 | ||
180 | int main(int argc, char *argv[]) | |
181 | { | |
182 | int type; | |
183 | if(argc != 3) { | |
184 | fprintf(stderr, "usage: %s dir " | |
185 | "<rshared|rslave|rprivate|runbindable|shared|slave" | |
186 | "|private|unbindable>\n" , argv[0]); | |
187 | return 1; | |
188 | } | |
189 | ||
190 | fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]); | |
191 | ||
192 | if (strcmp(argv[2],"rshared")==0) | |
193 | type=(MS_SHARED|MS_REC); | |
194 | else if (strcmp(argv[2],"rslave")==0) | |
195 | type=(MS_SLAVE|MS_REC); | |
196 | else if (strcmp(argv[2],"rprivate")==0) | |
197 | type=(MS_PRIVATE|MS_REC); | |
198 | else if (strcmp(argv[2],"runbindable")==0) | |
199 | type=(MS_UNBINDABLE|MS_REC); | |
200 | else if (strcmp(argv[2],"shared")==0) | |
201 | type=MS_SHARED; | |
202 | else if (strcmp(argv[2],"slave")==0) | |
203 | type=MS_SLAVE; | |
204 | else if (strcmp(argv[2],"private")==0) | |
205 | type=MS_PRIVATE; | |
206 | else if (strcmp(argv[2],"unbindable")==0) | |
207 | type=MS_UNBINDABLE; | |
208 | else { | |
209 | fprintf(stderr, "invalid operation: %s\n", argv[2]); | |
210 | return 1; | |
211 | } | |
212 | setfsuid(getuid()); | |
213 | ||
214 | if(mount("", argv[1], "dontcare", type, "") == -1) { | |
215 | perror("mount"); | |
216 | return 1; | |
217 | } | |
218 | return 0; | |
219 | } | |
220 | ----------------------------------------------------------------------- | |
221 | ||
222 | Copy the above code snippet into smount.c | |
223 | gcc -o smount smount.c | |
224 | ||
225 | ||
226 | (i) To mark all the mounts under /mnt as shared execute the following | |
227 | command: | |
228 | ||
229 | smount /mnt rshared | |
230 | the corresponding syntax planned for mount command is | |
231 | mount --make-rshared /mnt | |
232 | ||
233 | just to mark a mount /mnt as shared, execute the following | |
234 | command: | |
235 | smount /mnt shared | |
236 | the corresponding syntax planned for mount command is | |
237 | mount --make-shared /mnt | |
238 | ||
239 | (ii) To mark all the shared mounts under /mnt as slave execute the | |
240 | following | |
241 | ||
242 | command: | |
243 | smount /mnt rslave | |
244 | the corresponding syntax planned for mount command is | |
245 | mount --make-rslave /mnt | |
246 | ||
247 | just to mark a mount /mnt as slave, execute the following | |
248 | command: | |
249 | smount /mnt slave | |
250 | the corresponding syntax planned for mount command is | |
251 | mount --make-slave /mnt | |
252 | ||
253 | (iii) To mark all the mounts under /mnt as private execute the | |
254 | following command: | |
255 | ||
256 | smount /mnt rprivate | |
257 | the corresponding syntax planned for mount command is | |
258 | mount --make-rprivate /mnt | |
259 | ||
260 | just to mark a mount /mnt as private, execute the following | |
261 | command: | |
262 | smount /mnt private | |
263 | the corresponding syntax planned for mount command is | |
264 | mount --make-private /mnt | |
265 | ||
266 | NOTE: by default all the mounts are created as private. But if | |
267 | you want to change some shared/slave/unbindable mount as | |
268 | private at a later point in time, this command can help. | |
269 | ||
270 | (iv) To mark all the mounts under /mnt as unbindable execute the | |
271 | following | |
272 | ||
273 | command: | |
274 | smount /mnt runbindable | |
275 | the corresponding syntax planned for mount command is | |
276 | mount --make-runbindable /mnt | |
277 | ||
278 | just to mark a mount /mnt as unbindable, execute the following | |
279 | command: | |
280 | smount /mnt unbindable | |
281 | the corresponding syntax planned for mount command is | |
282 | mount --make-unbindable /mnt | |
283 | ||
284 | ||
285 | 4) Use cases | |
286 | ------------ | |
287 | ||
288 | A) A process wants to clone its own namespace, but still wants to | |
289 | access the CD that got mounted recently. | |
290 | ||
291 | Solution: | |
292 | ||
293 | The system administrator can make the mount at /cdrom shared | |
294 | mount --bind /cdrom /cdrom | |
295 | mount --make-shared /cdrom | |
296 | ||
297 | Now any process that clones off a new namespace will have a | |
298 | mount at /cdrom which is a replica of the same mount in the | |
299 | parent namespace. | |
300 | ||
301 | So when a CD is inserted and mounted at /cdrom that mount gets | |
302 | propagated to the other mount at /cdrom in all the other clone | |
303 | namespaces. | |
304 | ||
305 | B) A process wants its mounts invisible to any other process, but | |
306 | still be able to see the other system mounts. | |
307 | ||
308 | Solution: | |
309 | ||
310 | To begin with, the administrator can mark the entire mount tree | |
311 | as shareable. | |
312 | ||
313 | mount --make-rshared / | |
314 | ||
315 | A new process can clone off a new namespace. And mark some part | |
316 | of its namespace as slave | |
317 | ||
318 | mount --make-rslave /myprivatetree | |
319 | ||
320 | Henceforth any mounts within the /myprivatetree done by the | |
321 | process will not show up in any other namespace. However mounts | |
322 | done in the parent namespace under /myprivatetree still show | |
323 | up in the process's namespace. | |
324 | ||
325 | ||
326 | Apart from the above semantics this feature provides the | |
327 | building blocks to solve the following problems: | |
328 | ||
329 | C) Per-user namespace | |
330 | ||
331 | The above semantics allows a way to share mounts across | |
332 | namespaces. But namespaces are associated with processes. If | |
333 | namespaces are made first class objects with user API to | |
334 | associate/disassociate a namespace with userid, then each user | |
335 | could have his/her own namespace and tailor it to his/her | |
336 | requirements. Of course, it needs support from PAM. | |
337 | ||
338 | D) Versioned files | |
339 | ||
340 | If the entire mount tree is visible at multiple locations, then | |
341 | an underlying versioning file system can return different | |
342 | version of the file depending on the path used to access that | |
343 | file. | |
344 | ||
345 | An example is: | |
346 | ||
347 | mount --make-shared / | |
348 | mount --rbind / /view/v1 | |
349 | mount --rbind / /view/v2 | |
350 | mount --rbind / /view/v3 | |
351 | mount --rbind / /view/v4 | |
352 | ||
353 | and if /usr has a versioning filesystem mounted, then that | |
354 | mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and | |
355 | /view/v4/usr too | |
356 | ||
357 | A user can request v3 version of the file /usr/fs/namespace.c | |
358 | by accessing /view/v3/usr/fs/namespace.c . The underlying | |
359 | versioning filesystem can then decipher that v3 version of the | |
360 | filesystem is being requested and return the corresponding | |
361 | inode. | |
362 | ||
363 | 5) Detailed semantics: | |
364 | ------------------- | |
365 | The section below explains the detailed semantics of | |
366 | bind, rbind, move, mount, umount and clone-namespace operations. | |
367 | ||
368 | Note: the word 'vfsmount' and the noun 'mount' have been used | |
369 | to mean the same thing, throughout this document. | |
370 | ||
371 | 5a) Mount states | |
372 | ||
373 | A given mount can be in one of the following states | |
374 | 1) shared | |
375 | 2) slave | |
376 | 3) shared and slave | |
377 | 4) private | |
378 | 5) unbindable | |
379 | ||
380 | A 'propagation event' is defined as event generated on a vfsmount | |
381 | that leads to mount or unmount actions in other vfsmounts. | |
382 | ||
383 | A 'peer group' is defined as a group of vfsmounts that propagate | |
384 | events to each other. | |
385 | ||
386 | (1) Shared mounts | |
387 | ||
388 | A 'shared mount' is defined as a vfsmount that belongs to a | |
389 | 'peer group'. | |
390 | ||
391 | For example: | |
392 | mount --make-shared /mnt | |
393 | mount --bind /mnt /tmp | |
394 | ||
395 | The mount at /mnt and that at /tmp are both shared and belong | |
396 | to the same peer group. Anything mounted or unmounted under | |
397 | /mnt or /tmp is reflected in all the other mounts of its peer | |
398 | group. | |
399 | ||
400 | ||
401 | (2) Slave mounts | |
402 | ||
403 | A 'slave mount' is defined as a vfsmount that receives | |
404 | propagation events and does not forward propagation events. | |
405 | ||
406 | A slave mount as the name implies has a master mount from which | |
407 | mount/unmount events are received. Events do not propagate from | |
408 | the slave mount to the master. Only a shared mount can be made | |
409 | a slave by executing the following command | |
410 | ||
411 | mount --make-slave mount | |
412 | ||
413 | A shared mount that is made a slave is no longer shared unless | |
414 | modified to become shared. | |
415 | ||
416 | (3) Shared and Slave | |
417 | ||
418 | A vfsmount can be both shared as well as slave. This state | |
419 | indicates that the mount is a slave of some vfsmount, and | |
420 | has its own peer group too. This vfsmount receives propagation | |
421 | events from its master vfsmount, and also forwards propagation | |
422 | events to its 'peer group' and to its slave vfsmounts. | |
423 | ||
424 | Strictly speaking, the vfsmount is shared having its own | |
425 | peer group, and this peer-group is a slave of some other | |
426 | peer group. | |
427 | ||
428 | Only a slave vfsmount can be made as 'shared and slave' by | |
429 | either executing the following command | |
430 | mount --make-shared mount | |
431 | or by moving the slave vfsmount under a shared vfsmount. | |
432 | ||
433 | (4) Private mount | |
434 | ||
435 | A 'private mount' is defined as vfsmount that does not | |
436 | receive or forward any propagation events. | |
437 | ||
438 | (5) Unbindable mount | |
439 | ||
440 | A 'unbindable mount' is defined as vfsmount that does not | |
441 | receive or forward any propagation events and cannot | |
442 | be bind mounted. | |
443 | ||
444 | ||
445 | State diagram: | |
446 | The state diagram below explains the state transition of a mount, | |
447 | in response to various commands. | |
448 | ------------------------------------------------------------------------ | |
449 | | |make-shared | make-slave | make-private |make-unbindab| | |
450 | --------------|------------|--------------|--------------|-------------| | |
451 | |shared |shared |*slave/private| private | unbindable | | |
452 | | | | | | | | |
453 | |-------------|------------|--------------|--------------|-------------| | |
454 | |slave |shared | **slave | private | unbindable | | |
455 | | |and slave | | | | | |
456 | |-------------|------------|--------------|--------------|-------------| | |
457 | |shared |shared | slave | private | unbindable | | |
458 | |and slave |and slave | | | | | |
459 | |-------------|------------|--------------|--------------|-------------| | |
460 | |private |shared | **private | private | unbindable | | |
461 | |-------------|------------|--------------|--------------|-------------| | |
462 | |unbindable |shared |**unbindable | private | unbindable | | |
463 | ------------------------------------------------------------------------ | |
464 | ||
465 | * if the shared mount is the only mount in its peer group, making it | |
466 | slave, makes it private automatically. Note that there is no master to | |
467 | which it can be slaved. | |
468 | ||
469 | ** slaving a non-shared mount has no effect on the mount. | |
470 | ||
471 | Apart from the commands listed above, the 'move' operation also changes | |
472 | the state of a mount depending on the type of the destination mount. It's | |
473 | explained in section 5d. | |
474 | ||
475 | 5b) Bind semantics | |
476 | ||
477 | Consider the following command | |
478 | ||
479 | mount --bind A/a B/b | |
480 | ||
481 | where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B' | |
482 | is the destination mount and 'b' is the dentry in the destination mount. | |
483 | ||
484 | The outcome depends on the type of mount of 'A' and 'B'. The table | |
485 | below contains quick reference. | |
486 | --------------------------------------------------------------------------- | |
487 | | BIND MOUNT OPERATION | | |
488 | |************************************************************************** | |
489 | |source(A)->| shared | private | slave | unbindable | | |
490 | | dest(B) | | | | | | |
491 | | | | | | | | | |
492 | | v | | | | | | |
493 | |************************************************************************** | |
494 | | shared | shared | shared | shared & slave | invalid | | |
495 | | | | | | | | |
496 | |non-shared| shared | private | slave | invalid | | |
497 | *************************************************************************** | |
498 | ||
499 | Details: | |
500 | ||
501 | 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C' | |
502 | which is clone of 'A', is created. Its root dentry is 'a' . 'C' is | |
503 | mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... | |
504 | are created and mounted at the dentry 'b' on all mounts where 'B' | |
505 | propagates to. A new propagation tree containing 'C1',..,'Cn' is | |
506 | created. This propagation tree is identical to the propagation tree of | |
507 | 'B'. And finally the peer-group of 'C' is merged with the peer group | |
508 | of 'A'. | |
509 | ||
510 | 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C' | |
511 | which is clone of 'A', is created. Its root dentry is 'a'. 'C' is | |
512 | mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... | |
513 | are created and mounted at the dentry 'b' on all mounts where 'B' | |
514 | propagates to. A new propagation tree is set containing all new mounts | |
515 | 'C', 'C1', .., 'Cn' with exactly the same configuration as the | |
516 | propagation tree for 'B'. | |
517 | ||
518 | 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new | |
519 | mount 'C' which is clone of 'A', is created. Its root dentry is 'a' . | |
520 | 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', | |
521 | 'C3' ... are created and mounted at the dentry 'b' on all mounts where | |
522 | 'B' propagates to. A new propagation tree containing the new mounts | |
523 | 'C','C1',.. 'Cn' is created. This propagation tree is identical to the | |
524 | propagation tree for 'B'. And finally the mount 'C' and its peer group | |
525 | is made the slave of mount 'Z'. In other words, mount 'C' is in the | |
526 | state 'slave and shared'. | |
527 | ||
528 | 4. 'A' is an unbindable mount and 'B' is a shared mount. This is an | |
529 | invalid operation. | |
530 | ||
531 | 5. 'A' is a private mount and 'B' is a non-shared(private or slave or | |
532 | unbindable) mount. A new mount 'C' which is clone of 'A', is created. | |
533 | Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. | |
534 | ||
535 | 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C' | |
536 | which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is | |
537 | mounted on mount 'B' at dentry 'b'. 'C' is made a member of the | |
538 | peer-group of 'A'. | |
539 | ||
540 | 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A | |
541 | new mount 'C' which is a clone of 'A' is created. Its root dentry is | |
542 | 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a | |
543 | slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of | |
544 | 'Z'. All mount/unmount events on 'Z' propagates to 'A' and 'C'. But | |
545 | mount/unmount on 'A' do not propagate anywhere else. Similarly | |
546 | mount/unmount on 'C' do not propagate anywhere else. | |
547 | ||
548 | 8. 'A' is an unbindable mount and 'B' is a non-shared mount. This is an | |
549 | invalid operation. An unbindable mount cannot be bind mounted. | |
550 | ||
551 | 5c) Rbind semantics | |
552 | ||
553 | rbind is same as bind. Bind replicates the specified mount. Rbind | |
554 | replicates all the mounts in the tree belonging to the specified mount. | |
555 | Rbind mount is bind mount applied to all the mounts in the tree. | |
556 | ||
557 | If the source tree that is rbound has some unbindable mounts, | |
558 | then the subtree under the unbindable mount is pruned in the new | |
559 | location. | |
560 | ||
561 | eg: lets say we have the following mount tree. | |
562 | ||
563 | A | |
564 | / \ | |
565 | B C | |
566 | / \ / \ | |
567 | D E F G | |
568 | ||
569 | Let's say all the mounts except the mount C in the tree are | |
570 | of a type other than unbindable. | |
571 | ||
572 | If this tree is rbound to say Z | |
573 | ||
574 | We will have the following tree at the new location. | |
575 | ||
576 | Z | |
577 | | | |
578 | A' | |
579 | / | |
580 | B' Note how the tree under C is pruned | |
581 | / \ in the new location. | |
582 | D' E' | |
583 | ||
584 | ||
585 | ||
586 | 5d) Move semantics | |
587 | ||
588 | Consider the following command | |
589 | ||
590 | mount --move A B/b | |
591 | ||
592 | where 'A' is the source mount, 'B' is the destination mount and 'b' is | |
593 | the dentry in the destination mount. | |
594 | ||
595 | The outcome depends on the type of the mount of 'A' and 'B'. The table | |
596 | below is a quick reference. | |
597 | --------------------------------------------------------------------------- | |
598 | | MOVE MOUNT OPERATION | | |
599 | |************************************************************************** | |
600 | | source(A)->| shared | private | slave | unbindable | | |
601 | | dest(B) | | | | | | |
602 | | | | | | | | | |
603 | | v | | | | | | |
604 | |************************************************************************** | |
605 | | shared | shared | shared |shared and slave| invalid | | |
606 | | | | | | | | |
607 | |non-shared| shared | private | slave | unbindable | | |
608 | *************************************************************************** | |
609 | NOTE: moving a mount residing under a shared mount is invalid. | |
610 | ||
611 | Details follow: | |
612 | ||
613 | 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is | |
614 | mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An' | |
615 | are created and mounted at dentry 'b' on all mounts that receive | |
616 | propagation from mount 'B'. A new propagation tree is created in the | |
617 | exact same configuration as that of 'B'. This new propagation tree | |
618 | contains all the new mounts 'A1', 'A2'... 'An'. And this new | |
619 | propagation tree is appended to the already existing propagation tree | |
620 | of 'A'. | |
621 | ||
622 | 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is | |
623 | mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 'An' | |
624 | are created and mounted at dentry 'b' on all mounts that receive | |
625 | propagation from mount 'B'. The mount 'A' becomes a shared mount and a | |
626 | propagation tree is created which is identical to that of | |
627 | 'B'. This new propagation tree contains all the new mounts 'A1', | |
628 | 'A2'... 'An'. | |
629 | ||
630 | 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The | |
631 | mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', | |
632 | 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that | |
633 | receive propagation from mount 'B'. A new propagation tree is created | |
634 | in the exact same configuration as that of 'B'. This new propagation | |
635 | tree contains all the new mounts 'A1', 'A2'... 'An'. And this new | |
636 | propagation tree is appended to the already existing propagation tree of | |
637 | 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also | |
638 | becomes 'shared'. | |
639 | ||
640 | 4. 'A' is a unbindable mount and 'B' is a shared mount. The operation | |
641 | is invalid. Because mounting anything on the shared mount 'B' can | |
642 | create new mounts that get mounted on the mounts that receive | |
643 | propagation from 'B'. And since the mount 'A' is unbindable, cloning | |
644 | it to mount at other mountpoints is not possible. | |
645 | ||
646 | 5. 'A' is a private mount and 'B' is a non-shared(private or slave or | |
647 | unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. | |
648 | ||
649 | 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A' | |
650 | is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a | |
651 | shared mount. | |
652 | ||
653 | 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. | |
654 | The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' | |
655 | continues to be a slave mount of mount 'Z'. | |
656 | ||
657 | 8. 'A' is a unbindable mount and 'B' is a non-shared mount. The mount | |
658 | 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a | |
659 | unbindable mount. | |
660 | ||
661 | 5e) Mount semantics | |
662 | ||
663 | Consider the following command | |
664 | ||
665 | mount device B/b | |
666 | ||
667 | 'B' is the destination mount and 'b' is the dentry in the destination | |
668 | mount. | |
669 | ||
670 | The above operation is the same as bind operation with the exception | |
671 | that the source mount is always a private mount. | |
672 | ||
673 | ||
674 | 5f) Unmount semantics | |
675 | ||
676 | Consider the following command | |
677 | ||
678 | umount A | |
679 | ||
680 | where 'A' is a mount mounted on mount 'B' at dentry 'b'. | |
681 | ||
682 | If mount 'B' is shared, then all most-recently-mounted mounts at dentry | |
683 | 'b' on mounts that receive propagation from mount 'B' and do not have | |
684 | sub-mounts within them are unmounted. | |
685 | ||
686 | Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to | |
687 | each other. | |
688 | ||
689 | lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount | |
690 | 'B1', 'B2' and 'B3' respectively. | |
691 | ||
692 | lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on | |
693 | mount 'B1', 'B2' and 'B3' respectively. | |
694 | ||
695 | if 'C1' is unmounted, all the mounts that are most-recently-mounted on | |
696 | 'B1' and on the mounts that 'B1' propagates-to are unmounted. | |
697 | ||
698 | 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount | |
699 | on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'. | |
700 | ||
701 | So all 'C1', 'C2' and 'C3' should be unmounted. | |
702 | ||
703 | If any of 'C2' or 'C3' has some child mounts, then that mount is not | |
704 | unmounted, but all other mounts are unmounted. However if 'C1' is told | |
705 | to be unmounted and 'C1' has some sub-mounts, the umount operation | |
706 | fails entirely. | |
707 | ||
708 | 5g) Clone Namespace | |
709 | ||
710 | A cloned namespace contains all the mounts as that of the parent | |
711 | namespace. | |
712 | ||
713 | Lets say 'A' and 'B' are the corresponding mounts in the parent and the | |
714 | child namespace. | |
715 | ||
716 | If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to | |
717 | each other. | |
718 | ||
719 | If 'A' is a slave mount of 'Z', then 'B' is also the slave mount of | |
720 | 'Z'. | |
721 | ||
722 | If 'A' is a private mount, then 'B' is a private mount too. | |
723 | ||
724 | If 'A' is unbindable mount, then 'B' is a unbindable mount too. | |
725 | ||
726 | ||
727 | 6) Quiz | |
728 | ||
729 | A. What is the result of the following command sequence? | |
730 | ||
731 | mount --bind /mnt /mnt | |
732 | mount --make-shared /mnt | |
733 | mount --bind /mnt /tmp | |
734 | mount --move /tmp /mnt/1 | |
735 | ||
736 | what should the contents of /mnt, /mnt/1 and /mnt/1/1 be? | |
737 | Should they all be identical? or should /mnt and /mnt/1 be | |
738 | identical only? | |
739 | ||
740 | ||
741 | B. What is the result of the following command sequence? | |
742 | ||
743 | mount --make-rshared / | |
744 | mkdir -p /v/1 | |
745 | mount --rbind / /v/1 | |
746 | ||
747 | what should the content of /v/1/v/1 be? | |
748 | ||
749 | ||
750 | C. What is the result of the following command sequence? | |
751 | ||
752 | mount --bind /mnt /mnt | |
753 | mount --make-shared /mnt | |
754 | mkdir -p /mnt/1/2/3 /mnt/1/test | |
755 | mount --bind /mnt/1 /tmp | |
756 | mount --make-slave /mnt | |
757 | mount --make-shared /mnt | |
758 | mount --bind /mnt/1/2 /tmp1 | |
759 | mount --make-slave /mnt | |
760 | ||
761 | At this point we have the first mount at /tmp and | |
762 | its root dentry is 1. Lets call this mount 'A' | |
763 | And then we have a second mount at /tmp1 with root | |
764 | dentry 2. Lets call this mount 'B' | |
765 | Next we have a third mount at /mnt with root dentry | |
766 | mnt. Lets call this mount 'C' | |
767 | ||
768 | 'B' is the slave of 'A' and 'C' is a slave of 'B' | |
769 | A -> B -> C | |
770 | ||
771 | at this point if we execute the following command | |
772 | ||
773 | mount --bind /bin /tmp/test | |
774 | ||
775 | The mount is attempted on 'A' | |
776 | ||
777 | will the mount propagate to 'B' and 'C' ? | |
778 | ||
779 | what would the contents of | |
780 | /mnt/1/test be? | |
781 | ||
782 | 7) FAQ | |
783 | ||
784 | Q1. Why is bind mount needed? How is it different from symbolic links? | |
785 | symbolic links can get stale if the destination mount gets | |
786 | unmounted or moved. Bind mounts continue to exist even if the | |
787 | other mount is unmounted or moved. | |
788 | ||
789 | Q2. Why can't the shared subtree be implemented using exportfs? | |
790 | ||
791 | exportfs is a heavyweight way of accomplishing part of what | |
792 | shared subtree can do. I cannot imagine a way to implement the | |
793 | semantics of slave mount using exportfs. | |
794 | ||
795 | Q3. Why is unbindable mount needed? | |
796 | ||
797 | Lets say we want to replicate the mount tree at multiple | |
798 | locations within the same subtree. | |
799 | ||
800 | if one rbind mounts a tree within the same subtree 'n' times | |
801 | the number of mounts created is an exponential function of 'n'. | |
802 | Having unbindable mount can help prune the unneeded bind | |
803 | mounts. Here is an example. | |
804 | ||
805 | step 1: | |
806 | lets say the root tree has just two directories with | |
807 | one vfsmount. | |
808 | root | |
809 | / \ | |
810 | tmp usr | |
811 | ||
812 | And we want to replicate the tree at multiple | |
813 | mountpoints under /root/tmp | |
814 | ||
815 | step2: | |
816 | mount --make-shared /root | |
817 | ||
818 | mkdir -p /tmp/m1 | |
819 | ||
820 | mount --rbind /root /tmp/m1 | |
821 | ||
822 | the new tree now looks like this: | |
823 | ||
824 | root | |
825 | / \ | |
826 | tmp usr | |
827 | / | |
828 | m1 | |
829 | / \ | |
830 | tmp usr | |
831 | / | |
832 | m1 | |
833 | ||
834 | it has two vfsmounts | |
835 | ||
836 | step3: | |
837 | mkdir -p /tmp/m2 | |
838 | mount --rbind /root /tmp/m2 | |
839 | ||
840 | the new tree now looks like this: | |
841 | ||
842 | root | |
843 | / \ | |
844 | tmp usr | |
845 | / \ | |
846 | m1 m2 | |
847 | / \ / \ | |
848 | tmp usr tmp usr | |
849 | / \ / | |
850 | m1 m2 m1 | |
851 | / \ / \ | |
852 | tmp usr tmp usr | |
853 | / / \ | |
854 | m1 m1 m2 | |
855 | / \ | |
856 | tmp usr | |
857 | / \ | |
858 | m1 m2 | |
859 | ||
860 | it has 6 vfsmounts | |
861 | ||
862 | step 4: | |
863 | mkdir -p /tmp/m3 | |
864 | mount --rbind /root /tmp/m3 | |
865 | ||
866 | I won't draw the tree... but it has 24 vfsmounts | |
867 | ||
868 | ||
869 | at step i the number of vfsmounts is V[i] = i*V[i-1]. | |
870 | This is an exponential function. And this tree has way more | |
871 | mounts than what we really needed in the first place. | |
872 | ||
873 | One could use a series of umount at each step to prune | |
874 | out the unneeded mounts. But there is a better solution. | |
875 | Unbindable mounts come in handy here. | |
876 | ||
877 | step 1: | |
878 | lets say the root tree has just two directories with | |
879 | one vfsmount. | |
880 | root | |
881 | / \ | |
882 | tmp usr | |
883 | ||
884 | How do we set up the same tree at multiple locations under | |
885 | /root/tmp? | |
886 | ||
887 | step2: | |
888 | mount --bind /root/tmp /root/tmp | |
889 | ||
890 | mount --make-rshared /root | |
891 | mount --make-unbindable /root/tmp | |
892 | ||
893 | mkdir -p /tmp/m1 | |
894 | ||
895 | mount --rbind /root /tmp/m1 | |
896 | ||
897 | the new tree now looks like this: | |
898 | ||
899 | root | |
900 | / \ | |
901 | tmp usr | |
902 | / | |
903 | m1 | |
904 | / \ | |
905 | tmp usr | |
906 | ||
907 | step3: | |
908 | mkdir -p /tmp/m2 | |
909 | mount --rbind /root /tmp/m2 | |
910 | ||
911 | the new tree now looks like this: | |
912 | ||
913 | root | |
914 | / \ | |
915 | tmp usr | |
916 | / \ | |
917 | m1 m2 | |
918 | / \ / \ | |
919 | tmp usr tmp usr | |
920 | ||
921 | step4: | |
922 | ||
923 | mkdir -p /tmp/m3 | |
924 | mount --rbind /root /tmp/m3 | |
925 | ||
926 | the new tree now looks like this: | |
927 | ||
928 | root | |
929 | / \ | |
930 | tmp usr | |
931 | / \ \ | |
932 | m1 m2 m3 | |
933 | / \ / \ / \ | |
934 | tmp usr tmp usr tmp usr | |
935 | ||
936 | 8) Implementation | |
937 | ||
938 | 8A) Datastructure | |
939 | ||
940 | 4 new fields are introduced to struct vfsmount | |
941 | ->mnt_share | |
942 | ->mnt_slave_list | |
943 | ->mnt_slave | |
944 | ->mnt_master | |
945 | ||
fa00e7e1 | 946 | ->mnt_share links together all the mounts to/from which this vfsmount |
9cfcceea RP |
947 | sends/receives propagation events. |
948 | ||
949 | ->mnt_slave_list links all the mounts to which this vfsmount | |
950 | propagates. | |
951 | ||
fa00e7e1 | 952 | ->mnt_slave links together all the slaves that its master vfsmount |
9cfcceea RP |
953 | propagates to. |
954 | ||
955 | ->mnt_master points to the master vfsmount from which this vfsmount | |
956 | receives propagation. | |
957 | ||
958 | ->mnt_flags takes two more flags to indicate the propagation status of | |
959 | the vfsmount. MNT_SHARED indicates that the vfsmount is a shared | |
960 | vfsmount. MNT_UNBINDABLE indicates that the vfsmount cannot be | |
961 | replicated. | |
962 | ||
963 | All the shared vfsmounts in a peer group form a cyclic list through | |
964 | ->mnt_share. | |
965 | ||
966 | All vfsmounts with the same ->mnt_master form a cyclic list anchored | |
967 | in ->mnt_master->mnt_slave_list and going through ->mnt_slave. | |
968 | ||
969 | ->mnt_master can point to arbitrary (and possibly different) members | |
970 | of master peer group. To find all immediate slaves of a peer group | |
971 | you need to go through _all_ ->mnt_slave_list of its members. | |
972 | Conceptually it's just a single set - distribution among the | |
973 | individual lists does not affect propagation or the way propagation | |
974 | tree is modified by operations. | |
975 | ||
976 | An example propagation tree looks as shown in the figure below. | |
977 | [ NOTE: Though it looks like a forest, if we consider all the shared | |
978 | mounts as a conceptual entity called 'pnode', it becomes a tree] | |
979 | ||
980 | ||
981 | A <--> B <--> C <---> D | |
982 | /|\ /| |\ | |
983 | / F G J K H I | |
984 | / | |
985 | E<-->K | |
986 | /|\ | |
987 | M L N | |
988 | ||
989 | In the above figure A, B, C and D are all shared and propagate to each | |
990 | other. 'A' has got 3 slave mounts 'E', 'F' and 'G'; 'C' has got 2 slave | |
991 | mounts 'J' and 'K'; and 'D' has got two slave mounts 'H' and 'I'. | |
992 | 'E' is also shared with 'K' and they propagate to each other. And | |
993 | 'K' has 3 slaves 'M', 'L' and 'N'. | |
994 | ||
995 | A's ->mnt_share links with the ->mnt_share of 'B', 'C' and 'D' | |
996 | ||
997 | A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' | |
998 | ||
999 | E's ->mnt_share links with ->mnt_share of K | |
1000 | 'E', 'K', 'F', 'G' have their ->mnt_master point to struct | |
1001 | vfsmount of 'A' | |
1002 | 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' | |
1003 | K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' | |
1004 | ||
1005 | C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' | |
1006 | J and K's ->mnt_master points to struct vfsmount of C | |
1007 | and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' | |
1008 | 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. | |
1009 | ||
1010 | ||
1011 | NOTE: The propagation tree is orthogonal to the mount tree. | |
1012 | ||
1013 | ||
1014 | 8B) Algorithm: | |
1015 | ||
1016 | The crux of the implementation resides in rbind/move operation. | |
1017 | ||
1018 | The overall algorithm breaks the operation into 3 phases: (look at | |
1019 | attach_recursive_mnt() and propagate_mnt()) | |
1020 | ||
1021 | 1. prepare phase. | |
1022 | 2. commit phase. | |
1023 | 3. abort phase. | |
1024 | ||
1025 | Prepare phase: | |
1026 | ||
1027 | for each mount in the source tree: | |
1028 | a) Create the necessary number of mount trees to | |
1029 | be attached to each of the mounts that receive | |
1030 | propagation from the destination mount. | |
1031 | b) Do not attach any of the trees to its destination. | |
1032 | However note down its ->mnt_parent and ->mnt_mountpoint | |
1033 | c) Link all the new mounts to form a propagation tree that | |
1034 | is identical to the propagation tree of the destination | |
1035 | mount. | |
1036 | ||
1037 | If this phase is successful, there should be 'n' new | |
1038 | propagation trees; where 'n' is the number of mounts in the | |
1039 | source tree. Go to the commit phase. | |
1040 | ||
1041 | Also there should be 'm' new mount trees, where 'm' is | |
1042 | the number of mounts to which the destination mount | |
1043 | propagates. | |
1044 | ||
1045 | if any memory allocations fail, go to the abort phase. | |
1046 | ||
1047 | Commit phase | |
1048 | attach each of the mount trees to their corresponding | |
1049 | destination mounts. | |
1050 | ||
1051 | Abort phase | |
1052 | delete all the newly created trees. | |
1053 | ||
1054 | NOTE: all the propagation related functionality resides in the file | |
1055 | pnode.c | |
1056 | ||
1057 | ||
1058 | ------------------------------------------------------------------------ | |
1059 | ||
1060 | version 0.1 (created the initial document, Ram Pai [email protected]) | |
1061 | version 0.2 (Incorporated comments from Al Viro) |