]>
Commit | Line | Data |
---|---|---|
9cc6fc50 DH |
1 | /* Handle fileserver selection and rotation. |
2 | * | |
3 | * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. | |
4 | * Written by David Howells ([email protected]) | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public Licence | |
8 | * as published by the Free Software Foundation; either version | |
9 | * 2 of the Licence, or (at your option) any later version. | |
10 | */ | |
11 | ||
12 | #include <linux/kernel.h> | |
13 | #include <linux/slab.h> | |
d2ddc776 DH |
14 | #include <linux/fs.h> |
15 | #include <linux/sched.h> | |
16 | #include <linux/delay.h> | |
17 | #include <linux/sched/signal.h> | |
9cc6fc50 | 18 | #include "internal.h" |
d2ddc776 | 19 | #include "afs_fs.h" |
9cc6fc50 DH |
20 | |
21 | /* | |
22 | * Initialise a filesystem server cursor for iterating over FS servers. | |
23 | */ | |
fe342cf7 | 24 | static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) |
9cc6fc50 DH |
25 | { |
26 | memset(fc, 0, sizeof(*fc)); | |
27 | } | |
28 | ||
d2ddc776 DH |
29 | /* |
30 | * Begin an operation on the fileserver. | |
31 | * | |
32 | * Fileserver operations are serialised on the server by vnode, so we serialise | |
33 | * them here also using the io_lock. | |
34 | */ | |
35 | bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode, | |
36 | struct key *key) | |
37 | { | |
38 | afs_init_fs_cursor(fc, vnode); | |
39 | fc->vnode = vnode; | |
40 | fc->key = key; | |
41 | fc->ac.error = SHRT_MAX; | |
42 | ||
43 | if (mutex_lock_interruptible(&vnode->io_lock) < 0) { | |
44 | fc->ac.error = -EINTR; | |
45 | fc->flags |= AFS_FS_CURSOR_STOP; | |
46 | return false; | |
47 | } | |
48 | ||
0fafdc9f | 49 | if (vnode->lock_state != AFS_VNODE_LOCK_NONE) |
d2ddc776 DH |
50 | fc->flags |= AFS_FS_CURSOR_CUR_ONLY; |
51 | return true; | |
52 | } | |
53 | ||
54 | /* | |
55 | * Begin iteration through a server list, starting with the vnode's last used | |
56 | * server if possible, or the last recorded good server if not. | |
57 | */ | |
58 | static bool afs_start_fs_iteration(struct afs_fs_cursor *fc, | |
59 | struct afs_vnode *vnode) | |
60 | { | |
61 | struct afs_cb_interest *cbi; | |
62 | int i; | |
63 | ||
64 | read_lock(&vnode->volume->servers_lock); | |
65 | fc->server_list = afs_get_serverlist(vnode->volume->servers); | |
66 | read_unlock(&vnode->volume->servers_lock); | |
67 | ||
68 | cbi = vnode->cb_interest; | |
69 | if (cbi) { | |
70 | /* See if the vnode's preferred record is still available */ | |
71 | for (i = 0; i < fc->server_list->nr_servers; i++) { | |
72 | if (fc->server_list->servers[i].cb_interest == cbi) { | |
73 | fc->start = i; | |
74 | goto found_interest; | |
75 | } | |
76 | } | |
77 | ||
78 | /* If we have a lock outstanding on a server that's no longer | |
79 | * serving this vnode, then we can't switch to another server | |
80 | * and have to return an error. | |
81 | */ | |
82 | if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { | |
83 | fc->ac.error = -ESTALE; | |
84 | return false; | |
85 | } | |
86 | ||
87 | /* Note that the callback promise is effectively broken */ | |
88 | write_seqlock(&vnode->cb_lock); | |
89 | ASSERTCMP(cbi, ==, vnode->cb_interest); | |
90 | vnode->cb_interest = NULL; | |
91 | if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) | |
92 | vnode->cb_break++; | |
93 | write_sequnlock(&vnode->cb_lock); | |
94 | ||
95 | afs_put_cb_interest(afs_v2net(vnode), cbi); | |
96 | cbi = NULL; | |
97 | } else { | |
98 | fc->start = READ_ONCE(fc->server_list->index); | |
99 | } | |
100 | ||
101 | found_interest: | |
102 | fc->index = fc->start; | |
103 | return true; | |
104 | } | |
105 | ||
106 | /* | |
107 | * Post volume busy note. | |
108 | */ | |
109 | static void afs_busy(struct afs_volume *volume, u32 abort_code) | |
110 | { | |
111 | const char *m; | |
112 | ||
113 | switch (abort_code) { | |
114 | case VOFFLINE: m = "offline"; break; | |
115 | case VRESTARTING: m = "restarting"; break; | |
116 | case VSALVAGING: m = "being salvaged"; break; | |
117 | default: m = "busy"; break; | |
118 | } | |
0fafdc9f | 119 | |
d2ddc776 DH |
120 | pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m); |
121 | } | |
122 | ||
123 | /* | |
124 | * Sleep and retry the operation to the same fileserver. | |
125 | */ | |
126 | static bool afs_sleep_and_retry(struct afs_fs_cursor *fc) | |
127 | { | |
128 | msleep_interruptible(1000); | |
129 | if (signal_pending(current)) { | |
130 | fc->ac.error = -ERESTARTSYS; | |
131 | return false; | |
132 | } | |
133 | ||
134 | return true; | |
135 | } | |
136 | ||
137 | /* | |
138 | * Select the fileserver to use. May be called multiple times to rotate | |
139 | * through the fileservers. | |
140 | */ | |
141 | bool afs_select_fileserver(struct afs_fs_cursor *fc) | |
142 | { | |
143 | struct afs_addr_list *alist; | |
144 | struct afs_server *server; | |
145 | struct afs_vnode *vnode = fc->vnode; | |
146 | ||
147 | _enter("%u/%u,%u/%u,%d,%d", | |
148 | fc->index, fc->start, | |
149 | fc->ac.index, fc->ac.start, | |
150 | fc->ac.error, fc->ac.abort_code); | |
151 | ||
152 | if (fc->flags & AFS_FS_CURSOR_STOP) { | |
153 | _leave(" = f [stopped]"); | |
154 | return false; | |
155 | } | |
156 | ||
157 | /* Evaluate the result of the previous operation, if there was one. */ | |
158 | switch (fc->ac.error) { | |
159 | case SHRT_MAX: | |
160 | goto start; | |
161 | ||
162 | case 0: | |
163 | default: | |
164 | /* Success or local failure. Stop. */ | |
165 | fc->flags |= AFS_FS_CURSOR_STOP; | |
166 | _leave(" = f [okay/local %d]", fc->ac.error); | |
167 | return false; | |
168 | ||
169 | case -ECONNABORTED: | |
170 | /* The far side rejected the operation on some grounds. This | |
171 | * might involve the server being busy or the volume having been moved. | |
172 | */ | |
173 | switch (fc->ac.abort_code) { | |
174 | case VNOVOL: | |
175 | /* This fileserver doesn't know about the volume. | |
176 | * - May indicate that the VL is wrong - retry once and compare | |
177 | * the results. | |
178 | * - May indicate that the fileserver couldn't attach to the vol. | |
179 | */ | |
180 | if (fc->flags & AFS_FS_CURSOR_VNOVOL) { | |
181 | fc->ac.error = -EREMOTEIO; | |
3d9fa911 | 182 | goto next_server; |
d2ddc776 DH |
183 | } |
184 | ||
185 | write_lock(&vnode->volume->servers_lock); | |
186 | fc->server_list->vnovol_mask |= 1 << fc->index; | |
187 | write_unlock(&vnode->volume->servers_lock); | |
188 | ||
189 | set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); | |
190 | fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); | |
191 | if (fc->ac.error < 0) | |
192 | goto failed; | |
193 | ||
194 | if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) { | |
195 | fc->ac.error = -ENOMEDIUM; | |
196 | goto failed; | |
197 | } | |
198 | ||
199 | /* If the server list didn't change, then assume that | |
200 | * it's the fileserver having trouble. | |
201 | */ | |
202 | if (vnode->volume->servers == fc->server_list) { | |
203 | fc->ac.error = -EREMOTEIO; | |
3d9fa911 | 204 | goto next_server; |
d2ddc776 DH |
205 | } |
206 | ||
207 | /* Try again */ | |
208 | fc->flags |= AFS_FS_CURSOR_VNOVOL; | |
209 | _leave(" = t [vnovol]"); | |
210 | return true; | |
211 | ||
212 | case VSALVAGE: /* TODO: Should this return an error or iterate? */ | |
213 | case VVOLEXISTS: | |
214 | case VNOSERVICE: | |
215 | case VONLINE: | |
216 | case VDISKFULL: | |
217 | case VOVERQUOTA: | |
218 | fc->ac.error = afs_abort_to_error(fc->ac.abort_code); | |
219 | goto next_server; | |
220 | ||
221 | case VOFFLINE: | |
222 | if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) { | |
223 | afs_busy(vnode->volume, fc->ac.abort_code); | |
224 | clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); | |
225 | } | |
226 | if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { | |
227 | fc->ac.error = -EADV; | |
228 | goto failed; | |
229 | } | |
230 | if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { | |
231 | fc->ac.error = -ESTALE; | |
232 | goto failed; | |
233 | } | |
234 | goto busy; | |
235 | ||
236 | case VSALVAGING: | |
237 | case VRESTARTING: | |
238 | case VBUSY: | |
239 | /* Retry after going round all the servers unless we | |
240 | * have a file lock we need to maintain. | |
241 | */ | |
242 | if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) { | |
243 | fc->ac.error = -EBUSY; | |
244 | goto failed; | |
245 | } | |
246 | if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) { | |
247 | afs_busy(vnode->volume, fc->ac.abort_code); | |
248 | clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); | |
249 | } | |
250 | busy: | |
251 | if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) { | |
252 | if (!afs_sleep_and_retry(fc)) | |
253 | goto failed; | |
254 | ||
255 | /* Retry with same server & address */ | |
256 | _leave(" = t [vbusy]"); | |
257 | return true; | |
258 | } | |
259 | ||
260 | fc->flags |= AFS_FS_CURSOR_VBUSY; | |
261 | goto next_server; | |
262 | ||
263 | case VMOVED: | |
264 | /* The volume migrated to another server. We consider | |
265 | * consider all locks and callbacks broken and request | |
266 | * an update from the VLDB. | |
267 | * | |
268 | * We also limit the number of VMOVED hops we will | |
269 | * honour, just in case someone sets up a loop. | |
270 | */ | |
271 | if (fc->flags & AFS_FS_CURSOR_VMOVED) { | |
272 | fc->ac.error = -EREMOTEIO; | |
273 | goto failed; | |
274 | } | |
275 | fc->flags |= AFS_FS_CURSOR_VMOVED; | |
276 | ||
277 | set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags); | |
278 | set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags); | |
279 | fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); | |
280 | if (fc->ac.error < 0) | |
281 | goto failed; | |
282 | ||
283 | /* If the server list didn't change, then the VLDB is | |
284 | * out of sync with the fileservers. This is hopefully | |
285 | * a temporary condition, however, so we don't want to | |
286 | * permanently block access to the file. | |
287 | * | |
288 | * TODO: Try other fileservers if we can. | |
289 | * | |
290 | * TODO: Retry a few times with sleeps. | |
291 | */ | |
292 | if (vnode->volume->servers == fc->server_list) { | |
293 | fc->ac.error = -ENOMEDIUM; | |
294 | goto failed; | |
295 | } | |
296 | ||
297 | goto restart_from_beginning; | |
298 | ||
299 | default: | |
300 | clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags); | |
301 | clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags); | |
302 | fc->ac.error = afs_abort_to_error(fc->ac.abort_code); | |
303 | goto failed; | |
304 | } | |
305 | ||
306 | case -ENETUNREACH: | |
307 | case -EHOSTUNREACH: | |
308 | case -ECONNREFUSED: | |
309 | case -ETIMEDOUT: | |
310 | case -ETIME: | |
311 | _debug("no conn"); | |
312 | goto iterate_address; | |
1a025028 DH |
313 | |
314 | case -ECONNRESET: | |
315 | _debug("call reset"); | |
316 | goto failed; | |
d2ddc776 DH |
317 | } |
318 | ||
319 | restart_from_beginning: | |
320 | _debug("restart"); | |
321 | afs_end_cursor(&fc->ac); | |
322 | afs_put_cb_interest(afs_v2net(vnode), fc->cbi); | |
323 | fc->cbi = NULL; | |
324 | afs_put_serverlist(afs_v2net(vnode), fc->server_list); | |
325 | fc->server_list = NULL; | |
326 | start: | |
327 | _debug("start"); | |
328 | /* See if we need to do an update of the volume record. Note that the | |
329 | * volume may have moved or even have been deleted. | |
330 | */ | |
331 | fc->ac.error = afs_check_volume_status(vnode->volume, fc->key); | |
332 | if (fc->ac.error < 0) | |
333 | goto failed; | |
334 | ||
335 | if (!afs_start_fs_iteration(fc, vnode)) | |
336 | goto failed; | |
d2ddc776 DH |
337 | |
338 | use_server: | |
339 | _debug("use"); | |
340 | /* We're starting on a different fileserver from the list. We need to | |
341 | * check it, create a callback intercept, find its address list and | |
342 | * probe its capabilities before we use it. | |
343 | */ | |
344 | ASSERTCMP(fc->ac.alist, ==, NULL); | |
345 | server = fc->server_list->servers[fc->index].server; | |
346 | ||
347 | if (!afs_check_server_record(fc, server)) | |
348 | goto failed; | |
349 | ||
350 | _debug("USING SERVER: %pU", &server->uuid); | |
351 | ||
352 | /* Make sure we've got a callback interest record for this server. We | |
353 | * have to link it in before we send the request as we can be sent a | |
354 | * break request before we've finished decoding the reply and | |
355 | * installing the vnode. | |
356 | */ | |
d4a96bec DH |
357 | fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list, |
358 | fc->index); | |
d2ddc776 DH |
359 | if (fc->ac.error < 0) |
360 | goto failed; | |
361 | ||
362 | fc->cbi = afs_get_cb_interest(vnode->cb_interest); | |
363 | ||
364 | read_lock(&server->fs_lock); | |
365 | alist = rcu_dereference_protected(server->addresses, | |
366 | lockdep_is_held(&server->fs_lock)); | |
367 | afs_get_addrlist(alist); | |
368 | read_unlock(&server->fs_lock); | |
369 | ||
8305e579 | 370 | memset(&fc->ac, 0, sizeof(fc->ac)); |
d2ddc776 DH |
371 | |
372 | /* Probe the current fileserver if we haven't done so yet. */ | |
373 | if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { | |
374 | fc->ac.alist = afs_get_addrlist(alist); | |
375 | ||
ec5a3b4b DH |
376 | if (!afs_probe_fileserver(fc)) { |
377 | switch (fc->ac.error) { | |
378 | case -ENOMEM: | |
379 | case -ERESTARTSYS: | |
380 | case -EINTR: | |
381 | goto failed; | |
382 | default: | |
383 | goto next_server; | |
384 | } | |
385 | } | |
d2ddc776 DH |
386 | } |
387 | ||
388 | if (!fc->ac.alist) | |
389 | fc->ac.alist = alist; | |
390 | else | |
391 | afs_put_addrlist(alist); | |
392 | ||
d2ddc776 DH |
393 | fc->ac.start = READ_ONCE(alist->index); |
394 | fc->ac.index = fc->ac.start; | |
d2ddc776 DH |
395 | |
396 | iterate_address: | |
397 | ASSERT(fc->ac.alist); | |
398 | _debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs); | |
399 | /* Iterate over the current server's address list to try and find an | |
400 | * address on which it will respond to us. | |
401 | */ | |
fe4d774c DH |
402 | if (!afs_iterate_addresses(&fc->ac)) |
403 | goto next_server; | |
d2ddc776 | 404 | |
fe4d774c DH |
405 | _leave(" = t"); |
406 | return true; | |
d2ddc776 | 407 | |
16280a15 DH |
408 | next_server: |
409 | _debug("next"); | |
410 | afs_end_cursor(&fc->ac); | |
411 | afs_put_cb_interest(afs_v2net(vnode), fc->cbi); | |
412 | fc->cbi = NULL; | |
413 | fc->index++; | |
414 | if (fc->index >= fc->server_list->nr_servers) | |
415 | fc->index = 0; | |
416 | if (fc->index != fc->start) | |
417 | goto use_server; | |
418 | ||
419 | /* That's all the servers poked to no good effect. Try again if some | |
420 | * of them were busy. | |
421 | */ | |
422 | if (fc->flags & AFS_FS_CURSOR_VBUSY) | |
423 | goto restart_from_beginning; | |
424 | ||
425 | fc->ac.error = -EDESTADDRREQ; | |
426 | goto failed; | |
427 | ||
d2ddc776 DH |
428 | failed: |
429 | fc->flags |= AFS_FS_CURSOR_STOP; | |
fe4d774c | 430 | afs_end_cursor(&fc->ac); |
d2ddc776 DH |
431 | _leave(" = f [failed %d]", fc->ac.error); |
432 | return false; | |
433 | } | |
434 | ||
/*
 * Select the same fileserver we used for a vnode before and only that
 * fileserver.  We use this when we have a lock on that file, which is backed
 * only by the fileserver we obtained it from.
 *
 * The value in fc->ac.error determines what happens:
 *  - SHRT_MAX: first call; the address list of the server behind the vnode's
 *    callback interest is loaded into the cursor.  If the vnode has no
 *    callback interest, or the server has no address list, the error is set
 *    to -ESTALE, AFS_FS_CURSOR_STOP is set and false is returned.
 *  - -ENETUNREACH, -EHOSTUNREACH, -ECONNREFUSED, -ETIMEDOUT, -ETIME: try the
 *    next address.
 *  - -ECONNABORTED or anything else (including 0): set AFS_FS_CURSOR_STOP and
 *    return false.
 *
 * Returns true if there is an address to try, false otherwise.
 */
bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
{
	struct afs_vnode *vnode = fc->vnode;
	struct afs_cb_interest *cbi = vnode->cb_interest;
	struct afs_addr_list *addrs;

	_enter("");

	switch (fc->ac.error) {
	case -ENETUNREACH:
	case -EHOSTUNREACH:
	case -ECONNREFUSED:
	case -ETIMEDOUT:
	case -ETIME:
		_debug("no conn");
		break;

	case SHRT_MAX:
		if (!cbi)
			goto stale;

		fc->cbi = afs_get_cb_interest(vnode->cb_interest);

		read_lock(&cbi->server->fs_lock);
		addrs = rcu_dereference_protected(cbi->server->addresses,
						  lockdep_is_held(&cbi->server->fs_lock));
		afs_get_addrlist(addrs);
		read_unlock(&cbi->server->fs_lock);
		if (!addrs)
			goto stale;

		/* Point a freshly zeroed address cursor at the list. */
		memset(&fc->ac, 0, sizeof(fc->ac));
		fc->ac.alist = addrs;
		fc->ac.start = READ_ONCE(addrs->index);
		fc->ac.index = fc->ac.start;
		break;

	case -ECONNABORTED:
		fc->flags |= AFS_FS_CURSOR_STOP;
		_leave(" = f [abort]");
		return false;

	default:
		/* Success or local failure.  Stop. */
		fc->flags |= AFS_FS_CURSOR_STOP;
		_leave(" = f [okay/local %d]", fc->ac.error);
		return false;
	}

	/* Iterate over the current server's address list to try and find an
	 * address on which it will respond to us.
	 */
	if (!afs_iterate_addresses(&fc->ac)) {
		afs_end_cursor(&fc->ac);
		return false;
	}

	_leave(" = t");
	return true;

stale:
	fc->ac.error = -ESTALE;
	fc->flags |= AFS_FS_CURSOR_STOP;
	return false;
}
508 | ||
509 | /* | |
510 | * Tidy up a filesystem cursor and unlock the vnode. | |
511 | */ | |
512 | int afs_end_vnode_operation(struct afs_fs_cursor *fc) | |
513 | { | |
514 | struct afs_net *net = afs_v2net(fc->vnode); | |
515 | int ret; | |
516 | ||
517 | mutex_unlock(&fc->vnode->io_lock); | |
518 | ||
519 | afs_end_cursor(&fc->ac); | |
520 | afs_put_cb_interest(net, fc->cbi); | |
521 | afs_put_serverlist(net, fc->server_list); | |
522 | ||
523 | ret = fc->ac.error; | |
524 | if (ret == -ECONNABORTED) | |
525 | afs_abort_to_error(fc->ac.abort_code); | |
526 | ||
527 | return fc->ac.error; | |
528 | } |