// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"

/*
 * Extent buffer locking
 * =====================
 *
 * The locks use a custom scheme that allows doing more operations than are
 * available from the current locking primitives. The building blocks are still
 * rwlock and wait queues.
 *
 * Required semantics:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - spinning lock semantics
 * - blocking lock semantics
 * - try-lock semantics for readers and writers
 * - one level nesting, allowing read lock to be taken by the same thread that
 *   already has a write lock
 *
 * The extent buffer locks (also called tree locks) manage access to eb data
 * related to the storage in the b-tree (keys, items, but not the individual
 * members of eb).
 * We want concurrency of many readers and safe updates. The underlying locking
 * is done by a read-write spinlock and the blocking part is implemented using
 * counters and wait queues.
 *
 * spinning semantics - the low-level rwlock is held so all other threads that
 *                      want to take it are spinning on it.
 *
 * blocking semantics - the low-level rwlock is not held but the counter
 *                      denotes how many times the blocking lock was held;
 *                      sleeping is possible
 *
 * A write lock always allows only one thread to access the data.
 *
 *
 * Debugging
 * ---------
 *
 * There are additional state counters that are asserted in various contexts,
 * removed from non-debug builds to reduce extent_buffer size and for
 * performance reasons.
 *
 *
 * Lock recursion
 * --------------
 *
 * A write operation on a tree might indirectly start a lookup on the same
 * tree. This can happen when btrfs_cow_block locks the tree and needs to
 * look up free extents.
 *
 * btrfs_cow_block
 *  ..
 *   alloc_tree_block_no_bg_flush
 *    btrfs_alloc_tree_block
 *     btrfs_reserve_extent
 *      ..
 *       load_free_space_cache
 *        ..
 *         btrfs_lookup_file_extent
 *          btrfs_search_slot
 *
 *
 * Locking pattern - spinning
 * --------------------------
 *
 * The simple locking scenario, the +--+ denotes the spinning section.
 *
 * +- btrfs_tree_lock
 * | - extent_buffer::rwlock is held
 * | - no heavy operations should happen, e.g. IO, memory allocations, large
 * |   structure traversals
 * +- btrfs_tree_unlock
 *
 *
 * Locking pattern - blocking
 * --------------------------
 *
 * The blocking write uses the following scheme. The +--+ denotes the spinning
 * section.
 *
 * +- btrfs_tree_lock
 * |
 * +- btrfs_set_lock_blocking_write
 *
 *   - allowed: IO, memory allocations, etc.
 *
 * -- btrfs_tree_unlock - note, no explicit unblocking necessary
 *
 *
 * Blocking read is similar.
 *
 * +- btrfs_tree_read_lock
 * |
 * +- btrfs_set_lock_blocking_read
 *
 *  - heavy operations allowed
 *
 * +- btrfs_tree_read_unlock_blocking
 * |
 * +- btrfs_tree_read_unlock
 *
 */

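/*
 * To make the blocking pattern above concrete, a blocking write section in
 * caller code looks roughly like the sketch below (illustrative only;
 * do_heavy_work() is a placeholder for anything that may sleep, such as IO or
 * memory allocation):
 *
 *   btrfs_tree_lock(eb);
 *   btrfs_set_lock_blocking_write(eb);
 *   do_heavy_work(eb);
 *   btrfs_tree_unlock(eb);
 *
 * btrfs_tree_unlock() handles both the spinning and the blocking case, which
 * is why no explicit "unblocking" call is needed on the write side.
 */
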
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
{
        WARN_ON(eb->spinning_writers);
        eb->spinning_writers++;
}

static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
{
        WARN_ON(eb->spinning_writers != 1);
        eb->spinning_writers--;
}

static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
{
        WARN_ON(eb->spinning_writers);
}

static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
{
        atomic_inc(&eb->spinning_readers);
}

static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
{
        WARN_ON(atomic_read(&eb->spinning_readers) == 0);
        atomic_dec(&eb->spinning_readers);
}

static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
{
        atomic_inc(&eb->read_locks);
}

static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
{
        atomic_dec(&eb->read_locks);
}

static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
{
        BUG_ON(!atomic_read(&eb->read_locks));
}

static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
{
        eb->write_locks++;
}

static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
{
        eb->write_locks--;
}

#else
static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
#endif

/*
 * Mark already held read lock as blocking. Can be nested in write lock by the
 * same thread.
 *
 * Use when there are potentially long operations ahead so other threads
 * waiting on the lock will not actively spin but sleep instead.
 *
 * The rwlock is released and the blocking reader counter is increased.
 */
void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
        trace_btrfs_set_lock_blocking_read(eb);
        /*
         * No lock is required. The lock owner may change if we have a read
         * lock, but it won't change to or away from us. If we have the write
         * lock, we are the owner and it'll never change.
         */
        if (eb->lock_recursed && current->pid == eb->lock_owner)
                return;
        btrfs_assert_tree_read_locked(eb);
        atomic_inc(&eb->blocking_readers);
        btrfs_assert_spinning_readers_put(eb);
        read_unlock(&eb->lock);
}

/*
 * Mark already held write lock as blocking.
 *
 * Use when there are potentially long operations ahead so other threads
 * waiting on the lock will not actively spin but sleep instead.
 *
 * The rwlock is released and blocking writers is set.
 */
void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
{
        trace_btrfs_set_lock_blocking_write(eb);
        /*
         * No lock is required. The lock owner may change if we have a read
         * lock, but it won't change to or away from us. If we have the write
         * lock, we are the owner and it'll never change.
         */
        if (eb->lock_recursed && current->pid == eb->lock_owner)
                return;
        if (eb->blocking_writers == 0) {
                btrfs_assert_spinning_writers_put(eb);
                btrfs_assert_tree_locked(eb);
                WRITE_ONCE(eb->blocking_writers, 1);
                write_unlock(&eb->lock);
        }
}

/*
 * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
 * Can be nested in write lock by the same thread.
 *
 * Use when the locked section does only lightweight actions and busy waiting
 * would be cheaper than making other threads do the wait/wake loop.
 *
 * The rwlock is held upon exit.
 */
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
                            bool recurse)
{
        u64 start_ns = 0;

        if (trace_btrfs_tree_read_lock_enabled())
                start_ns = ktime_get_ns();
again:
        read_lock(&eb->lock);
        BUG_ON(eb->blocking_writers == 0 &&
               current->pid == eb->lock_owner);
        if (eb->blocking_writers) {
                if (current->pid == eb->lock_owner) {
                        /*
                         * This extent is already write-locked by our thread.
                         * We allow an additional read lock to be added because
                         * it's for the same thread. btrfs_find_all_roots()
                         * depends on this as it may be called on a partly
                         * (write-)locked tree.
                         */
                        WARN_ON(!recurse);
                        BUG_ON(eb->lock_recursed);
                        eb->lock_recursed = true;
                        read_unlock(&eb->lock);
                        trace_btrfs_tree_read_lock(eb, start_ns);
                        return;
                }
                read_unlock(&eb->lock);
                wait_event(eb->write_lock_wq,
                           READ_ONCE(eb->blocking_writers) == 0);
                goto again;
        }
        btrfs_assert_tree_read_locks_get(eb);
        btrfs_assert_spinning_readers_get(eb);
        trace_btrfs_tree_read_lock(eb, start_ns);
}

void btrfs_tree_read_lock(struct extent_buffer *eb)
{
        __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
}

/*
 * Lock extent buffer for read, optimistically expecting that there are no
 * contending blocking writers. If there are, don't wait.
 *
 * Return 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
{
        if (READ_ONCE(eb->blocking_writers))
                return 0;

        read_lock(&eb->lock);
        /* Refetch value after lock */
        if (READ_ONCE(eb->blocking_writers)) {
                read_unlock(&eb->lock);
                return 0;
        }
        btrfs_assert_tree_read_locks_get(eb);
        btrfs_assert_spinning_readers_get(eb);
        trace_btrfs_tree_read_lock_atomic(eb);
        return 1;
}

/*
 * Try-lock for read. Don't block or wait for contending writers.
 *
 * Return 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
        if (READ_ONCE(eb->blocking_writers))
                return 0;

        if (!read_trylock(&eb->lock))
                return 0;

        /* Refetch value after lock */
        if (READ_ONCE(eb->blocking_writers)) {
                read_unlock(&eb->lock);
                return 0;
        }
        btrfs_assert_tree_read_locks_get(eb);
        btrfs_assert_spinning_readers_get(eb);
        trace_btrfs_try_tree_read_lock(eb);
        return 1;
}

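/*
 * A typical caller pattern for the try-lock above (illustrative sketch only):
 * attempt the cheap lock first and fall back to the waiting variant when a
 * blocking writer is in the way.
 *
 *   if (!btrfs_try_tree_read_lock(eb))
 *           btrfs_tree_read_lock(eb);
 *   ... lightweight read-only work ...
 *   btrfs_tree_read_unlock(eb);
 *
 * Both lock paths leave the rwlock held in spinning read mode, so the same
 * unlock is used either way.
 */
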
/*
 * Try-lock for write. May block until the lock is uncontended, but does not
 * wait until it is free.
 *
 * Return 1 if the rwlock has been taken, 0 otherwise
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
        if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
                return 0;

        write_lock(&eb->lock);
        /* Refetch value after lock */
        if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
                write_unlock(&eb->lock);
                return 0;
        }
        btrfs_assert_tree_write_locks_get(eb);
        btrfs_assert_spinning_writers_get(eb);
        eb->lock_owner = current->pid;
        trace_btrfs_try_tree_write_lock(eb);
        return 1;
}

/*
 * Release read lock. Must be used only if the lock is in spinning mode. If
 * the read lock is nested, the nested read lock must be released before the
 * write unlock.
 *
 * The rwlock is not held upon exit.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
        trace_btrfs_tree_read_unlock(eb);
        /*
         * If we're nested, we have the write lock. No new locking
         * is needed as long as we are the lock owner.
         * The write unlock will do a barrier for us, and the lock_recursed
         * field only matters to the lock owner.
         */
        if (eb->lock_recursed && current->pid == eb->lock_owner) {
                eb->lock_recursed = false;
                return;
        }
        btrfs_assert_tree_read_locked(eb);
        btrfs_assert_spinning_readers_put(eb);
        btrfs_assert_tree_read_locks_put(eb);
        read_unlock(&eb->lock);
}

/*
 * Release read lock, previously set to blocking by a pairing call to
 * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
 * thread.
 *
 * State of rwlock is unchanged, last reader wakes waiting threads.
 */
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
        trace_btrfs_tree_read_unlock_blocking(eb);
        /*
         * If we're nested, we have the write lock. No new locking
         * is needed as long as we are the lock owner.
         * The write unlock will do a barrier for us, and the lock_recursed
         * field only matters to the lock owner.
         */
        if (eb->lock_recursed && current->pid == eb->lock_owner) {
                eb->lock_recursed = false;
                return;
        }
        btrfs_assert_tree_read_locked(eb);
        WARN_ON(atomic_read(&eb->blocking_readers) == 0);
        /* atomic_dec_and_test implies a barrier */
        if (atomic_dec_and_test(&eb->blocking_readers))
                cond_wake_up_nomb(&eb->read_lock_wq);
        btrfs_assert_tree_read_locks_put(eb);
}

/*
 * Lock for write. Wait for all blocking and spinning readers and writers. This
 * starts a context in which a read lock can be nested by the same thread.
 *
 * The rwlock is held for write upon exit.
 */
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
        __acquires(&eb->lock)
{
        u64 start_ns = 0;

        if (trace_btrfs_tree_lock_enabled())
                start_ns = ktime_get_ns();

        WARN_ON(eb->lock_owner == current->pid);
again:
        wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
        wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
        write_lock(&eb->lock);
        /* Refetch value after lock */
        if (atomic_read(&eb->blocking_readers) ||
            READ_ONCE(eb->blocking_writers)) {
                write_unlock(&eb->lock);
                goto again;
        }
        btrfs_assert_spinning_writers_get(eb);
        btrfs_assert_tree_write_locks_get(eb);
        eb->lock_owner = current->pid;
        trace_btrfs_tree_lock(eb, start_ns);
}

void btrfs_tree_lock(struct extent_buffer *eb)
{
        __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}

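/*
 * Nesting sketch (illustrative only): a thread that already holds the write
 * lock may take one nested read lock, but per the recursion check in
 * __btrfs_tree_read_lock() only after the write lock has been made blocking:
 *
 *   btrfs_tree_lock(eb);
 *   btrfs_set_lock_blocking_write(eb);
 *   __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, true);
 *   ...
 *   btrfs_tree_read_unlock(eb);    - only clears lock_recursed
 *   btrfs_tree_unlock(eb);
 *
 * The nested read lock must be dropped before the write lock, as noted at
 * btrfs_tree_read_unlock().
 */
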
/*
 * Release the write lock, either blocking or spinning (i.e. there's no need
 * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
 * This also ends the context for nesting; the read lock must have been
 * released already.
 *
 * Tasks blocked and waiting are woken, rwlock is not held upon exit.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
        /*
         * This is read both locked and unlocked but always by the same thread
         * that already owns the lock so we don't need to use READ_ONCE
         */
        int blockers = eb->blocking_writers;

        BUG_ON(blockers > 1);

        btrfs_assert_tree_locked(eb);
        trace_btrfs_tree_unlock(eb);
        eb->lock_owner = 0;
        btrfs_assert_tree_write_locks_put(eb);

        if (blockers) {
                btrfs_assert_no_spinning_writers(eb);
                /* Unlocked write */
                WRITE_ONCE(eb->blocking_writers, 0);
                /*
                 * We need to order modifying blocking_writers above with
                 * actually waking up the sleepers to ensure they see the
                 * updated value of blocking_writers
                 */
                cond_wake_up(&eb->write_lock_wq);
        } else {
                btrfs_assert_spinning_writers_put(eb);
                write_unlock(&eb->lock);
        }
}

/*
 * Set all locked nodes in the path to blocking locks. This should be done
 * before scheduling
 */
void btrfs_set_path_blocking(struct btrfs_path *p)
{
        int i;

        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                if (!p->nodes[i] || !p->locks[i])
                        continue;
                /*
                 * If we currently have a spinning reader or writer lock this
                 * will bump the count of blocking holders and drop the
                 * spinlock.
                 */
                if (p->locks[i] == BTRFS_READ_LOCK) {
                        btrfs_set_lock_blocking_read(p->nodes[i]);
                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
                } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
                        btrfs_set_lock_blocking_write(p->nodes[i]);
                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
                }
        }
}

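/*
 * Illustrative use (sketch only; may_sleep_work() is a placeholder for any
 * operation that can schedule): a caller holding spinning locks on a path
 * converts them to blocking locks first, so contending threads sleep instead
 * of spinning while it is away:
 *
 *   btrfs_set_path_blocking(path);
 *   may_sleep_work();
 *
 * The locks are dropped later in the usual way, e.g. via btrfs_release_path().
 */
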
/*
 * This releases any locks held in the path starting at level and going all the
 * way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node. This ignores
 * those rules, and it should only be called when there are no more updates to
 * be done higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
        int i;

        if (path->keep_locks)
                return;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                if (!path->nodes[i])
                        continue;
                if (!path->locks[i])
                        continue;
                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
                path->locks[i] = 0;
        }
}

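/*
 * Illustrative use (sketch only): once a modification is confined to the leaf
 * and nothing above it will be touched anymore, the upper levels can be
 * unlocked early to reduce contention:
 *
 *   btrfs_unlock_up_safe(path, 1);
 */
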
/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with write lock held
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);
                btrfs_tree_lock(eb);
                if (eb == root->node)
                        break;
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
                                                  bool recurse)
{
        struct extent_buffer *eb;

        while (1) {
                eb = btrfs_root_node(root);
                __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
                if (eb == root->node)
                        break;
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
        return eb;
}

/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock. It's used in situations
 * where you want to provide A-B exclusion but not AA or BB.
 *
 * The current implementation gives priority to the reader. If a reader and a
 * writer both race to acquire their respective sides of the lock, the writer
 * yields its lock as soon as it detects a concurrent reader. Additionally, if
 * there are pending readers, no new writers are allowed to come in and
 * acquire the lock.
 */

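/*
 * Usage sketch (illustrative only; the two sides stand for any pair of
 * operation classes that must exclude each other but not themselves):
 *
 *   writer side                            reader side
 *   btrfs_drew_write_lock(lock);           btrfs_drew_read_lock(lock);
 *   ... class A operation ...              ... class B operation ...
 *   btrfs_drew_write_unlock(lock);         btrfs_drew_read_unlock(lock);
 *
 * Any number of writers may hold the write side concurrently, and likewise
 * for readers; only a mix of the two classes is excluded.
 */
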
int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
        int ret;

        ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
        if (ret)
                return ret;

        atomic_set(&lock->readers, 0);
        init_waitqueue_head(&lock->pending_readers);
        init_waitqueue_head(&lock->pending_writers);

        return 0;
}

void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
{
        percpu_counter_destroy(&lock->writers);
}

/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
        if (atomic_read(&lock->readers))
                return false;

        percpu_counter_inc(&lock->writers);

        /* Ensure writers count is updated before we check for pending readers */
        smp_mb();
        if (atomic_read(&lock->readers)) {
                btrfs_drew_write_unlock(lock);
                return false;
        }

        return true;
}

void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
        while (true) {
                if (btrfs_drew_try_write_lock(lock))
                        return;
                wait_event(lock->pending_writers, !atomic_read(&lock->readers));
        }
}

void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
        percpu_counter_dec(&lock->writers);
        cond_wake_up(&lock->pending_readers);
}

void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
        atomic_inc(&lock->readers);

        /*
         * Ensure the pending reader count is perceived BEFORE this reader
         * goes to sleep in case of active writers. This guarantees new writers
         * won't be allowed and that the current reader will be woken up when
         * the last active writer finishes its job.
         */
        smp_mb__after_atomic();

        wait_event(lock->pending_readers,
                   percpu_counter_sum(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
        /*
         * atomic_dec_and_test implies a full barrier, so woken up writers
         * are guaranteed to see the decrement
         */
        if (atomic_dec_and_test(&lock->readers))
                wake_up(&lock->pending_writers);
}