DOCUMENTATION FOR LOCKED COUNTERS (aka QemuLockCnt)
===================================================

QEMU often uses reference counts to track data structures that are being
accessed and should not be freed. For example, a loop that invokes
callbacks like this is not safe:

    QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
        if (ioh->revents & G_IO_OUT) {
            ioh->fd_write(ioh->opaque);
        }
    }

QLIST_FOREACH_SAFE protects against deletion of the current node (ioh)
by stashing away its "next" pointer. However, ioh->fd_write could
actually delete the next node from the list. The simplest way to
avoid this is to mark the node as deleted, and remove it from the
list in the above loop:

    QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
        if (ioh->deleted) {
            QLIST_REMOVE(ioh, next);
            g_free(ioh);
        } else {
            if (ioh->revents & G_IO_OUT) {
                ioh->fd_write(ioh->opaque);
            }
        }
    }

If, however, this loop must also be reentrant, i.e. it is possible that
ioh->fd_write invokes the loop again, some kind of counting is needed:

    walking_handlers++;
    QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
        if (ioh->deleted) {
            if (walking_handlers == 1) {
                QLIST_REMOVE(ioh, next);
                g_free(ioh);
            }
        } else {
            if (ioh->revents & G_IO_OUT) {
                ioh->fd_write(ioh->opaque);
            }
        }
    }
    walking_handlers--;

One may think of using the RCU primitives, rcu_read_lock() and
rcu_read_unlock(); effectively, the RCU nesting count would take
the place of the walking_handlers global variable. Indeed,
reference counting and RCU have similar purposes, but their usage in
general is complementary:

- reference counting is fine-grained and limited to a single data
  structure; RCU delays reclamation of *all* RCU-protected data
  structures;

- reference counting works even in the presence of code that keeps
  a reference for a long time; RCU critical sections in principle
  should be kept short;

- reference counting is often applied to code that is not thread-safe
  but is reentrant; in fact, usage of reference counting in QEMU predates
  the introduction of threads by many years. RCU is generally used to
  protect readers from other threads freeing memory after concurrent
  modifications to a data structure.

- reclaiming data can be done by a separate thread in the case of RCU;
  this can improve performance, but also delay reclamation undesirably.
  With reference counting, reclamation is deterministic.

This file documents QemuLockCnt, an abstraction for using reference
counting in code that has to be both thread-safe and reentrant.


QemuLockCnt concepts
--------------------

A QemuLockCnt comprises both a counter and a mutex; it has primitives
to increment and decrement the counter, and to take and release the
mutex. The counter notes how many visits to the data structures are
taking place (the visits could be from different threads, or there could
be multiple reentrant visits from the same thread). The basic rules
governing the counter/mutex pair then are the following:

- Data protected by the QemuLockCnt must not be freed unless the
  counter is zero and the mutex is taken.

- A new visit cannot be started while the counter is zero and the
  mutex is taken.

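Conceptually, then, a QemuLockCnt is nothing more than this pair. A
minimal sketch of a portable representation could look like the
following (this is an illustration only, not necessarily QEMU's actual
layout, which on some platforms packs both fields into a single word,
as discussed below):

    typedef struct QemuLockCnt {
        QemuMutex mutex;   /* taken around frees and, usually, writes */
        unsigned count;    /* number of visits currently in progress */
    } QemuLockCnt;
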
Most of the time, the mutex protects all writes to the data structure,
not just frees, though there could be cases where this is not necessary.

Reads, instead, can be done without taking the mutex, as long as the
readers and writers use the same macros that are used for RCU, for
example qatomic_rcu_read, qatomic_rcu_set, QLIST_FOREACH_RCU, etc. This
is because the reads are done outside a lock, and a set or
QLIST_INSERT_HEAD can happen concurrently with the read. The RCU API
ensures that the processor and the compiler see all required memory
barriers.

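As an illustration of this read/write pairing (a sketch, not code from
QEMU's sources; QLIST_INSERT_HEAD_RCU is the RCU-aware insertion macro
from include/qemu/rcu_queue.h):

    /* Writer, with the QemuLockCnt mutex held: publish a new node. */
    QLIST_INSERT_HEAD_RCU(&io_handlers, new_ioh, next);

    /* Reader, with the counter incremented but the mutex not held. */
    QLIST_FOREACH_RCU(ioh, &io_handlers, next) {
        /* ioh and its fields can be dereferenced safely here. */
    }
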
This counter-plus-mutex scheme could be implemented simply by
protecting the counter with the mutex, for example:

    // (1)
    qemu_mutex_lock(&walking_handlers_mutex);
    walking_handlers++;
    qemu_mutex_unlock(&walking_handlers_mutex);

    ...

    // (2)
    qemu_mutex_lock(&walking_handlers_mutex);
    if (--walking_handlers == 0) {
        QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
            if (ioh->deleted) {
                QLIST_REMOVE(ioh, next);
                g_free(ioh);
            }
        }
    }
    qemu_mutex_unlock(&walking_handlers_mutex);

Here, no frees can happen in the code represented by the ellipsis.
If another thread is executing critical section (2), that part of
the code cannot be entered, because the thread will not be able
to increment the walking_handlers variable. And of course
during the visit any other thread will see a nonzero value for
walking_handlers, as in the single-threaded code.

Note that it is possible for multiple concurrent accesses to delay
the cleanup arbitrarily; in other words, for the walking_handlers
counter to never become zero. For this reason, this technique is
more easily applicable if concurrent access to the structure is rare.

However, critical sections are easy to forget since you have to do
them for each modification of the counter. QemuLockCnt ensures that
all modifications of the counter take the lock appropriately, and it
can also be more efficient in two ways:

- it avoids taking the lock for many operations (for example
  incrementing the counter while it is non-zero);

- on some platforms, one can implement QemuLockCnt to hold the counter
  and the mutex in a single word, making the fast path no more expensive
  than simply managing a counter using atomic operations (see
  docs/devel/atomics.txt). This can be very helpful if concurrent
  access to the data structure is expected to be rare (see the sketch
  below).

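The following is a minimal sketch of that single-word idea, written
with C11 atomics rather than QEMU's own primitives, and with a layout
chosen purely for illustration (low bit: "locked" flag; remaining
bits: the counter). It is not QEMU's actual implementation:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define LOCK_BIT    1u
    #define COUNT_UNIT  2u  /* the counter occupies the bits above LOCK_BIT */

    /* Try to increment the counter without taking the mutex. This only
     * succeeds while the lock bit is clear and the counter is nonzero,
     * matching the rule that a new visit cannot start while the counter
     * is zero and the mutex is taken.
     */
    static bool lockcnt_inc_fast(atomic_uint *word)
    {
        unsigned old = atomic_load_explicit(word, memory_order_relaxed);
        while (!(old & LOCK_BIT) && (old >> 1) != 0) {
            if (atomic_compare_exchange_weak_explicit(
                    word, &old, old + COUNT_UNIT,
                    memory_order_acquire, memory_order_relaxed)) {
                return true;    /* fast path taken, no mutex involved */
            }
            /* A failed CAS reloaded "old"; re-check and retry. */
        }
        return false;           /* caller must fall back to the mutex */
    }
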
Using the same mutex for frees and writes can still incur some small
inefficiencies; for example, a visit can never start if the counter is
zero and the mutex is taken---even if the mutex is taken by a write,
which in principle need not block a visit of the data structure.
However, these are usually not a problem if any of the following
assumptions are valid:

- concurrent access is possible but rare

- writes are rare

- writes are frequent, but this kind of write (e.g. appending to a
  list) has a very small critical section.

For example, QEMU uses QemuLockCnt to manage an AioContext's list of
bottom halves and file descriptor handlers. Modifications to the list
of file descriptor handlers are rare. Creation of a new bottom half is
frequent and can happen on a fast path; however: 1) it is almost never
concurrent with a visit to the list of bottom halves; 2) it only has
three instructions in the critical path, two assignments and a smp_wmb().
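
For illustration, such a tiny critical section might look like the
following sketch (variable and field names here are hypothetical, not
QEMU's actual bottom-half code):

    qemu_lockcnt_lock(&bh_list_lockcnt);
    new_bh->next = first_bh;   /* assignment 1: link in the new node  */
    smp_wmb();                 /* publish the node's contents first   */
    first_bh = new_bh;         /* assignment 2: make the node visible */
    qemu_lockcnt_unlock(&bh_list_lockcnt);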


QemuLockCnt API
---------------

The QemuLockCnt API is described in include/qemu/thread.h.
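
For convenience, here is a paraphrased summary of the operations used
in this document (consult the header for the authoritative declarations
and documentation):

    void qemu_lockcnt_init(QemuLockCnt *lockcnt);
    void qemu_lockcnt_destroy(QemuLockCnt *lockcnt);

    void qemu_lockcnt_inc(QemuLockCnt *lockcnt);     /* start a visit  */
    void qemu_lockcnt_dec(QemuLockCnt *lockcnt);     /* end a visit    */

    void qemu_lockcnt_lock(QemuLockCnt *lockcnt);    /* take the mutex */
    void qemu_lockcnt_unlock(QemuLockCnt *lockcnt);  /* drop the mutex */

    /* Decrement the counter; if it reaches zero, take the mutex and
     * return true, otherwise return false.
     */
    bool qemu_lockcnt_dec_and_lock(QemuLockCnt *lockcnt);

    /* If the counter is 1, decrement it, take the mutex and return
     * true; otherwise leave the counter alone and return false.
     */
    bool qemu_lockcnt_dec_if_lock(QemuLockCnt *lockcnt);

    /* Increment the counter and release the mutex. */
    void qemu_lockcnt_inc_and_unlock(QemuLockCnt *lockcnt);

    /* Read the current counter value. */
    unsigned qemu_lockcnt_count(QemuLockCnt *lockcnt);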


QemuLockCnt usage
-----------------

This section explains the typical usage patterns for QemuLockCnt functions.

Setting a variable to a non-NULL value can be done between
qemu_lockcnt_lock and qemu_lockcnt_unlock:

    qemu_lockcnt_lock(&xyz_lockcnt);
    if (!xyz) {
        new_xyz = g_new(XYZ, 1);
        ...
        qatomic_rcu_set(&xyz, new_xyz);
    }
    qemu_lockcnt_unlock(&xyz_lockcnt);

Accessing the value can be done between qemu_lockcnt_inc and
qemu_lockcnt_dec:

    qemu_lockcnt_inc(&xyz_lockcnt);
    if (xyz) {
        XYZ *p = qatomic_rcu_read(&xyz);
        ...
        /* Accesses can now be done through "p". */
    }
    qemu_lockcnt_dec(&xyz_lockcnt);

Freeing the object can similarly use qemu_lockcnt_lock and
qemu_lockcnt_unlock, but you also need to ensure that the count
is zero (i.e. there is no concurrent visit). Because qemu_lockcnt_inc
takes the QemuLockCnt's lock, the count cannot become non-zero while
the object is being freed. Freeing an object looks like this:

    qemu_lockcnt_lock(&xyz_lockcnt);
    if (!qemu_lockcnt_count(&xyz_lockcnt)) {
        g_free(xyz);
        xyz = NULL;
    }
    qemu_lockcnt_unlock(&xyz_lockcnt);

If an object has to be freed right after a visit, you can combine
the decrement, the locking and the check on count as follows:

    qemu_lockcnt_inc(&xyz_lockcnt);
    if (xyz) {
        XYZ *p = qatomic_rcu_read(&xyz);
        ...
        /* Accesses can now be done through "p". */
    }
    if (qemu_lockcnt_dec_and_lock(&xyz_lockcnt)) {
        g_free(xyz);
        xyz = NULL;
        qemu_lockcnt_unlock(&xyz_lockcnt);
    }

QemuLockCnt can also be used to access a list as follows:

    qemu_lockcnt_inc(&io_handlers_lockcnt);
    QLIST_FOREACH_RCU(ioh, &io_handlers, next) {
        if (ioh->revents & G_IO_OUT) {
            ioh->fd_write(ioh->opaque);
        }
    }

    if (qemu_lockcnt_dec_and_lock(&io_handlers_lockcnt)) {
        QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
            if (ioh->deleted) {
                QLIST_REMOVE(ioh, next);
                g_free(ioh);
            }
        }
        qemu_lockcnt_unlock(&io_handlers_lockcnt);
    }

Again, the RCU primitives are used because new items can be added to the
list during the walk. QLIST_FOREACH_RCU ensures that the processor and
the compiler see the appropriate memory barriers.

An alternative pattern uses qemu_lockcnt_dec_if_lock:

    qemu_lockcnt_inc(&io_handlers_lockcnt);
    QLIST_FOREACH_SAFE_RCU(ioh, &io_handlers, next, pioh) {
        if (ioh->deleted) {
            if (qemu_lockcnt_dec_if_lock(&io_handlers_lockcnt)) {
                QLIST_REMOVE(ioh, next);
                g_free(ioh);
                qemu_lockcnt_inc_and_unlock(&io_handlers_lockcnt);
            }
        } else {
            if (ioh->revents & G_IO_OUT) {
                ioh->fd_write(ioh->opaque);
            }
        }
    }
    qemu_lockcnt_dec(&io_handlers_lockcnt);

Here you can use qemu_lockcnt_dec instead of qemu_lockcnt_dec_and_lock,
because there is no special task to do if the count goes from 1 to 0.