// SPDX-License-Identifier: GPL-2.0-only

/*
 * rcuref - A scalable reference count implementation for RCU managed objects
 *
 * rcuref is provided to replace open coded reference count implementations
 * based on atomic_t. It explicitly protects RCU managed objects which can
 * be visible even after the last reference has been dropped and the object
 * is heading towards destruction.
 *
 * A common usage pattern is:
 *
 * get()
 *	rcu_read_lock();
 *	p = get_ptr();
 *	if (p && !atomic_inc_not_zero(&p->refcnt))
 *		p = NULL;
 *	rcu_read_unlock();
 *	return p;
 *
 * put()
 *	if (!atomic_dec_return(&p->refcnt)) {
 *		remove_ptr(p);
 *		kfree_rcu(p, rcu);
 *	}
 *
 * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
 * O(N^2) behaviour under contention with N concurrent operations.
 *
 * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
 * better under contention.
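 *
 * A sketch of the same pattern converted to rcuref (assuming the object
 * embeds an rcuref_t member named "rcuref"; get_ptr()/remove_ptr() as in
 * the pattern above):
 *
 * get()
 *	rcu_read_lock();
 *	p = get_ptr();
 *	if (p && !rcuref_get(&p->rcuref))
 *		p = NULL;
 *	rcu_read_unlock();
 *	return p;
 *
 * put()
 *	if (rcuref_put(&p->rcuref)) {
 *		remove_ptr(p);
 *		kfree_rcu(p, rcu);
 *	}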
 *
 * Why not refcount?
 * =================
 *
 * In principle it should be possible to make refcount use the rcuref
 * scheme, but the destruction race described below cannot be prevented
 * unless the protected object is RCU managed.
 *
 * Theory of operation
 * ===================
 *
 * rcuref uses an unsigned integer reference counter. As long as the
 * counter value is greater than or equal to RCUREF_ONEREF and not larger
 * than RCUREF_MAXREF the reference is alive:
 *
 * ONEREF   MAXREF                  SATURATED               RELEASED    DEAD        NOREF
 * 0        0x7FFFFFFF  0x80000000  0xA0000000  0xBFFFFFFF  0xC0000000  0xE0000000  0xFFFFFFFF
 * <------valid------>  <--------saturation zone--------->  <-----------dead zone------------>
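 *
 * A sketch of the corresponding constants, assuming the definitions in
 * <linux/rcuref.h>:
 *
 *	RCUREF_ONEREF		0x00000000U
 *	RCUREF_MAXREF		0x7FFFFFFFU
 *	RCUREF_SATURATED	0xA0000000U
 *	RCUREF_RELEASED		0xC0000000U
 *	RCUREF_DEAD		0xE0000000U
 *	RCUREF_NOREF		0xFFFFFFFFU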
 *
 * The get() and put() operations do unconditional increments and
 * decrements. The result is checked after the operation. This optimizes
 * for the fast path.
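 *
 * A minimal sketch of the get() fast path, assuming the rcuref_get()
 * implementation in <linux/rcuref.h> (simplified, annotations omitted):
 *
 * static inline bool rcuref_get(rcuref_t *ref)
 * {
 *	// Unconditional increment; the zones absorb racing operations
 *	if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
 *		return true;
 *	// Count left the valid zone; handle saturation and dead cases
 *	return rcuref_get_slowpath(ref);
 * }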
 *
 * If the reference count is saturated or dead, then the increments and
 * decrements are not harmful as the reference count still stays in the
 * respective zones and is always set back to SATURATED resp. DEAD. The
 * zones have room for 2^28 racing operations in each direction, which
 * makes it practically impossible to escape the zones.
 *
 * Once the last reference is dropped the reference count becomes
 * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
 * slowpath then tries to set the reference count from RCUREF_NOREF to
 * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
 * concurrent rcuref_get() can acquire the reference count and bring it
 * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
 *
 * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
 * DEAD + 1, which is inside the dead zone. If that happens the reference
 * count is put back to DEAD.
 *
 * The actual race is possible due to the unconditional increments and
 * decrements in rcuref_get() and rcuref_put():
 *
 * T1					T2
 * get()				put()
 *					if (atomic_add_negative(-1, &ref->refcnt))
 *		succeeds->			atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
 *
 * atomic_add_negative(1, &ref->refcnt);	<- Elevates refcount to DEAD + 1
 *
 * As the result of T1's add is negative, the get() goes into the slow path
 * and observes refcnt being in the dead zone which makes the operation fail.
 *
 * Possible critical states:
 *
 *	Context	Counter	References	Operation
 *	T1	0	1		init()
 *	T2	1	2		get()
 *	T1	0	1		put()
 *	T2	-1	0		put() tries to mark dead
 *	T1	0	1		get()
 *	T2	0	1		put() mark dead fails
 *	T1	-1	0		put() tries to mark dead
 *	T1	DEAD	0		put() mark dead succeeds
 *	T2	DEAD+1	0		get() fails and puts it back to DEAD
 *
 * Of course there are more complex scenarios, but the above illustrates
 * the working principle. The rest is left to the imagination of the
 * reader.
 *
 * Deconstruction race
 * ===================
 *
 * The release operation must be protected by prohibiting a grace period in
 * order to prevent a possible use after free:
 *
 * T1					T2
 * put()				get()
 * // ref->refcnt = ONEREF
 * if (!atomic_add_negative(-1, &ref->refcnt))
 *	return false;				<- Not taken
 *
 * // ref->refcnt == NOREF
 * --> preemption
 *					// Elevates ref->refcnt to ONEREF
 *					if (!atomic_add_negative(1, &ref->refcnt))
 *						return true;		<- taken
 *
 *					if (put(&p->ref)) {		<-- Succeeds
 *						remove_pointer(p);
 *						kfree_rcu(p, rcu);
 *					}
 *
 *		RCU grace period ends, object is freed
 *
 * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);	<- UAF
 *
 * This is prevented by disabling preemption around the put() operation, as
 * that is in most kernel configurations cheaper than a rcu_read_lock() /
 * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
 * prevents the grace period from completing, which keeps the object alive
 * until all put() operations complete.
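 *
 * A sketch of that protection, assuming the rcuref_put() wrapper in
 * <linux/rcuref.h> (simplified):
 *
 * static inline bool rcuref_put(rcuref_t *ref)
 * {
 *	bool released;
 *
 *	// No grace period can complete across __rcuref_put()
 *	preempt_disable();
 *	released = __rcuref_put(ref);
 *	preempt_enable();
 *	return released;
 * }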
 *
 * Saturation protection
 * =====================
 *
 * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
 * Once this is exceeded the reference count becomes stale by setting it
 * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
 * wrap arounds which obviously cause worse problems than a memory
 * leak. When saturation is reached a warning is emitted.
 *
 * Race conditions
 * ===============
 *
 * All reference count increment/decrement operations are unconditional and
 * only verified after the fact. This optimizes for the good case and takes
 * the occasional race vs. a dead or already saturated refcount into
 * account. The saturation and dead zones are large enough to accommodate
 * that.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object to increase the
 * reference count on will provide the ordering. For locked data
 * structures, it's the lock acquire, for RCU/lockless data structures it's
 * the dependent load.
 *
 * rcuref_get() provides a control dependency which orders future stores.
 * This ensures that the object is not modified when acquiring a reference
 * fails.
 *
 * rcuref_put() provides release ordering, i.e. all prior loads and stores
 * will be issued before. It also provides a control dependency which
 * orders against the subsequent destruction of the object.
 *
 * If rcuref_put() successfully dropped the last reference and marked the
 * object DEAD it also provides acquire ordering.
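 *
 * A sketch of how these rules map onto the primitives, assuming the
 * __rcuref_put() implementation in <linux/rcuref.h> (simplified):
 *
 * static inline bool __rcuref_put(rcuref_t *ref)
 * {
 *	// Release ordering for all prior loads and stores
 *	if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
 *		return false;
 *	// The slowpath provides acquire ordering via
 *	// smp_acquire__after_ctrl_dep() when it marks the count DEAD
 *	return rcuref_put_slowpath(ref);
 * }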
 */

#include <linux/export.h>
#include <linux/rcuref.h>

/**
 * rcuref_get_slowpath - Slowpath of rcuref_get()
 * @ref:	Pointer to the reference count
 *
 * Invoked when the reference count is outside of the valid zone.
 *
 * Return:
 *	False if the reference count was already marked dead
 *
 *	True if the reference count is saturated, which prevents the
 *	object from ever being deconstructed.
 */
bool rcuref_get_slowpath(rcuref_t *ref)
{
	unsigned int cnt = atomic_read(&ref->refcnt);

	/*
	 * If the reference count was already marked dead, undo the
	 * increment so it stays in the middle of the dead zone and return
	 * fail.
	 */
	if (cnt >= RCUREF_RELEASED) {
		atomic_set(&ref->refcnt, RCUREF_DEAD);
		return false;
	}

	/*
	 * If it was saturated, warn and mark it so. In case the increment
	 * was already on a saturated value restore the saturation
	 * marker. This keeps it in the middle of the saturation zone and
	 * prevents the reference count from overflowing. This leaks the
	 * object memory, but prevents the obvious reference count overflow
	 * damage.
	 */
	if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
		atomic_set(&ref->refcnt, RCUREF_SATURATED);
	return true;
}
EXPORT_SYMBOL_GPL(rcuref_get_slowpath);

/**
 * rcuref_put_slowpath - Slowpath of __rcuref_put()
 * @ref:	Pointer to the reference count
 *
 * Invoked when the reference count is outside of the valid zone.
 *
 * Return:
 *	True if this was the last reference with no future references
 *	possible. This signals the caller that it can safely schedule the
 *	object, which is protected by the reference counter, for
 *	deconstruction.
 *
 *	False if there are still active references or the put() raced
 *	with a concurrent get()/put() pair. Caller is not allowed to
 *	deconstruct the protected object.
 */
bool rcuref_put_slowpath(rcuref_t *ref)
{
	unsigned int cnt = atomic_read(&ref->refcnt);

	/* Did this drop the last reference? */
	if (likely(cnt == RCUREF_NOREF)) {
		/*
		 * Carefully try to set the reference count to RCUREF_DEAD.
		 *
		 * This can fail if a concurrent get() operation has
		 * elevated it again or the corresponding put() even marked
		 * it dead already. Both are valid situations and do not
		 * require a retry. If this fails the caller is not
		 * allowed to deconstruct the object.
		 */
		if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF)
			return false;

		/*
		 * The caller can safely schedule the object for
		 * deconstruction. Provide acquire ordering.
		 */
		smp_acquire__after_ctrl_dep();
		return true;
	}

	/*
	 * If the reference count was already in the dead zone, then this
	 * put() operation is imbalanced. Warn, put the reference count back to
	 * DEAD and tell the caller to not deconstruct the object.
	 */
	if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
		atomic_set(&ref->refcnt, RCUREF_DEAD);
		return false;
	}

	/*
	 * This is a put() operation on a saturated refcount. Restore the
	 * mean saturation value and tell the caller to not deconstruct the
	 * object.
	 */
	if (cnt > RCUREF_MAXREF)
		atomic_set(&ref->refcnt, RCUREF_SATURATED);
	return false;
}
EXPORT_SYMBOL_GPL(rcuref_put_slowpath);