// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch

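/* For a given prefix, BATCH_OPS() fills in the four batched callbacks of
 * struct bpf_map_ops: e.g. BATCH_OPS(htab) sets .map_lookup_batch to
 * htab_map_lookup_batch and .map_lookup_and_delete_batch to
 * htab_map_lookup_and_delete_batch, while the update/delete variants are
 * shared via the generic implementations.
 */
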
/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits on where perf, kprobes and
 * tracing can be invoked from, the lock operations need to be protected
 * against deadlocks. Deadlocks can be caused by recursion and by an
 * invocation in the lock held section when functions which acquire this
 * lock are invoked from sys_bpf(). BPF recursion is prevented by
 * incrementing the per CPU variable bpf_prog_active, which prevents BPF
 * programs attached to perf events, kprobes and tracing from being invoked
 * before the prior invocation from one of these contexts completed.
 * sys_bpf() uses the same mechanism by pinning the task to the current CPU
 * and incrementing the recursion protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it was only safe to use a raw spinlock for a preallocated hash map on an
 * RT kernel, because there was no memory allocation within the lock held
 * sections. Now that the hash map has been fully converted to use
 * bpf_mem_alloc, memory allocation for a non-preallocated hash map is
 * non-synchronous, so it is safe to always use a raw spinlock for the
 * bucket lock.
 */
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t raw_lock;
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)

struct bpf_htab {
	struct bpf_map map;
	struct bpf_mem_alloc ma;
	struct bpf_mem_alloc pcpu_ma;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	/* number of elements in non-preallocated hashtable are kept
	 * in either pcount or count
	 */
	struct percpu_counter pcount;
	atomic_t count;
	bool use_percpu_counter;
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
	struct lock_class_key lockdep_key;
	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		/* pointer to per-cpu pointer */
		void *ptr_to_pptr;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[] __aligned(8);
};

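/* Resulting element layout (elem_size is computed the same way in
 * htab_map_alloc()):
 *
 *	struct htab_elem | key, padded to round_up(key_size, 8) | value area
 *
 * where the value area holds round_up(value_size, 8) bytes for regular
 * maps, a single pointer to the per-cpu storage for per-cpu maps, or the
 * inner map pointer for fd-based maps (see fd_htab_map_get_ptr()).
 */
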
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned int i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].raw_lock);
		lockdep_set_class(&htab->buckets[i].raw_lock,
				  &htab->lockdep_key);
		cond_resched();
	}
}

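/* The per-cpu map_locked[] counters guard against re-entrancy on the same
 * CPU: if a BPF program (e.g. one attached to a tracepoint or NMI that
 * fires inside the lock held region) re-enters an operation that maps to
 * the same lock subset, htab_lock_bucket() below detects the nested
 * attempt via the already-incremented counter and bails out with -EBUSY
 * instead of deadlocking on the raw spinlock.
 */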
static inline int htab_lock_bucket(const struct bpf_htab *htab,
				   struct bucket *b, u32 hash,
				   unsigned long *pflags)
{
	unsigned long flags;

	hash = hash & HASHTAB_MAP_LOCK_MASK;

	preempt_disable();
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
		__this_cpu_dec(*(htab->map_locked[hash]));
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock_irqsave(&b->raw_lock, flags);
	*pflags = flags;

	return 0;
}

static inline void htab_unlock_bucket(const struct bpf_htab *htab,
				      struct bucket *b, u32 hash,
				      unsigned long flags)
{
	hash = hash & HASHTAB_MAP_LOCK_MASK;
	raw_spin_unlock_irqrestore(&b->raw_lock, flags);
	__this_cpu_dec(*(htab->map_locked[hash]));
	preempt_enable();
}

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
	return *(void **)(l->key + roundup(map->key_size, 8));
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}

static bool htab_has_extra_elems(struct bpf_htab *htab)
{
	return !htab_is_percpu(htab) && !htab_is_lru(htab);
}

static void htab_free_prealloced_timers(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (!map_value_has_timer(&htab->map))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		bpf_timer_cancel_and_free(elem->key +
					  round_up(htab->map.key_size, 8) +
					  htab->map.timer_off);
		cond_resched();
	}
}

static void htab_free_prealloced_kptrs(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (!map_value_has_kptrs(&htab->map))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8));
		cond_resched();
	}
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock and this only happens in
 * bpf_lru_list.c logic. For example, certain code path of
 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
 * will acquire lru_lock first followed by acquiring bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, lock acquisition of
 * bucket_lock followed by lru_lock is not allowed. In such cases,
 * bucket_lock needs to be released first before acquiring lru_lock.
 */
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
					    GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
				    GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (lru && !bpf_capable())
		/* The LRU implementation is much more complicated than
		 * other maps. Hence, limit it to CAP_BPF.
		 */
		return -EPERM;

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;

	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >=1 elements.
		 * since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_htab;

	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
							   sizeof(int),
							   sizeof(int),
							   GFP_USER);
		if (!htab->map_locked[i])
			goto free_map_locked;
	}

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_u32();

	htab_init_buckets(htab);

/* compute_batch_value() computes batch value as num_online_cpus() * 2
 * and __percpu_counter_compare() needs
 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
 * for percpu_counter to be faster than atomic_t. In practice the average bpf
 * hash map size is 10k, which means that a system with 64 cpus will fill
 * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
 * define our own batch count as 32 then 10k hash map can be filled up to 80%:
 * 10k - 8k > 32 _batch_ * 64 _cpus_
 * and __percpu_counter_compare() will still be fast. At that point hash map
 * collisions will dominate its performance anyway. Assume that hash map filled
 * to 50+% isn't going to be O(1) and use the following formula to choose
 * between percpu_counter and atomic_t.
 */
#define PERCPU_COUNTER_BATCH 32
	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
		htab->use_percpu_counter = true;

	if (htab->use_percpu_counter) {
		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
		if (err)
			goto free_map_locked;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
			 */
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	} else {
		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
		if (err)
			goto free_map_locked;
		if (percpu) {
			err = bpf_mem_alloc_init(&htab->pcpu_ma,
						 round_up(htab->map.value_size, 8), true);
			if (err)
				goto free_map_locked;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_map_locked:
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
free_htab:
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	return jhash(key, key_len, hashrnd);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event when elements moved from one bucket into another
 * while link list is being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}

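/* The nulls value of each bucket is initialized to the bucket index in
 * htab_init_buckets(). If an element is freed and reused in a different
 * bucket while the list is being walked locklessly, the walk ends on a
 * nulls marker whose value does not match the expected bucket index, and
 * the loop above restarts instead of returning a stale miss.
 */
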
/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

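/* The three instructions emitted above are roughly equivalent to:
 *
 *	void *ret = __htab_map_lookup_elem(map, key);
 *
 *	if (ret)
 *		ret += offsetof(struct htab_elem, key) +
 *		       round_up(map->key_size, 8);
 *
 * i.e. R0 ends up pointing at the value (or stays NULL) without going
 * through the map_lookup_elem indirection.
 */
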
static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}

static int htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

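/* Rough C equivalent of the instruction sequence emitted above:
 *
 *	l = __htab_map_lookup_elem(map, key);
 *	if (l) {
 *		if (!l->lru_node.ref)
 *			l->lru_node.ref = 1;
 *		return l->key + round_up(map->key_size, 8);
 *	}
 *	return NULL;
 *
 * so the inlined lookup also marks the element referenced for the LRU,
 * matching what htab_lru_map_lookup_elem() does via bpf_lru_node_set_ref().
 */
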
static void check_and_free_fields(struct bpf_htab *htab,
				  struct htab_elem *elem)
{
	void *map_value = elem->key + round_up(htab->map.key_size, 8);

	if (map_value_has_timer(&htab->map))
		bpf_timer_cancel_and_free(map_value + htab->map.timer_off);
	if (map_value_has_kptrs(&htab->map))
		bpf_map_free_kptrs(&htab->map, map_value);
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;
	int ret;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
	if (ret)
		return false;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			check_and_free_fields(htab, l);
			break;
		}

	htab_unlock_bucket(htab, b, tgt_l->hash, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
	check_and_free_fields(htab, l);
	bpf_mem_cache_free(&htab->ma, l);
}

static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;
	void *ptr;

	if (map->ops->map_fd_put_ptr) {
		ptr = fd_htab_map_get_ptr(map, l);
		map->ops->map_fd_put_ptr(ptr);
	}
}

static bool is_map_full(struct bpf_htab *htab)
{
	if (htab->use_percpu_counter)
		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
						PERCPU_COUNTER_BATCH) >= 0;
	return atomic_read(&htab->count) >= htab->map.max_entries;
}

static void inc_elem_count(struct bpf_htab *htab)
{
	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
	else
		atomic_inc(&htab->count);
}

static void dec_elem_count(struct bpf_htab *htab)
{
	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
	else
		atomic_dec(&htab->count);
}

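/* Element accounting only applies to non-preallocated maps (preallocated
 * maps are bounded by the freelist itself). Large maps use the percpu
 * counter with the private PERCPU_COUNTER_BATCH so that updates scale,
 * and __percpu_counter_compare() in is_map_full() can usually answer from
 * the cheap approximate sum, only doing the exact per-cpu sum once the
 * count gets within batch * num_online_cpus() of max_entries.
 */
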
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	htab_put_fd_value(htab, l);

	if (htab_is_prealloc(htab)) {
		check_and_free_fields(htab, l);
		__pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		dec_elem_count(htab);
		htab_elem_free(htab, l);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}

static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	/* When not setting the initial value on all cpus, zero-fill element
	 * values for other cpus. Otherwise, bpf program has no way to ensure
	 * known initial values for cpus other than current one
	 * (onallcpus=false always when coming from bpf prog).
	 */
	if (!onallcpus) {
		u32 size = round_up(htab->map.value_size, 8);
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == current_cpu)
				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
						size);
			else
				memset(per_cpu_ptr(pptr, cpu), 0, size);
		}
	} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
	}
}

static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
	       BITS_PER_LONG == 64;
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			htab_put_fd_value(htab, old_elem);
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
		}
	} else {
		if (is_map_full(htab))
			if (!old_elem)
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				return ERR_PTR(-E2BIG);
		inc_elem_count(htab);
		l_new = bpf_mem_cache_alloc(&htab->ma);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
		check_and_init_map_value(&htab->map,
					 l_new->key + round_up(key_size, 8));
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
			if (!pptr) {
				bpf_mem_cache_free(&htab->ma, l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
			l_new->ptr_to_pptr = pptr;
			pptr = *(void **)pptr;
		}

		pcpu_init_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	} else {
		copy_map_value(&htab->map,
			       l_new->key + round_up(key_size, 8),
			       value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	dec_elem_count(htab);
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

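/* Update flag semantics as enforced by check_flags():
 *
 *	BPF_ANY     - create the element or update it if it exists
 *	BPF_NOEXIST - create only; -EEXIST if the key is already present
 *	BPF_EXIST   - update only; -ENOENT if the key is absent
 *
 * BPF_F_LOCK may be OR'ed in by callers updating bpf_spin_lock'ed values,
 * which is why it is masked out before the comparison.
 */
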
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!map_value_has_spin_lock(map)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      l_old->key + round_up(key_size, 8),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      l_old->key + round_up(key_size, 8),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		if (!htab_is_prealloc(htab))
			free_htab_elem(htab, l_old);
		else
			check_and_free_fields(htab, l_old);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
	check_and_free_fields(htab, elem);
	bpf_lru_push_free(&htab->lru, &elem->lru_node);
}

static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	copy_map_value(&htab->map,
		       l_new->key + round_up(map->key_size, 8), value);

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	htab_unlock_bucket(htab, b, hash, flags);

	if (ret)
		htab_lru_push_free(htab, l_new);
	else if (l_old)
		htab_lru_push_free(htab, l_old);

	return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
	} else {
		ret = -ENOENT;
	}

	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		hlist_nulls_del_rcu(&l->hash_node);
	else
		ret = -ENOENT;

	htab_unlock_bucket(htab, b, hash, flags);
	if (l)
		htab_lru_push_free(htab, l);
	return ret;
}

0f8e4bd8 AS |
1445 | static void delete_all_elements(struct bpf_htab *htab) |
1446 | { | |
1447 | int i; | |
1448 | ||
fba1a1c6 AS |
1449 | /* It's called from a worker thread, so disable migration here, |
1450 | * since bpf_mem_cache_free() relies on that. | |
1451 | */ | |
1452 | migrate_disable(); | |
0f8e4bd8 | 1453 | for (i = 0; i < htab->n_buckets; i++) { |
4fe84359 AS |
1454 | struct hlist_nulls_head *head = select_bucket(htab, i); |
1455 | struct hlist_nulls_node *n; | |
0f8e4bd8 AS |
1456 | struct htab_elem *l; |
1457 | ||
4fe84359 AS |
1458 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
1459 | hlist_nulls_del_rcu(&l->hash_node); | |
8c290e60 | 1460 | htab_elem_free(htab, l); |
0f8e4bd8 AS |
1461 | } |
1462 | } | |
fba1a1c6 | 1463 | migrate_enable(); |
0f8e4bd8 | 1464 | } |
bcc6b1b7 | 1465 | |
68134668 AS |
1466 | static void htab_free_malloced_timers(struct bpf_htab *htab) |
1467 | { | |
1468 | int i; | |
1469 | ||
1470 | rcu_read_lock(); | |
1471 | for (i = 0; i < htab->n_buckets; i++) { | |
1472 | struct hlist_nulls_head *head = select_bucket(htab, i); | |
1473 | struct hlist_nulls_node *n; | |
1474 | struct htab_elem *l; | |
1475 | ||
14a324f6 KKD |
1476 | hlist_nulls_for_each_entry(l, n, head, hash_node) { |
1477 | /* We don't reset or free kptr on uref dropping to zero, | |
1478 | * hence just free the timer. | |
1479 | */ | |
1480 | bpf_timer_cancel_and_free(l->key + | |
1481 | round_up(htab->map.key_size, 8) + | |
1482 | htab->map.timer_off); | |
1483 | } | |
68134668 AS |
1484 | cond_resched_rcu(); |
1485 | } | |
1486 | rcu_read_unlock(); | |
1487 | } | |
1488 | ||
1489 | static void htab_map_free_timers(struct bpf_map *map) | |
1490 | { | |
1491 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | |
1492 | ||
14a324f6 KKD |
1493 | /* We don't reset or free kptr on uref dropping to zero. */ |
1494 | if (!map_value_has_timer(&htab->map)) | |
68134668 AS |
1495 | return; |
1496 | if (!htab_is_prealloc(htab)) | |
1497 | htab_free_malloced_timers(htab); | |
1498 | else | |
1499 | htab_free_prealloced_timers(htab); | |
1500 | } | |
1501 | ||
0f8e4bd8 AS |
1502 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ |
1503 | static void htab_map_free(struct bpf_map *map) | |
1504 | { | |
1505 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | |
20b6cc34 | 1506 | int i; |
0f8e4bd8 | 1507 | |
bba1dc0b AS |
1508 | /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. |
1509 | * bpf_free_used_maps() is called after bpf prog is no longer executing. | |
1510 | * There is no need to synchronize_rcu() here to protect map elements. | |
0f8e4bd8 | 1511 | */ |
0f8e4bd8 | 1512 | |
9f2c6e96 AS |
1513 | /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it |
1514 | * underneath and is responsible for waiting for callbacks to finish | |
1515 | * during bpf_mem_alloc_destroy(). | |
0f8e4bd8 | 1516 | */ |
14a324f6 | 1517 | if (!htab_is_prealloc(htab)) { |
6c905981 | 1518 | delete_all_elements(htab); |
14a324f6 KKD |
1519 | } else { |
1520 | htab_free_prealloced_kptrs(htab); | |
29ba732a | 1521 | prealloc_destroy(htab); |
14a324f6 | 1522 | } |
29ba732a | 1523 | |
14a324f6 | 1524 | bpf_map_free_kptr_off_tab(map); |
a6ed3ea6 | 1525 | free_percpu(htab->extra_elems); |
d407bd25 | 1526 | bpf_map_area_free(htab->buckets); |
ee4ed53c | 1527 | bpf_mem_alloc_destroy(&htab->pcpu_ma); |
fba1a1c6 | 1528 | bpf_mem_alloc_destroy(&htab->ma); |
86fe28f7 AS |
1529 | if (htab->use_percpu_counter) |
1530 | percpu_counter_destroy(&htab->pcount); | |
20b6cc34 SL |
1531 | for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) |
1532 | free_percpu(htab->map_locked[i]); | |
8aaeed81 | 1533 | lockdep_unregister_key(&htab->lockdep_key); |
73cf09a3 | 1534 | bpf_map_area_free(htab); |
0f8e4bd8 AS |
1535 | } |
1536 | ||
699c86d6 YS |
1537 | static void htab_map_seq_show_elem(struct bpf_map *map, void *key, |
1538 | struct seq_file *m) | |
1539 | { | |
1540 | void *value; | |
1541 | ||
1542 | rcu_read_lock(); | |
1543 | ||
1544 | value = htab_map_lookup_elem(map, key); | |
1545 | if (!value) { | |
1546 | rcu_read_unlock(); | |
1547 | return; | |
1548 | } | |
1549 | ||
1550 | btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); | |
1551 | seq_puts(m, ": "); | |
1552 | btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); | |
1553 | seq_puts(m, "\n"); | |
1554 | ||
1555 | rcu_read_unlock(); | |
1556 | } | |
1557 | ||
3e87f192 DS |
1558 | static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1559 | void *value, bool is_lru_map, | |
1560 | bool is_percpu, u64 flags) | |
1561 | { | |
1562 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | |
1563 | struct hlist_nulls_head *head; | |
1564 | unsigned long bflags; | |
1565 | struct htab_elem *l; | |
1566 | u32 hash, key_size; | |
1567 | struct bucket *b; | |
1568 | int ret; | |
1569 | ||
1570 | key_size = map->key_size; | |
1571 | ||
1572 | hash = htab_map_hash(key, key_size, htab->hashrnd); | |
1573 | b = __select_bucket(htab, hash); | |
1574 | head = &b->head; | |
1575 | ||
1576 | ret = htab_lock_bucket(htab, b, hash, &bflags); | |
1577 | if (ret) | |
1578 | return ret; | |
1579 | ||
1580 | l = lookup_elem_raw(head, hash, key, key_size); | |
1581 | if (!l) { | |
1582 | ret = -ENOENT; | |
1583 | } else { | |
1584 | if (is_percpu) { | |
1585 | u32 roundup_value_size = round_up(map->value_size, 8); | |
1586 | void __percpu *pptr; | |
1587 | int off = 0, cpu; | |
1588 | ||
1589 | pptr = htab_elem_get_ptr(l, key_size); | |
1590 | for_each_possible_cpu(cpu) { | |
1591 | bpf_long_memcpy(value + off, | |
1592 | per_cpu_ptr(pptr, cpu), | |
1593 | roundup_value_size); | |
1594 | off += roundup_value_size; | |
1595 | } | |
1596 | } else { | |
1597 | u32 roundup_key_size = round_up(map->key_size, 8); | |
1598 | ||
1599 | if (flags & BPF_F_LOCK) | |
1600 | copy_map_value_locked(map, value, l->key + | |
1601 | roundup_key_size, | |
1602 | true); | |
1603 | else | |
1604 | copy_map_value(map, value, l->key + | |
1605 | roundup_key_size); | |
68134668 | 1606 | check_and_init_map_value(map, value); |
3e87f192 DS |
1607 | } |
1608 | ||
1609 | hlist_nulls_del_rcu(&l->hash_node); | |
1610 | if (!is_lru_map) | |
1611 | free_htab_elem(htab, l); | |
1612 | } | |
1613 | ||
1614 | htab_unlock_bucket(htab, b, hash, bflags); | |
1615 | ||
1616 | if (is_lru_map && l) | |
68134668 | 1617 | htab_lru_push_free(htab, l); |
3e87f192 DS |
1618 | |
1619 | return ret; | |
1620 | } | |
1621 | ||
1622 | static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, | |
1623 | void *value, u64 flags) | |
1624 | { | |
1625 | return __htab_map_lookup_and_delete_elem(map, key, value, false, false, | |
1626 | flags); | |
1627 | } | |
1628 | ||
1629 | static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, | |
1630 | void *key, void *value, | |
1631 | u64 flags) | |
1632 | { | |
1633 | return __htab_map_lookup_and_delete_elem(map, key, value, false, true, | |
1634 | flags); | |
1635 | } | |
1636 | ||
1637 | static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, | |
1638 | void *value, u64 flags) | |
1639 | { | |
1640 | return __htab_map_lookup_and_delete_elem(map, key, value, true, false, | |
1641 | flags); | |
1642 | } | |
1643 | ||
1644 | static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, | |
1645 | void *key, void *value, | |
1646 | u64 flags) | |
1647 | { | |
1648 | return __htab_map_lookup_and_delete_elem(map, key, value, true, true, | |
1649 | flags); | |
1650 | } | |
1651 | ||
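/*
 * Illustrative sketch added for clarity (not part of hashtab.c): popping one
 * entry from user space through the lookup_and_delete path above, using
 * libbpf's wrapper for the BPF_MAP_LOOKUP_AND_DELETE_ELEM command.  The fd
 * and the key/value types are assumptions for this example.
 */
#include <linux/types.h>
#include <bpf/bpf.h>

static int pop_entry(int map_fd, __u32 key, __u64 *value)
{
	/* For a plain hash map this lands in htab_map_lookup_and_delete_elem():
	 * the value is copied out and the element unlinked under the same
	 * bucket lock, so no concurrent reader can see a half-deleted entry.
	 */
	return bpf_map_lookup_and_delete_elem(map_fd, &key, value);
}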
05799638 YS |
1652 | static int |
1653 | __htab_map_lookup_and_delete_batch(struct bpf_map *map, | |
1654 | const union bpf_attr *attr, | |
1655 | union bpf_attr __user *uattr, | |
1656 | bool do_delete, bool is_lru_map, | |
1657 | bool is_percpu) | |
1658 | { | |
1659 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | |
1660 | u32 bucket_cnt, total, key_size, value_size, roundup_key_size; | |
1661 | void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; | |
1662 | void __user *uvalues = u64_to_user_ptr(attr->batch.values); | |
1663 | void __user *ukeys = u64_to_user_ptr(attr->batch.keys); | |
2f4b0319 | 1664 | void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); |
9263dddc | 1665 | u32 batch, max_count, size, bucket_size, map_id; |
b9aff38d | 1666 | struct htab_elem *node_to_free = NULL; |
05799638 YS |
1667 | u64 elem_map_flags, map_flags; |
1668 | struct hlist_nulls_head *head; | |
1669 | struct hlist_nulls_node *n; | |
492e0d0d BV |
1670 | unsigned long flags = 0; |
1671 | bool locked = false; | |
05799638 YS |
1672 | struct htab_elem *l; |
1673 | struct bucket *b; | |
1674 | int ret = 0; | |
1675 | ||
1676 | elem_map_flags = attr->batch.elem_flags; | |
1677 | if ((elem_map_flags & ~BPF_F_LOCK) || | |
1678 | ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) | |
1679 | return -EINVAL; | |
1680 | ||
1681 | map_flags = attr->batch.flags; | |
1682 | if (map_flags) | |
1683 | return -EINVAL; | |
1684 | ||
1685 | max_count = attr->batch.count; | |
1686 | if (!max_count) | |
1687 | return 0; | |
1688 | ||
1689 | if (put_user(0, &uattr->batch.count)) | |
1690 | return -EFAULT; | |
1691 | ||
1692 | batch = 0; | |
1693 | if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) | |
1694 | return -EFAULT; | |
1695 | ||
1696 | if (batch >= htab->n_buckets) | |
1697 | return -ENOENT; | |
1698 | ||
1699 | key_size = htab->map.key_size; | |
1700 | roundup_key_size = round_up(htab->map.key_size, 8); | |
1701 | value_size = htab->map.value_size; | |
1702 | size = round_up(value_size, 8); | |
1703 | if (is_percpu) | |
1704 | value_size = size * num_possible_cpus(); | |
1705 | total = 0; | |
1706 | /* while experimenting with hash tables with sizes ranging from 10 to | |
c561d110 | 1707 | * 1000, it was observed that a bucket can have up to 5 entries. |
05799638 YS |
1708 | */ |
1709 | bucket_size = 5; | |
1710 | ||
1711 | alloc: | |
1712 | /* We cannot do copy_from_user or copy_to_user inside | |
1713 | * the rcu_read_lock. Allocate enough space here. | |
1714 | */ | |
c4eb1f40 TY |
1715 | keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN); |
1716 | values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN); | |
05799638 YS |
1717 | if (!keys || !values) { |
1718 | ret = -ENOMEM; | |
1719 | goto after_loop; | |
1720 | } | |
1721 | ||
1722 | again: | |
085fee1a | 1723 | bpf_disable_instrumentation(); |
05799638 YS |
1724 | rcu_read_lock(); |
1725 | again_nocopy: | |
1726 | dst_key = keys; | |
1727 | dst_val = values; | |
1728 | b = &htab->buckets[batch]; | |
1729 | head = &b->head; | |
492e0d0d | 1730 | /* do not grab the lock unless we need it (bucket_cnt > 0). */ | |
20b6cc34 SL |
1731 | if (locked) { |
1732 | ret = htab_lock_bucket(htab, b, batch, &flags); | |
66a7a92e HT |
1733 | if (ret) { |
1734 | rcu_read_unlock(); | |
1735 | bpf_enable_instrumentation(); | |
1736 | goto after_loop; | |
1737 | } | |
20b6cc34 | 1738 | } |
05799638 YS |
1739 | |
1740 | bucket_cnt = 0; | |
1741 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) | |
1742 | bucket_cnt++; | |
1743 | ||
492e0d0d BV |
1744 | if (bucket_cnt && !locked) { |
1745 | locked = true; | |
1746 | goto again_nocopy; | |
1747 | } | |
1748 | ||
05799638 YS |
1749 | if (bucket_cnt > (max_count - total)) { |
1750 | if (total == 0) | |
1751 | ret = -ENOSPC; | |
492e0d0d BV |
1752 | /* Note that since bucket_cnt > 0 here, it is implicit |
1753 | * that the lock was grabbed, so release it. | |
1754 | */ | |
20b6cc34 | 1755 | htab_unlock_bucket(htab, b, batch, flags); |
05799638 | 1756 | rcu_read_unlock(); |
085fee1a | 1757 | bpf_enable_instrumentation(); |
05799638 YS |
1758 | goto after_loop; |
1759 | } | |
1760 | ||
1761 | if (bucket_cnt > bucket_size) { | |
1762 | bucket_size = bucket_cnt; | |
492e0d0d BV |
1763 | /* Note that since bucket_cnt > 0 here, it is implicit |
1764 | * that the lock was grabbed, so release it. | |
1765 | */ | |
20b6cc34 | 1766 | htab_unlock_bucket(htab, b, batch, flags); |
05799638 | 1767 | rcu_read_unlock(); |
085fee1a | 1768 | bpf_enable_instrumentation(); |
05799638 YS |
1769 | kvfree(keys); |
1770 | kvfree(values); | |
1771 | goto alloc; | |
1772 | } | |
1773 | ||
492e0d0d BV |
1774 | /* Next block is only safe to run if you have grabbed the lock */ |
1775 | if (!locked) | |
1776 | goto next_batch; | |
1777 | ||
05799638 YS |
1778 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
1779 | memcpy(dst_key, l->key, key_size); | |
1780 | ||
1781 | if (is_percpu) { | |
1782 | int off = 0, cpu; | |
1783 | void __percpu *pptr; | |
1784 | ||
1785 | pptr = htab_elem_get_ptr(l, map->key_size); | |
1786 | for_each_possible_cpu(cpu) { | |
1787 | bpf_long_memcpy(dst_val + off, | |
1788 | per_cpu_ptr(pptr, cpu), size); | |
1789 | off += size; | |
1790 | } | |
1791 | } else { | |
1792 | value = l->key + roundup_key_size; | |
9263dddc TC |
1793 | if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { |
1794 | struct bpf_map **inner_map = value; | |
1795 | ||
1796 | /* Actual value is the id of the inner map */ | |
1797 | map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); | |
1798 | value = &map_id; | |
1799 | } | |
1800 | ||
05799638 YS |
1801 | if (elem_map_flags & BPF_F_LOCK) |
1802 | copy_map_value_locked(map, dst_val, value, | |
1803 | true); | |
1804 | else | |
1805 | copy_map_value(map, dst_val, value); | |
68134668 | 1806 | check_and_init_map_value(map, dst_val); |
05799638 YS |
1807 | } |
1808 | if (do_delete) { | |
1809 | hlist_nulls_del_rcu(&l->hash_node); | |
b9aff38d YS |
1810 | |
1811 | /* bpf_lru_push_free() will acquire lru_lock, which | |
1812 | * may cause a deadlock. See the comments in | |
1813 | * prealloc_lru_pop(). Let us do bpf_lru_push_free() | |
1814 | * after releasing the bucket lock. | |
1815 | */ | |
1816 | if (is_lru_map) { | |
1817 | l->batch_flink = node_to_free; | |
1818 | node_to_free = l; | |
1819 | } else { | |
05799638 | 1820 | free_htab_elem(htab, l); |
b9aff38d | 1821 | } |
05799638 YS |
1822 | } |
1823 | dst_key += key_size; | |
1824 | dst_val += value_size; | |
1825 | } | |
1826 | ||
20b6cc34 | 1827 | htab_unlock_bucket(htab, b, batch, flags); |
492e0d0d | 1828 | locked = false; |
b9aff38d YS |
1829 | |
1830 | while (node_to_free) { | |
1831 | l = node_to_free; | |
1832 | node_to_free = node_to_free->batch_flink; | |
68134668 | 1833 | htab_lru_push_free(htab, l); |
b9aff38d YS |
1834 | } |
1835 | ||
492e0d0d | 1836 | next_batch: |
05799638 YS |
1837 | /* If we are not copying data, we can go to the next bucket and avoid | |
1838 | * releasing the RCU read lock. | |
1839 | */ | |
1840 | if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { | |
1841 | batch++; | |
1842 | goto again_nocopy; | |
1843 | } | |
1844 | ||
1845 | rcu_read_unlock(); | |
085fee1a | 1846 | bpf_enable_instrumentation(); |
05799638 YS |
1847 | if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, |
1848 | key_size * bucket_cnt) || | |
1849 | copy_to_user(uvalues + total * value_size, values, | |
1850 | value_size * bucket_cnt))) { | |
1851 | ret = -EFAULT; | |
1852 | goto after_loop; | |
1853 | } | |
1854 | ||
1855 | total += bucket_cnt; | |
1856 | batch++; | |
1857 | if (batch >= htab->n_buckets) { | |
1858 | ret = -ENOENT; | |
1859 | goto after_loop; | |
1860 | } | |
1861 | goto again; | |
1862 | ||
1863 | after_loop: | |
1864 | if (ret == -EFAULT) | |
1865 | goto out; | |
1866 | ||
1867 | /* copy # of entries and next batch */ | |
1868 | ubatch = u64_to_user_ptr(attr->batch.out_batch); | |
1869 | if (copy_to_user(ubatch, &batch, sizeof(batch)) || | |
1870 | put_user(total, &uattr->batch.count)) | |
1871 | ret = -EFAULT; | |
1872 | ||
1873 | out: | |
1874 | kvfree(keys); | |
1875 | kvfree(values); | |
1876 | return ret; | |
1877 | } | |
1878 | ||
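/*
 * Illustrative sketch added for clarity (not part of hashtab.c): draining a
 * hash map from user space with the batch interface implemented above, via
 * libbpf.  The batch size, key/value types and the consume step are
 * assumptions for this example.  The kernel reports -ENOENT once the last
 * bucket has been visited; that final call may still return a partial batch
 * in 'count'.
 */
#include <errno.h>
#include <linux/types.h>
#include <bpf/bpf.h>

static int drain_map(int map_fd)
{
	__u32 in_batch, out_batch, count;
	__u32 keys[64];
	__u64 vals[64];
	void *in = NULL;	/* NULL in_batch: start from the first bucket */
	int err;

	do {
		count = 64;
		err = bpf_map_lookup_and_delete_batch(map_fd, in, &out_batch,
						      keys, vals, &count, NULL);
		if (err && errno != ENOENT)
			return err;	/* real failure, e.g. -EFAULT */

		/* ... consume 'count' key/value pairs from keys[]/vals[] ... */

		in_batch = out_batch;
		in = &in_batch;
	} while (!err);

	return 0;
}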
1879 | static int | |
1880 | htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, | |
1881 | union bpf_attr __user *uattr) | |
1882 | { | |
1883 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, | |
1884 | false, true); | |
1885 | } | |
1886 | ||
1887 | static int | |
1888 | htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, | |
1889 | const union bpf_attr *attr, | |
1890 | union bpf_attr __user *uattr) | |
1891 | { | |
1892 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, | |
1893 | false, true); | |
1894 | } | |
1895 | ||
1896 | static int | |
1897 | htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, | |
1898 | union bpf_attr __user *uattr) | |
1899 | { | |
1900 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, | |
1901 | false, false); | |
1902 | } | |
1903 | ||
1904 | static int | |
1905 | htab_map_lookup_and_delete_batch(struct bpf_map *map, | |
1906 | const union bpf_attr *attr, | |
1907 | union bpf_attr __user *uattr) | |
1908 | { | |
1909 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, | |
1910 | false, false); | |
1911 | } | |
1912 | ||
1913 | static int | |
1914 | htab_lru_percpu_map_lookup_batch(struct bpf_map *map, | |
1915 | const union bpf_attr *attr, | |
1916 | union bpf_attr __user *uattr) | |
1917 | { | |
1918 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, | |
1919 | true, true); | |
1920 | } | |
1921 | ||
1922 | static int | |
1923 | htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, | |
1924 | const union bpf_attr *attr, | |
1925 | union bpf_attr __user *uattr) | |
1926 | { | |
1927 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, | |
1928 | true, true); | |
1929 | } | |
1930 | ||
1931 | static int | |
1932 | htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, | |
1933 | union bpf_attr __user *uattr) | |
1934 | { | |
1935 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, | |
1936 | true, false); | |
1937 | } | |
1938 | ||
1939 | static int | |
1940 | htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, | |
1941 | const union bpf_attr *attr, | |
1942 | union bpf_attr __user *uattr) | |
1943 | { | |
1944 | return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, | |
1945 | true, false); | |
1946 | } | |
1947 | ||
d6c4503c YS |
1948 | struct bpf_iter_seq_hash_map_info { |
1949 | struct bpf_map *map; | |
1950 | struct bpf_htab *htab; | |
1951 | void *percpu_value_buf; /* non-NULL means percpu hash */ | |
d6c4503c YS |
1952 | u32 bucket_id; |
1953 | u32 skip_elems; | |
1954 | }; | |
1955 | ||
1956 | static struct htab_elem * | |
1957 | bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, | |
1958 | struct htab_elem *prev_elem) | |
1959 | { | |
1960 | const struct bpf_htab *htab = info->htab; | |
d6c4503c YS |
1961 | u32 skip_elems = info->skip_elems; |
1962 | u32 bucket_id = info->bucket_id; | |
1963 | struct hlist_nulls_head *head; | |
1964 | struct hlist_nulls_node *n; | |
1965 | struct htab_elem *elem; | |
1966 | struct bucket *b; | |
1967 | u32 i, count; | |
1968 | ||
1969 | if (bucket_id >= htab->n_buckets) | |
1970 | return NULL; | |
1971 | ||
1972 | /* try to find next elem in the same bucket */ | |
1973 | if (prev_elem) { | |
1974 | /* no update/deletion on this bucket, prev_elem should still be valid | |
1975 | * and we won't skip elements. | |
1976 | */ | |
1977 | n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); | |
1978 | elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); | |
1979 | if (elem) | |
1980 | return elem; | |
1981 | ||
1982 | /* not found, unlock and go to the next bucket */ | |
1983 | b = &htab->buckets[bucket_id++]; | |
dc0988bb | 1984 | rcu_read_unlock(); |
d6c4503c YS |
1985 | skip_elems = 0; |
1986 | } | |
1987 | ||
1988 | for (i = bucket_id; i < htab->n_buckets; i++) { | |
1989 | b = &htab->buckets[i]; | |
dc0988bb | 1990 | rcu_read_lock(); |
d6c4503c YS |
1991 | |
1992 | count = 0; | |
1993 | head = &b->head; | |
1994 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { | |
1995 | if (count >= skip_elems) { | |
d6c4503c YS |
1996 | info->bucket_id = i; |
1997 | info->skip_elems = count; | |
1998 | return elem; | |
1999 | } | |
2000 | count++; | |
2001 | } | |
2002 | ||
dc0988bb | 2003 | rcu_read_unlock(); |
d6c4503c YS |
2004 | skip_elems = 0; |
2005 | } | |
2006 | ||
2007 | info->bucket_id = i; | |
2008 | info->skip_elems = 0; | |
2009 | return NULL; | |
2010 | } | |
2011 | ||
2012 | static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) | |
2013 | { | |
2014 | struct bpf_iter_seq_hash_map_info *info = seq->private; | |
2015 | struct htab_elem *elem; | |
2016 | ||
2017 | elem = bpf_hash_map_seq_find_next(info, NULL); | |
2018 | if (!elem) | |
2019 | return NULL; | |
2020 | ||
2021 | if (*pos == 0) | |
2022 | ++*pos; | |
2023 | return elem; | |
2024 | } | |
2025 | ||
2026 | static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |
2027 | { | |
2028 | struct bpf_iter_seq_hash_map_info *info = seq->private; | |
2029 | ||
2030 | ++*pos; | |
2031 | ++info->skip_elems; | |
2032 | return bpf_hash_map_seq_find_next(info, v); | |
2033 | } | |
2034 | ||
2035 | static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) | |
2036 | { | |
2037 | struct bpf_iter_seq_hash_map_info *info = seq->private; | |
2038 | u32 roundup_key_size, roundup_value_size; | |
2039 | struct bpf_iter__bpf_map_elem ctx = {}; | |
2040 | struct bpf_map *map = info->map; | |
2041 | struct bpf_iter_meta meta; | |
2042 | int ret = 0, off = 0, cpu; | |
2043 | struct bpf_prog *prog; | |
2044 | void __percpu *pptr; | |
2045 | ||
2046 | meta.seq = seq; | |
2047 | prog = bpf_iter_get_info(&meta, elem == NULL); | |
2048 | if (prog) { | |
2049 | ctx.meta = &meta; | |
2050 | ctx.map = info->map; | |
2051 | if (elem) { | |
2052 | roundup_key_size = round_up(map->key_size, 8); | |
2053 | ctx.key = elem->key; | |
2054 | if (!info->percpu_value_buf) { | |
2055 | ctx.value = elem->key + roundup_key_size; | |
2056 | } else { | |
2057 | roundup_value_size = round_up(map->value_size, 8); | |
2058 | pptr = htab_elem_get_ptr(elem, map->key_size); | |
2059 | for_each_possible_cpu(cpu) { | |
2060 | bpf_long_memcpy(info->percpu_value_buf + off, | |
2061 | per_cpu_ptr(pptr, cpu), | |
2062 | roundup_value_size); | |
2063 | off += roundup_value_size; | |
2064 | } | |
2065 | ctx.value = info->percpu_value_buf; | |
2066 | } | |
2067 | } | |
2068 | ret = bpf_iter_run_prog(prog, &ctx); | |
2069 | } | |
2070 | ||
2071 | return ret; | |
2072 | } | |
2073 | ||
2074 | static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) | |
2075 | { | |
2076 | return __bpf_hash_map_seq_show(seq, v); | |
2077 | } | |
2078 | ||
2079 | static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) | |
2080 | { | |
d6c4503c YS |
2081 | if (!v) |
2082 | (void)__bpf_hash_map_seq_show(seq, NULL); | |
2083 | else | |
dc0988bb | 2084 | rcu_read_unlock(); |
d6c4503c YS |
2085 | } |
2086 | ||
2087 | static int bpf_iter_init_hash_map(void *priv_data, | |
2088 | struct bpf_iter_aux_info *aux) | |
2089 | { | |
2090 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; | |
2091 | struct bpf_map *map = aux->map; | |
2092 | void *value_buf; | |
2093 | u32 buf_size; | |
2094 | ||
2095 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | |
2096 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { | |
2097 | buf_size = round_up(map->value_size, 8) * num_possible_cpus(); | |
2098 | value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); | |
2099 | if (!value_buf) | |
2100 | return -ENOMEM; | |
2101 | ||
2102 | seq_info->percpu_value_buf = value_buf; | |
2103 | } | |
2104 | ||
ef1e93d2 | 2105 | bpf_map_inc_with_uref(map); |
d6c4503c YS |
2106 | seq_info->map = map; |
2107 | seq_info->htab = container_of(map, struct bpf_htab, map); | |
2108 | return 0; | |
2109 | } | |
2110 | ||
2111 | static void bpf_iter_fini_hash_map(void *priv_data) | |
2112 | { | |
2113 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; | |
2114 | ||
ef1e93d2 | 2115 | bpf_map_put_with_uref(seq_info->map); |
d6c4503c YS |
2116 | kfree(seq_info->percpu_value_buf); |
2117 | } | |
2118 | ||
2119 | static const struct seq_operations bpf_hash_map_seq_ops = { | |
2120 | .start = bpf_hash_map_seq_start, | |
2121 | .next = bpf_hash_map_seq_next, | |
2122 | .stop = bpf_hash_map_seq_stop, | |
2123 | .show = bpf_hash_map_seq_show, | |
2124 | }; | |
2125 | ||
2126 | static const struct bpf_iter_seq_info iter_seq_info = { | |
2127 | .seq_ops = &bpf_hash_map_seq_ops, | |
2128 | .init_seq_private = bpf_iter_init_hash_map, | |
2129 | .fini_seq_private = bpf_iter_fini_hash_map, | |
2130 | .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), | |
2131 | }; | |
2132 | ||
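/*
 * Illustrative sketch added for clarity (not part of hashtab.c): a BPF
 * iterator program driven by the seq_ops above.  The key/value types and the
 * output format are assumptions for this example; the iterator link must be
 * attached to a specific hash map (union bpf_iter_link_info::map.map_fd)
 * before its bpffs file can be read.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("iter/bpf_map_elem")
int dump_hash(struct bpf_iter__bpf_map_elem *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	__u32 *key = ctx->key;
	__u64 *val = ctx->value;

	/* bpf_hash_map_seq_stop() invokes the program once more with NULL
	 * key/value so it can emit a footer; nothing to print in that case.
	 */
	if (!key || !val)
		return 0;

	BPF_SEQ_PRINTF(seq, "%u: %llu\n", *key, *val);
	return 0;
}

char _license[] SEC("license") = "GPL";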
102acbac | 2133 | static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, |
314ee05e YS |
2134 | void *callback_ctx, u64 flags) |
2135 | { | |
2136 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | |
2137 | struct hlist_nulls_head *head; | |
2138 | struct hlist_nulls_node *n; | |
2139 | struct htab_elem *elem; | |
2140 | u32 roundup_key_size; | |
2141 | int i, num_elems = 0; | |
2142 | void __percpu *pptr; | |
2143 | struct bucket *b; | |
2144 | void *key, *val; | |
2145 | bool is_percpu; | |
2146 | u64 ret = 0; | |
2147 | ||
2148 | if (flags != 0) | |
2149 | return -EINVAL; | |
2150 | ||
2151 | is_percpu = htab_is_percpu(htab); | |
2152 | ||
2153 | roundup_key_size = round_up(map->key_size, 8); | |
2154 | /* disable migration so the percpu value prepared here will be the | |
2155 | * same as the one seen by the bpf program with bpf_map_lookup_elem(). | |
2156 | */ | |
2157 | if (is_percpu) | |
2158 | migrate_disable(); | |
2159 | for (i = 0; i < htab->n_buckets; i++) { | |
2160 | b = &htab->buckets[i]; | |
2161 | rcu_read_lock(); | |
2162 | head = &b->head; | |
2163 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { | |
2164 | key = elem->key; | |
2165 | if (is_percpu) { | |
2166 | /* current cpu value for percpu map */ | |
2167 | pptr = htab_elem_get_ptr(elem, map->key_size); | |
2168 | val = this_cpu_ptr(pptr); | |
2169 | } else { | |
2170 | val = elem->key + roundup_key_size; | |
2171 | } | |
2172 | num_elems++; | |
102acbac KC |
2173 | ret = callback_fn((u64)(long)map, (u64)(long)key, |
2174 | (u64)(long)val, (u64)(long)callback_ctx, 0); | |
314ee05e YS |
2175 | /* return value: 0 - continue, 1 - stop and return */ |
2176 | if (ret) { | |
2177 | rcu_read_unlock(); | |
2178 | goto out; | |
2179 | } | |
2180 | } | |
2181 | rcu_read_unlock(); | |
2182 | } | |
2183 | out: | |
2184 | if (is_percpu) | |
2185 | migrate_enable(); | |
2186 | return num_elems; | |
2187 | } | |
2188 | ||
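/*
 * Illustrative sketch added for clarity (not part of hashtab.c): the callback
 * shape consumed by bpf_for_each_hash_elem() above, used from a BPF program
 * through the bpf_for_each_map_elem() helper.  The map name, types and the
 * attach point are assumptions for this example.  For a percpu hash the
 * callback sees the current CPU's value, which is why the walk above runs
 * with migration disabled.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, __u32);
	__type(value, __u64);
} my_hash SEC(".maps");

static __u64 sum_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *ctx)
{
	*(__u64 *)ctx += *val;
	return 0;	/* 0: continue the walk, 1: stop */
}

SEC("tracepoint/syscalls/sys_enter_getpid")
int sum_values(void *tp_ctx)
{
	__u64 sum = 0;

	/* return value is the number of elements visited (num_elems above) */
	bpf_for_each_map_elem(&my_hash, sum_cb, &sum, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";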
c317ab71 | 2189 | BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) |
40077e0c | 2190 | const struct bpf_map_ops htab_map_ops = { |
f4d05259 | 2191 | .map_meta_equal = bpf_map_meta_equal, |
9328e0d1 | 2192 | .map_alloc_check = htab_map_alloc_check, |
0f8e4bd8 AS |
2193 | .map_alloc = htab_map_alloc, |
2194 | .map_free = htab_map_free, | |
2195 | .map_get_next_key = htab_map_get_next_key, | |
68134668 | 2196 | .map_release_uref = htab_map_free_timers, |
0f8e4bd8 | 2197 | .map_lookup_elem = htab_map_lookup_elem, |
3e87f192 | 2198 | .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, |
0f8e4bd8 AS |
2199 | .map_update_elem = htab_map_update_elem, |
2200 | .map_delete_elem = htab_map_delete_elem, | |
9015d2f5 | 2201 | .map_gen_lookup = htab_map_gen_lookup, |
699c86d6 | 2202 | .map_seq_show_elem = htab_map_seq_show_elem, |
314ee05e YS |
2203 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2204 | .map_for_each_callback = bpf_for_each_hash_elem, | |
05799638 | 2205 | BATCH_OPS(htab), |
c317ab71 | 2206 | .map_btf_id = &htab_map_btf_ids[0], |
d6c4503c | 2207 | .iter_seq_info = &iter_seq_info, |
0f8e4bd8 AS |
2208 | }; |
2209 | ||
40077e0c | 2210 | const struct bpf_map_ops htab_lru_map_ops = { |
f4d05259 | 2211 | .map_meta_equal = bpf_map_meta_equal, |
9328e0d1 | 2212 | .map_alloc_check = htab_map_alloc_check, |
29ba732a MKL |
2213 | .map_alloc = htab_map_alloc, |
2214 | .map_free = htab_map_free, | |
2215 | .map_get_next_key = htab_map_get_next_key, | |
68134668 | 2216 | .map_release_uref = htab_map_free_timers, |
29ba732a | 2217 | .map_lookup_elem = htab_lru_map_lookup_elem, |
3e87f192 | 2218 | .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, |
50b045a8 | 2219 | .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, |
29ba732a MKL |
2220 | .map_update_elem = htab_lru_map_update_elem, |
2221 | .map_delete_elem = htab_lru_map_delete_elem, | |
cc555421 | 2222 | .map_gen_lookup = htab_lru_map_gen_lookup, |
699c86d6 | 2223 | .map_seq_show_elem = htab_map_seq_show_elem, |
314ee05e YS |
2224 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2225 | .map_for_each_callback = bpf_for_each_hash_elem, | |
05799638 | 2226 | BATCH_OPS(htab_lru), |
c317ab71 | 2227 | .map_btf_id = &htab_map_btf_ids[0], |
d6c4503c | 2228 | .iter_seq_info = &iter_seq_info, |
29ba732a MKL |
2229 | }; |
2230 | ||
824bd0ce AS |
2231 | /* Called from eBPF program */ |
2232 | static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) | |
2233 | { | |
2234 | struct htab_elem *l = __htab_map_lookup_elem(map, key); | |
2235 | ||
2236 | if (l) | |
2237 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); | |
2238 | else | |
2239 | return NULL; | |
2240 | } | |
2241 | ||
07343110 FZ |
2242 | static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2243 | { | |
2244 | struct htab_elem *l; | |
2245 | ||
2246 | if (cpu >= nr_cpu_ids) | |
2247 | return NULL; | |
2248 | ||
2249 | l = __htab_map_lookup_elem(map, key); | |
2250 | if (l) | |
2251 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); | |
2252 | else | |
2253 | return NULL; | |
2254 | } | |
2255 | ||
8f844938 MKL |
2256 | static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2257 | { | |
2258 | struct htab_elem *l = __htab_map_lookup_elem(map, key); | |
2259 | ||
2260 | if (l) { | |
2261 | bpf_lru_node_set_ref(&l->lru_node); | |
2262 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); | |
2263 | } | |
2264 | ||
2265 | return NULL; | |
2266 | } | |
2267 | ||
07343110 FZ |
2268 | static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2269 | { | |
2270 | struct htab_elem *l; | |
2271 | ||
2272 | if (cpu >= nr_cpu_ids) | |
2273 | return NULL; | |
2274 | ||
2275 | l = __htab_map_lookup_elem(map, key); | |
2276 | if (l) { | |
2277 | bpf_lru_node_set_ref(&l->lru_node); | |
2278 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); | |
2279 | } | |
2280 | ||
2281 | return NULL; | |
2282 | } | |
2283 | ||
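/*
 * Illustrative sketch added for clarity (not part of hashtab.c): summing a
 * per-CPU counter inside a BPF program with bpf_map_lookup_percpu_elem(),
 * which lands in the *_lookup_percpu_elem() callbacks above.  The map name,
 * types, CPU bound and attach point are assumptions for this example.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(max_entries, 128);
	__type(key, __u32);
	__type(value, __u64);
} pcpu_hash SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_getpid")
int sum_all_cpus(void *ctx)
{
	__u32 key = 0, cpu;
	__u64 sum = 0;

	for (cpu = 0; cpu < 64; cpu++) {	/* bounded loop for the verifier */
		__u64 *v = bpf_map_lookup_percpu_elem(&pcpu_hash, &key, cpu);

		/* NULL for a missing key or, as above, for cpu >= nr_cpu_ids */
		if (!v)
			break;
		sum += *v;
	}
	bpf_printk("sum across cpus: %llu", sum);
	return 0;
}

char _license[] SEC("license") = "GPL";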
15a07b33 AS |
2284 | int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) |
2285 | { | |
2286 | struct htab_elem *l; | |
2287 | void __percpu *pptr; | |
2288 | int ret = -ENOENT; | |
2289 | int cpu, off = 0; | |
2290 | u32 size; | |
2291 | ||
2292 | /* per_cpu areas are zero-filled and bpf programs can only | |
2293 | * access 'value_size' of them, so copying rounded areas | |
2294 | * will not leak any kernel data | |
2295 | */ | |
2296 | size = round_up(map->value_size, 8); | |
2297 | rcu_read_lock(); | |
2298 | l = __htab_map_lookup_elem(map, key); | |
2299 | if (!l) | |
2300 | goto out; | |
50b045a8 DB |
2301 | /* We do not mark LRU map element here in order to not mess up |
2302 | * eviction heuristics when user space does a map walk. | |
2303 | */ | |
15a07b33 AS |
2304 | pptr = htab_elem_get_ptr(l, map->key_size); |
2305 | for_each_possible_cpu(cpu) { | |
2306 | bpf_long_memcpy(value + off, | |
2307 | per_cpu_ptr(pptr, cpu), size); | |
2308 | off += size; | |
2309 | } | |
2310 | ret = 0; | |
2311 | out: | |
2312 | rcu_read_unlock(); | |
2313 | return ret; | |
2314 | } | |
2315 | ||
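/*
 * Illustrative sketch added for clarity (not part of hashtab.c): reading one
 * percpu hash element from user space.  bpf_percpu_hash_copy() above writes
 * one value slot, rounded up to 8 bytes, per possible CPU, so the buffer has
 * to be sized accordingly; an 8-byte value type keeps this example trivially
 * aligned.  Names and types are assumptions.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <linux/types.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int read_percpu_value(int map_fd, __u32 key)
{
	int ncpu = libbpf_num_possible_cpus();
	__u64 total = 0, *vals;
	int err, cpu;

	if (ncpu < 0)
		return ncpu;
	vals = calloc(ncpu, sizeof(*vals));
	if (!vals)
		return -ENOMEM;

	err = bpf_map_lookup_elem(map_fd, &key, vals);
	if (!err) {
		for (cpu = 0; cpu < ncpu; cpu++)
			total += vals[cpu];	/* aggregate across CPUs */
		printf("key %u: total %llu\n", key, (unsigned long long)total);
	}

	free(vals);
	return err;
}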
2316 | int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, | |
2317 | u64 map_flags) | |
2318 | { | |
8f844938 | 2319 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
6bbd9a05 SL |
2320 | int ret; |
2321 | ||
2322 | rcu_read_lock(); | |
8f844938 MKL |
2323 | if (htab_is_lru(htab)) |
2324 | ret = __htab_lru_percpu_map_update_elem(map, key, value, | |
2325 | map_flags, true); | |
2326 | else | |
2327 | ret = __htab_percpu_map_update_elem(map, key, value, map_flags, | |
2328 | true); | |
6bbd9a05 SL |
2329 | rcu_read_unlock(); |
2330 | ||
2331 | return ret; | |
15a07b33 AS |
2332 | } |
2333 | ||
c7b27c37 YS |
2334 | static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, |
2335 | struct seq_file *m) | |
2336 | { | |
2337 | struct htab_elem *l; | |
2338 | void __percpu *pptr; | |
2339 | int cpu; | |
2340 | ||
2341 | rcu_read_lock(); | |
2342 | ||
2343 | l = __htab_map_lookup_elem(map, key); | |
2344 | if (!l) { | |
2345 | rcu_read_unlock(); | |
2346 | return; | |
2347 | } | |
2348 | ||
2349 | btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); | |
2350 | seq_puts(m, ": {\n"); | |
2351 | pptr = htab_elem_get_ptr(l, map->key_size); | |
2352 | for_each_possible_cpu(cpu) { | |
2353 | seq_printf(m, "\tcpu%d: ", cpu); | |
2354 | btf_type_seq_show(map->btf, map->btf_value_type_id, | |
2355 | per_cpu_ptr(pptr, cpu), m); | |
2356 | seq_puts(m, "\n"); | |
2357 | } | |
2358 | seq_puts(m, "}\n"); | |
2359 | ||
2360 | rcu_read_unlock(); | |
2361 | } | |
2362 | ||
40077e0c | 2363 | const struct bpf_map_ops htab_percpu_map_ops = { |
f4d05259 | 2364 | .map_meta_equal = bpf_map_meta_equal, |
9328e0d1 | 2365 | .map_alloc_check = htab_map_alloc_check, |
824bd0ce AS |
2366 | .map_alloc = htab_map_alloc, |
2367 | .map_free = htab_map_free, | |
2368 | .map_get_next_key = htab_map_get_next_key, | |
2369 | .map_lookup_elem = htab_percpu_map_lookup_elem, | |
3e87f192 | 2370 | .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, |
824bd0ce AS |
2371 | .map_update_elem = htab_percpu_map_update_elem, |
2372 | .map_delete_elem = htab_map_delete_elem, | |
07343110 | 2373 | .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, |
c7b27c37 | 2374 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
314ee05e YS |
2375 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2376 | .map_for_each_callback = bpf_for_each_hash_elem, | |
05799638 | 2377 | BATCH_OPS(htab_percpu), |
c317ab71 | 2378 | .map_btf_id = &htab_map_btf_ids[0], |
d6c4503c | 2379 | .iter_seq_info = &iter_seq_info, |
824bd0ce AS |
2380 | }; |
2381 | ||
40077e0c | 2382 | const struct bpf_map_ops htab_lru_percpu_map_ops = { |
f4d05259 | 2383 | .map_meta_equal = bpf_map_meta_equal, |
9328e0d1 | 2384 | .map_alloc_check = htab_map_alloc_check, |
8f844938 MKL |
2385 | .map_alloc = htab_map_alloc, |
2386 | .map_free = htab_map_free, | |
2387 | .map_get_next_key = htab_map_get_next_key, | |
2388 | .map_lookup_elem = htab_lru_percpu_map_lookup_elem, | |
3e87f192 | 2389 | .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, |
8f844938 MKL |
2390 | .map_update_elem = htab_lru_percpu_map_update_elem, |
2391 | .map_delete_elem = htab_lru_map_delete_elem, | |
07343110 | 2392 | .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, |
c7b27c37 | 2393 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
314ee05e YS |
2394 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2395 | .map_for_each_callback = bpf_for_each_hash_elem, | |
05799638 | 2396 | BATCH_OPS(htab_lru_percpu), |
c317ab71 | 2397 | .map_btf_id = &htab_map_btf_ids[0], |
d6c4503c | 2398 | .iter_seq_info = &iter_seq_info, |
8f844938 MKL |
2399 | }; |
2400 | ||
9328e0d1 | 2401 | static int fd_htab_map_alloc_check(union bpf_attr *attr) |
bcc6b1b7 | 2402 | { |
bcc6b1b7 | 2403 | if (attr->value_size != sizeof(u32)) |
9328e0d1 JK |
2404 | return -EINVAL; |
2405 | return htab_map_alloc_check(attr); | |
bcc6b1b7 MKL |
2406 | } |
2407 | ||
2408 | static void fd_htab_map_free(struct bpf_map *map) | |
2409 | { | |
2410 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | |
2411 | struct hlist_nulls_node *n; | |
2412 | struct hlist_nulls_head *head; | |
2413 | struct htab_elem *l; | |
2414 | int i; | |
2415 | ||
2416 | for (i = 0; i < htab->n_buckets; i++) { | |
2417 | head = select_bucket(htab, i); | |
2418 | ||
2419 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { | |
2420 | void *ptr = fd_htab_map_get_ptr(map, l); | |
2421 | ||
2422 | map->ops->map_fd_put_ptr(ptr); | |
2423 | } | |
2424 | } | |
2425 | ||
2426 | htab_map_free(map); | |
2427 | } | |
2428 | ||
14dc6f04 MKL |
2429 | /* only called from syscall */ |
2430 | int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) | |
2431 | { | |
2432 | void **ptr; | |
2433 | int ret = 0; | |
2434 | ||
2435 | if (!map->ops->map_fd_sys_lookup_elem) | |
2436 | return -ENOTSUPP; | |
2437 | ||
2438 | rcu_read_lock(); | |
2439 | ptr = htab_map_lookup_elem(map, key); | |
2440 | if (ptr) | |
2441 | *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); | |
2442 | else | |
2443 | ret = -ENOENT; | |
2444 | rcu_read_unlock(); | |
2445 | ||
2446 | return ret; | |
2447 | } | |
2448 | ||
bcc6b1b7 MKL |
2449 | /* only called from syscall */ |
2450 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, | |
2451 | void *key, void *value, u64 map_flags) | |
2452 | { | |
2453 | void *ptr; | |
2454 | int ret; | |
2455 | u32 ufd = *(u32 *)value; | |
2456 | ||
2457 | ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); | |
2458 | if (IS_ERR(ptr)) | |
2459 | return PTR_ERR(ptr); | |
2460 | ||
2461 | ret = htab_map_update_elem(map, key, &ptr, map_flags); | |
2462 | if (ret) | |
2463 | map->ops->map_fd_put_ptr(ptr); | |
2464 | ||
2465 | return ret; | |
2466 | } | |
2467 | ||
2468 | static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) | |
2469 | { | |
2470 | struct bpf_map *map, *inner_map_meta; | |
2471 | ||
2472 | inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); | |
2473 | if (IS_ERR(inner_map_meta)) | |
2474 | return inner_map_meta; | |
2475 | ||
9328e0d1 | 2476 | map = htab_map_alloc(attr); |
bcc6b1b7 MKL |
2477 | if (IS_ERR(map)) { |
2478 | bpf_map_meta_free(inner_map_meta); | |
2479 | return map; | |
2480 | } | |
2481 | ||
2482 | map->inner_map_meta = inner_map_meta; | |
2483 | ||
2484 | return map; | |
2485 | } | |
2486 | ||
2487 | static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) | |
2488 | { | |
2489 | struct bpf_map **inner_map = htab_map_lookup_elem(map, key); | |
2490 | ||
2491 | if (!inner_map) | |
2492 | return NULL; | |
2493 | ||
2494 | return READ_ONCE(*inner_map); | |
2495 | } | |
2496 | ||
4a8f87e6 | 2497 | static int htab_of_map_gen_lookup(struct bpf_map *map, |
7b0c2a05 DB |
2498 | struct bpf_insn *insn_buf) |
2499 | { | |
2500 | struct bpf_insn *insn = insn_buf; | |
2501 | const int ret = BPF_REG_0; | |
2502 | ||
09772d92 DB |
2503 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
2504 | (void *(*)(struct bpf_map *map, void *key))NULL)); | |
3d717fad | 2505 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
7b0c2a05 DB |
2506 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); |
2507 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, | |
2508 | offsetof(struct htab_elem, key) + | |
2509 | round_up(map->key_size, 8)); | |
2510 | *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); | |
2511 | ||
2512 | return insn - insn_buf; | |
2513 | } | |
2514 | ||
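/*
 * Illustrative sketch added for clarity (not part of hashtab.c): declaring
 * and using a BPF_MAP_TYPE_HASH_OF_MAPS map, whose program-side lookups go
 * through htab_of_map_lookup_elem()/htab_of_map_gen_lookup() above.  The map
 * names, sizes and attach point are assumptions for this example; libbpf
 * populates slot 0 with the inner map fd at load time.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct inner_array {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} inner_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
	__uint(max_entries, 16);
	__type(key, __u32);
	__array(values, struct inner_array);
} outer_hash SEC(".maps") = {
	.values = { [0] = &inner_map },
};

SEC("tracepoint/syscalls/sys_enter_getpid")
int use_inner(void *ctx)
{
	__u32 okey = 0, ikey = 0;
	__u64 *val;
	void *inner;

	/* the outer lookup hands back the inner map (READ_ONCE(*inner_map)) */
	inner = bpf_map_lookup_elem(&outer_hash, &okey);
	if (!inner)
		return 0;
	val = bpf_map_lookup_elem(inner, &ikey);
	if (val)
		__sync_fetch_and_add(val, 1);
	return 0;
}

char _license[] SEC("license") = "GPL";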
bcc6b1b7 MKL |
2515 | static void htab_of_map_free(struct bpf_map *map) |
2516 | { | |
2517 | bpf_map_meta_free(map->inner_map_meta); | |
2518 | fd_htab_map_free(map); | |
2519 | } | |
2520 | ||
40077e0c | 2521 | const struct bpf_map_ops htab_of_maps_map_ops = { |
9328e0d1 | 2522 | .map_alloc_check = fd_htab_map_alloc_check, |
bcc6b1b7 MKL |
2523 | .map_alloc = htab_of_map_alloc, |
2524 | .map_free = htab_of_map_free, | |
2525 | .map_get_next_key = htab_map_get_next_key, | |
2526 | .map_lookup_elem = htab_of_map_lookup_elem, | |
2527 | .map_delete_elem = htab_map_delete_elem, | |
2528 | .map_fd_get_ptr = bpf_map_fd_get_ptr, | |
2529 | .map_fd_put_ptr = bpf_map_fd_put_ptr, | |
14dc6f04 | 2530 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, |
7b0c2a05 | 2531 | .map_gen_lookup = htab_of_map_gen_lookup, |
e8d2bec0 | 2532 | .map_check_btf = map_check_no_btf, |
9263dddc | 2533 | BATCH_OPS(htab), |
c317ab71 | 2534 | .map_btf_id = &htab_map_btf_ids[0], |
bcc6b1b7 | 2535 | }; |