]> Git Repo - linux.git/blob - kernel/bpf/hashtab.c
scsi: zfcp: Trace when request remove fails after qdio send fails
[linux.git] / kernel / bpf / hashtab.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3  * Copyright (c) 2016 Facebook
4  */
5 #include <linux/bpf.h>
6 #include <linux/btf.h>
7 #include <linux/jhash.h>
8 #include <linux/filter.h>
9 #include <linux/rculist_nulls.h>
10 #include <linux/random.h>
11 #include <uapi/linux/btf.h>
12 #include <linux/rcupdate_trace.h>
13 #include <linux/btf_ids.h>
14 #include "percpu_freelist.h"
15 #include "bpf_lru_list.h"
16 #include "map_in_map.h"
17 #include <linux/bpf_mem_alloc.h>
18
19 #define HTAB_CREATE_FLAG_MASK                                           \
20         (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |    \
21          BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
22
23 #define BATCH_OPS(_name)                        \
24         .map_lookup_batch =                     \
25         _name##_map_lookup_batch,               \
26         .map_lookup_and_delete_batch =          \
27         _name##_map_lookup_and_delete_batch,    \
28         .map_update_batch =                     \
29         generic_map_update_batch,               \
30         .map_delete_batch =                     \
31         generic_map_delete_batch
32
33 /*
34  * The bucket lock has two protection scopes:
35  *
36  * 1) Serializing concurrent operations from BPF programs on different
37  *    CPUs
38  *
39  * 2) Serializing concurrent operations from BPF programs and sys_bpf()
40  *
41  * BPF programs can execute in any context including perf, kprobes and
42  * tracing. As there are almost no limits where perf, kprobes and tracing
43  * can be invoked from the lock operations need to be protected against
44  * deadlocks. Deadlocks can be caused by recursion and by an invocation in
45  * the lock held section when functions which acquire this lock are invoked
46  * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
47  * variable bpf_prog_active, which prevents BPF programs attached to perf
48  * events, kprobes and tracing to be invoked before the prior invocation
49  * from one of these contexts completed. sys_bpf() uses the same mechanism
50  * by pinning the task to the current CPU and incrementing the recursion
51  * protection across the map operation.
52  *
53  * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
54  * operations like memory allocations (even with GFP_ATOMIC) from atomic
55  * contexts. This is required because even with GFP_ATOMIC the memory
56  * allocator calls into code paths which acquire locks with long held lock
57  * sections. To ensure the deterministic behaviour these locks are regular
58  * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
59  * true atomic contexts on an RT kernel are the low level hardware
60  * handling, scheduling, low level interrupt handling, NMIs etc. None of
61  * these contexts should ever do memory allocations.
62  *
63  * As regular device interrupt handlers and soft interrupts are forced into
64  * thread context, the existing code which does
65  *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
66  * just works.
67  *
68  * In theory the BPF locks could be converted to regular spinlocks as well,
69  * but the bucket locks and percpu_freelist locks can be taken from
70  * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
71  * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
72  * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
73  * because there is no memory allocation within the lock held sections. However
74  * after hash map was fully converted to use bpf_mem_alloc, there will be
75  * non-synchronous memory allocation for non-preallocated hash map, so it is
76  * safe to always use raw spinlock for bucket lock.
77  */
78 struct bucket {
79         struct hlist_nulls_head head;
80         raw_spinlock_t raw_lock;
81 };
82
83 #define HASHTAB_MAP_LOCK_COUNT 8
84 #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
85
86 struct bpf_htab {
87         struct bpf_map map;
88         struct bpf_mem_alloc ma;
89         struct bpf_mem_alloc pcpu_ma;
90         struct bucket *buckets;
91         void *elems;
92         union {
93                 struct pcpu_freelist freelist;
94                 struct bpf_lru lru;
95         };
96         struct htab_elem *__percpu *extra_elems;
97         /* number of elements in non-preallocated hashtable are kept
98          * in either pcount or count
99          */
100         struct percpu_counter pcount;
101         atomic_t count;
102         bool use_percpu_counter;
103         u32 n_buckets;  /* number of hash buckets */
104         u32 elem_size;  /* size of each element in bytes */
105         u32 hashrnd;
106         struct lock_class_key lockdep_key;
107         int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
108 };
109
110 /* each htab element is struct htab_elem + key + value */
111 struct htab_elem {
112         union {
113                 struct hlist_nulls_node hash_node;
114                 struct {
115                         void *padding;
116                         union {
117                                 struct pcpu_freelist_node fnode;
118                                 struct htab_elem *batch_flink;
119                         };
120                 };
121         };
122         union {
123                 /* pointer to per-cpu pointer */
124                 void *ptr_to_pptr;
125                 struct bpf_lru_node lru_node;
126         };
127         u32 hash;
128         char key[] __aligned(8);
129 };
130
131 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
132 {
133         return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
134 }
135
136 static void htab_init_buckets(struct bpf_htab *htab)
137 {
138         unsigned int i;
139
140         for (i = 0; i < htab->n_buckets; i++) {
141                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
142                 raw_spin_lock_init(&htab->buckets[i].raw_lock);
143                 lockdep_set_class(&htab->buckets[i].raw_lock,
144                                           &htab->lockdep_key);
145                 cond_resched();
146         }
147 }
148
149 static inline int htab_lock_bucket(const struct bpf_htab *htab,
150                                    struct bucket *b, u32 hash,
151                                    unsigned long *pflags)
152 {
153         unsigned long flags;
154
155         hash = hash & HASHTAB_MAP_LOCK_MASK;
156
157         preempt_disable();
158         if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
159                 __this_cpu_dec(*(htab->map_locked[hash]));
160                 preempt_enable();
161                 return -EBUSY;
162         }
163
164         raw_spin_lock_irqsave(&b->raw_lock, flags);
165         *pflags = flags;
166
167         return 0;
168 }
169
170 static inline void htab_unlock_bucket(const struct bpf_htab *htab,
171                                       struct bucket *b, u32 hash,
172                                       unsigned long flags)
173 {
174         hash = hash & HASHTAB_MAP_LOCK_MASK;
175         raw_spin_unlock_irqrestore(&b->raw_lock, flags);
176         __this_cpu_dec(*(htab->map_locked[hash]));
177         preempt_enable();
178 }
179
180 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
181
182 static bool htab_is_lru(const struct bpf_htab *htab)
183 {
184         return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
185                 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
186 }
187
188 static bool htab_is_percpu(const struct bpf_htab *htab)
189 {
190         return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
191                 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
192 }
193
194 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
195                                      void __percpu *pptr)
196 {
197         *(void __percpu **)(l->key + key_size) = pptr;
198 }
199
200 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
201 {
202         return *(void __percpu **)(l->key + key_size);
203 }
204
205 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
206 {
207         return *(void **)(l->key + roundup(map->key_size, 8));
208 }
209
210 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
211 {
212         return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
213 }
214
215 static bool htab_has_extra_elems(struct bpf_htab *htab)
216 {
217         return !htab_is_percpu(htab) && !htab_is_lru(htab);
218 }
219
220 static void htab_free_prealloced_timers(struct bpf_htab *htab)
221 {
222         u32 num_entries = htab->map.max_entries;
223         int i;
224
225         if (!btf_record_has_field(htab->map.record, BPF_TIMER))
226                 return;
227         if (htab_has_extra_elems(htab))
228                 num_entries += num_possible_cpus();
229
230         for (i = 0; i < num_entries; i++) {
231                 struct htab_elem *elem;
232
233                 elem = get_htab_elem(htab, i);
234                 bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
235                 cond_resched();
236         }
237 }
238
239 static void htab_free_prealloced_fields(struct bpf_htab *htab)
240 {
241         u32 num_entries = htab->map.max_entries;
242         int i;
243
244         if (IS_ERR_OR_NULL(htab->map.record))
245                 return;
246         if (htab_has_extra_elems(htab))
247                 num_entries += num_possible_cpus();
248         for (i = 0; i < num_entries; i++) {
249                 struct htab_elem *elem;
250
251                 elem = get_htab_elem(htab, i);
252                 bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
253                 cond_resched();
254         }
255 }
256
257 static void htab_free_elems(struct bpf_htab *htab)
258 {
259         int i;
260
261         if (!htab_is_percpu(htab))
262                 goto free_elems;
263
264         for (i = 0; i < htab->map.max_entries; i++) {
265                 void __percpu *pptr;
266
267                 pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
268                                          htab->map.key_size);
269                 free_percpu(pptr);
270                 cond_resched();
271         }
272 free_elems:
273         bpf_map_area_free(htab->elems);
274 }
275
276 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock
277  * (bucket_lock). If both locks need to be acquired together, the lock
278  * order is always lru_lock -> bucket_lock and this only happens in
279  * bpf_lru_list.c logic. For example, certain code path of
280  * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
281  * will acquire lru_lock first followed by acquiring bucket_lock.
282  *
283  * In hashtab.c, to avoid deadlock, lock acquisition of
284  * bucket_lock followed by lru_lock is not allowed. In such cases,
285  * bucket_lock needs to be released first before acquiring lru_lock.
286  */
287 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
288                                           u32 hash)
289 {
290         struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
291         struct htab_elem *l;
292
293         if (node) {
294                 l = container_of(node, struct htab_elem, lru_node);
295                 memcpy(l->key, key, htab->map.key_size);
296                 return l;
297         }
298
299         return NULL;
300 }
301
302 static int prealloc_init(struct bpf_htab *htab)
303 {
304         u32 num_entries = htab->map.max_entries;
305         int err = -ENOMEM, i;
306
307         if (htab_has_extra_elems(htab))
308                 num_entries += num_possible_cpus();
309
310         htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
311                                          htab->map.numa_node);
312         if (!htab->elems)
313                 return -ENOMEM;
314
315         if (!htab_is_percpu(htab))
316                 goto skip_percpu_elems;
317
318         for (i = 0; i < num_entries; i++) {
319                 u32 size = round_up(htab->map.value_size, 8);
320                 void __percpu *pptr;
321
322                 pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
323                                             GFP_USER | __GFP_NOWARN);
324                 if (!pptr)
325                         goto free_elems;
326                 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
327                                   pptr);
328                 cond_resched();
329         }
330
331 skip_percpu_elems:
332         if (htab_is_lru(htab))
333                 err = bpf_lru_init(&htab->lru,
334                                    htab->map.map_flags & BPF_F_NO_COMMON_LRU,
335                                    offsetof(struct htab_elem, hash) -
336                                    offsetof(struct htab_elem, lru_node),
337                                    htab_lru_map_delete_node,
338                                    htab);
339         else
340                 err = pcpu_freelist_init(&htab->freelist);
341
342         if (err)
343                 goto free_elems;
344
345         if (htab_is_lru(htab))
346                 bpf_lru_populate(&htab->lru, htab->elems,
347                                  offsetof(struct htab_elem, lru_node),
348                                  htab->elem_size, num_entries);
349         else
350                 pcpu_freelist_populate(&htab->freelist,
351                                        htab->elems + offsetof(struct htab_elem, fnode),
352                                        htab->elem_size, num_entries);
353
354         return 0;
355
356 free_elems:
357         htab_free_elems(htab);
358         return err;
359 }
360
361 static void prealloc_destroy(struct bpf_htab *htab)
362 {
363         htab_free_elems(htab);
364
365         if (htab_is_lru(htab))
366                 bpf_lru_destroy(&htab->lru);
367         else
368                 pcpu_freelist_destroy(&htab->freelist);
369 }
370
371 static int alloc_extra_elems(struct bpf_htab *htab)
372 {
373         struct htab_elem *__percpu *pptr, *l_new;
374         struct pcpu_freelist_node *l;
375         int cpu;
376
377         pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
378                                     GFP_USER | __GFP_NOWARN);
379         if (!pptr)
380                 return -ENOMEM;
381
382         for_each_possible_cpu(cpu) {
383                 l = pcpu_freelist_pop(&htab->freelist);
384                 /* pop will succeed, since prealloc_init()
385                  * preallocated extra num_possible_cpus elements
386                  */
387                 l_new = container_of(l, struct htab_elem, fnode);
388                 *per_cpu_ptr(pptr, cpu) = l_new;
389         }
390         htab->extra_elems = pptr;
391         return 0;
392 }
393
394 /* Called from syscall */
395 static int htab_map_alloc_check(union bpf_attr *attr)
396 {
397         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
398                        attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
399         bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
400                     attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
401         /* percpu_lru means each cpu has its own LRU list.
402          * it is different from BPF_MAP_TYPE_PERCPU_HASH where
403          * the map's value itself is percpu.  percpu_lru has
404          * nothing to do with the map's value.
405          */
406         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
407         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
408         bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
409         int numa_node = bpf_map_attr_numa_node(attr);
410
411         BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
412                      offsetof(struct htab_elem, hash_node.pprev));
413
414         if (lru && !bpf_capable())
415                 /* LRU implementation is much complicated than other
416                  * maps.  Hence, limit to CAP_BPF.
417                  */
418                 return -EPERM;
419
420         if (zero_seed && !capable(CAP_SYS_ADMIN))
421                 /* Guard against local DoS, and discourage production use. */
422                 return -EPERM;
423
424         if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
425             !bpf_map_flags_access_ok(attr->map_flags))
426                 return -EINVAL;
427
428         if (!lru && percpu_lru)
429                 return -EINVAL;
430
431         if (lru && !prealloc)
432                 return -ENOTSUPP;
433
434         if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
435                 return -EINVAL;
436
437         /* check sanity of attributes.
438          * value_size == 0 may be allowed in the future to use map as a set
439          */
440         if (attr->max_entries == 0 || attr->key_size == 0 ||
441             attr->value_size == 0)
442                 return -EINVAL;
443
444         if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
445            sizeof(struct htab_elem))
446                 /* if key_size + value_size is bigger, the user space won't be
447                  * able to access the elements via bpf syscall. This check
448                  * also makes sure that the elem_size doesn't overflow and it's
449                  * kmalloc-able later in htab_map_update_elem()
450                  */
451                 return -E2BIG;
452
453         return 0;
454 }
455
456 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
457 {
458         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
459                        attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
460         bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
461                     attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
462         /* percpu_lru means each cpu has its own LRU list.
463          * it is different from BPF_MAP_TYPE_PERCPU_HASH where
464          * the map's value itself is percpu.  percpu_lru has
465          * nothing to do with the map's value.
466          */
467         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
468         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
469         struct bpf_htab *htab;
470         int err, i;
471
472         htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
473         if (!htab)
474                 return ERR_PTR(-ENOMEM);
475
476         lockdep_register_key(&htab->lockdep_key);
477
478         bpf_map_init_from_attr(&htab->map, attr);
479
480         if (percpu_lru) {
481                 /* ensure each CPU's lru list has >=1 elements.
482                  * since we are at it, make each lru list has the same
483                  * number of elements.
484                  */
485                 htab->map.max_entries = roundup(attr->max_entries,
486                                                 num_possible_cpus());
487                 if (htab->map.max_entries < attr->max_entries)
488                         htab->map.max_entries = rounddown(attr->max_entries,
489                                                           num_possible_cpus());
490         }
491
492         /* hash table size must be power of 2 */
493         htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
494
495         htab->elem_size = sizeof(struct htab_elem) +
496                           round_up(htab->map.key_size, 8);
497         if (percpu)
498                 htab->elem_size += sizeof(void *);
499         else
500                 htab->elem_size += round_up(htab->map.value_size, 8);
501
502         err = -E2BIG;
503         /* prevent zero size kmalloc and check for u32 overflow */
504         if (htab->n_buckets == 0 ||
505             htab->n_buckets > U32_MAX / sizeof(struct bucket))
506                 goto free_htab;
507
508         err = -ENOMEM;
509         htab->buckets = bpf_map_area_alloc(htab->n_buckets *
510                                            sizeof(struct bucket),
511                                            htab->map.numa_node);
512         if (!htab->buckets)
513                 goto free_htab;
514
515         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
516                 htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
517                                                            sizeof(int),
518                                                            sizeof(int),
519                                                            GFP_USER);
520                 if (!htab->map_locked[i])
521                         goto free_map_locked;
522         }
523
524         if (htab->map.map_flags & BPF_F_ZERO_SEED)
525                 htab->hashrnd = 0;
526         else
527                 htab->hashrnd = get_random_u32();
528
529         htab_init_buckets(htab);
530
531 /* compute_batch_value() computes batch value as num_online_cpus() * 2
532  * and __percpu_counter_compare() needs
533  * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
534  * for percpu_counter to be faster than atomic_t. In practice the average bpf
535  * hash map size is 10k, which means that a system with 64 cpus will fill
536  * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
537  * define our own batch count as 32 then 10k hash map can be filled up to 80%:
538  * 10k - 8k > 32 _batch_ * 64 _cpus_
539  * and __percpu_counter_compare() will still be fast. At that point hash map
540  * collisions will dominate its performance anyway. Assume that hash map filled
541  * to 50+% isn't going to be O(1) and use the following formula to choose
542  * between percpu_counter and atomic_t.
543  */
544 #define PERCPU_COUNTER_BATCH 32
545         if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
546                 htab->use_percpu_counter = true;
547
548         if (htab->use_percpu_counter) {
549                 err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
550                 if (err)
551                         goto free_map_locked;
552         }
553
554         if (prealloc) {
555                 err = prealloc_init(htab);
556                 if (err)
557                         goto free_map_locked;
558
559                 if (!percpu && !lru) {
560                         /* lru itself can remove the least used element, so
561                          * there is no need for an extra elem during map_update.
562                          */
563                         err = alloc_extra_elems(htab);
564                         if (err)
565                                 goto free_prealloc;
566                 }
567         } else {
568                 err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
569                 if (err)
570                         goto free_map_locked;
571                 if (percpu) {
572                         err = bpf_mem_alloc_init(&htab->pcpu_ma,
573                                                  round_up(htab->map.value_size, 8), true);
574                         if (err)
575                                 goto free_map_locked;
576                 }
577         }
578
579         return &htab->map;
580
581 free_prealloc:
582         prealloc_destroy(htab);
583 free_map_locked:
584         if (htab->use_percpu_counter)
585                 percpu_counter_destroy(&htab->pcount);
586         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
587                 free_percpu(htab->map_locked[i]);
588         bpf_map_area_free(htab->buckets);
589         bpf_mem_alloc_destroy(&htab->pcpu_ma);
590         bpf_mem_alloc_destroy(&htab->ma);
591 free_htab:
592         lockdep_unregister_key(&htab->lockdep_key);
593         bpf_map_area_free(htab);
594         return ERR_PTR(err);
595 }
596
597 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
598 {
599         return jhash(key, key_len, hashrnd);
600 }
601
602 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
603 {
604         return &htab->buckets[hash & (htab->n_buckets - 1)];
605 }
606
607 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
608 {
609         return &__select_bucket(htab, hash)->head;
610 }
611
612 /* this lookup function can only be called with bucket lock taken */
613 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
614                                          void *key, u32 key_size)
615 {
616         struct hlist_nulls_node *n;
617         struct htab_elem *l;
618
619         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
620                 if (l->hash == hash && !memcmp(&l->key, key, key_size))
621                         return l;
622
623         return NULL;
624 }
625
626 /* can be called without bucket lock. it will repeat the loop in
627  * the unlikely event when elements moved from one bucket into another
628  * while link list is being walked
629  */
630 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
631                                                u32 hash, void *key,
632                                                u32 key_size, u32 n_buckets)
633 {
634         struct hlist_nulls_node *n;
635         struct htab_elem *l;
636
637 again:
638         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
639                 if (l->hash == hash && !memcmp(&l->key, key, key_size))
640                         return l;
641
642         if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
643                 goto again;
644
645         return NULL;
646 }
647
648 /* Called from syscall or from eBPF program directly, so
649  * arguments have to match bpf_map_lookup_elem() exactly.
650  * The return value is adjusted by BPF instructions
651  * in htab_map_gen_lookup().
652  */
653 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
654 {
655         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
656         struct hlist_nulls_head *head;
657         struct htab_elem *l;
658         u32 hash, key_size;
659
660         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
661                      !rcu_read_lock_bh_held());
662
663         key_size = map->key_size;
664
665         hash = htab_map_hash(key, key_size, htab->hashrnd);
666
667         head = select_bucket(htab, hash);
668
669         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
670
671         return l;
672 }
673
674 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
675 {
676         struct htab_elem *l = __htab_map_lookup_elem(map, key);
677
678         if (l)
679                 return l->key + round_up(map->key_size, 8);
680
681         return NULL;
682 }
683
684 /* inline bpf_map_lookup_elem() call.
685  * Instead of:
686  * bpf_prog
687  *   bpf_map_lookup_elem
688  *     map->ops->map_lookup_elem
689  *       htab_map_lookup_elem
690  *         __htab_map_lookup_elem
691  * do:
692  * bpf_prog
693  *   __htab_map_lookup_elem
694  */
695 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
696 {
697         struct bpf_insn *insn = insn_buf;
698         const int ret = BPF_REG_0;
699
700         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
701                      (void *(*)(struct bpf_map *map, void *key))NULL));
702         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
703         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
704         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
705                                 offsetof(struct htab_elem, key) +
706                                 round_up(map->key_size, 8));
707         return insn - insn_buf;
708 }
709
710 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
711                                                         void *key, const bool mark)
712 {
713         struct htab_elem *l = __htab_map_lookup_elem(map, key);
714
715         if (l) {
716                 if (mark)
717                         bpf_lru_node_set_ref(&l->lru_node);
718                 return l->key + round_up(map->key_size, 8);
719         }
720
721         return NULL;
722 }
723
724 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
725 {
726         return __htab_lru_map_lookup_elem(map, key, true);
727 }
728
729 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
730 {
731         return __htab_lru_map_lookup_elem(map, key, false);
732 }
733
734 static int htab_lru_map_gen_lookup(struct bpf_map *map,
735                                    struct bpf_insn *insn_buf)
736 {
737         struct bpf_insn *insn = insn_buf;
738         const int ret = BPF_REG_0;
739         const int ref_reg = BPF_REG_1;
740
741         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
742                      (void *(*)(struct bpf_map *map, void *key))NULL));
743         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
744         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
745         *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
746                               offsetof(struct htab_elem, lru_node) +
747                               offsetof(struct bpf_lru_node, ref));
748         *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
749         *insn++ = BPF_ST_MEM(BPF_B, ret,
750                              offsetof(struct htab_elem, lru_node) +
751                              offsetof(struct bpf_lru_node, ref),
752                              1);
753         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
754                                 offsetof(struct htab_elem, key) +
755                                 round_up(map->key_size, 8));
756         return insn - insn_buf;
757 }
758
759 static void check_and_free_fields(struct bpf_htab *htab,
760                                   struct htab_elem *elem)
761 {
762         void *map_value = elem->key + round_up(htab->map.key_size, 8);
763
764         bpf_obj_free_fields(htab->map.record, map_value);
765 }
766
767 /* It is called from the bpf_lru_list when the LRU needs to delete
768  * older elements from the htab.
769  */
770 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
771 {
772         struct bpf_htab *htab = arg;
773         struct htab_elem *l = NULL, *tgt_l;
774         struct hlist_nulls_head *head;
775         struct hlist_nulls_node *n;
776         unsigned long flags;
777         struct bucket *b;
778         int ret;
779
780         tgt_l = container_of(node, struct htab_elem, lru_node);
781         b = __select_bucket(htab, tgt_l->hash);
782         head = &b->head;
783
784         ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
785         if (ret)
786                 return false;
787
788         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
789                 if (l == tgt_l) {
790                         hlist_nulls_del_rcu(&l->hash_node);
791                         check_and_free_fields(htab, l);
792                         break;
793                 }
794
795         htab_unlock_bucket(htab, b, tgt_l->hash, flags);
796
797         return l == tgt_l;
798 }
799
800 /* Called from syscall */
801 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
802 {
803         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
804         struct hlist_nulls_head *head;
805         struct htab_elem *l, *next_l;
806         u32 hash, key_size;
807         int i = 0;
808
809         WARN_ON_ONCE(!rcu_read_lock_held());
810
811         key_size = map->key_size;
812
813         if (!key)
814                 goto find_first_elem;
815
816         hash = htab_map_hash(key, key_size, htab->hashrnd);
817
818         head = select_bucket(htab, hash);
819
820         /* lookup the key */
821         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
822
823         if (!l)
824                 goto find_first_elem;
825
826         /* key was found, get next key in the same bucket */
827         next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
828                                   struct htab_elem, hash_node);
829
830         if (next_l) {
831                 /* if next elem in this hash list is non-zero, just return it */
832                 memcpy(next_key, next_l->key, key_size);
833                 return 0;
834         }
835
836         /* no more elements in this hash list, go to the next bucket */
837         i = hash & (htab->n_buckets - 1);
838         i++;
839
840 find_first_elem:
841         /* iterate over buckets */
842         for (; i < htab->n_buckets; i++) {
843                 head = select_bucket(htab, i);
844
845                 /* pick first element in the bucket */
846                 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
847                                           struct htab_elem, hash_node);
848                 if (next_l) {
849                         /* if it's not empty, just return it */
850                         memcpy(next_key, next_l->key, key_size);
851                         return 0;
852                 }
853         }
854
855         /* iterated over all buckets and all elements */
856         return -ENOENT;
857 }
858
859 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
860 {
861         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
862                 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
863         check_and_free_fields(htab, l);
864         bpf_mem_cache_free(&htab->ma, l);
865 }
866
867 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
868 {
869         struct bpf_map *map = &htab->map;
870         void *ptr;
871
872         if (map->ops->map_fd_put_ptr) {
873                 ptr = fd_htab_map_get_ptr(map, l);
874                 map->ops->map_fd_put_ptr(ptr);
875         }
876 }
877
878 static bool is_map_full(struct bpf_htab *htab)
879 {
880         if (htab->use_percpu_counter)
881                 return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
882                                                 PERCPU_COUNTER_BATCH) >= 0;
883         return atomic_read(&htab->count) >= htab->map.max_entries;
884 }
885
886 static void inc_elem_count(struct bpf_htab *htab)
887 {
888         if (htab->use_percpu_counter)
889                 percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
890         else
891                 atomic_inc(&htab->count);
892 }
893
894 static void dec_elem_count(struct bpf_htab *htab)
895 {
896         if (htab->use_percpu_counter)
897                 percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
898         else
899                 atomic_dec(&htab->count);
900 }
901
902
903 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
904 {
905         htab_put_fd_value(htab, l);
906
907         if (htab_is_prealloc(htab)) {
908                 check_and_free_fields(htab, l);
909                 __pcpu_freelist_push(&htab->freelist, &l->fnode);
910         } else {
911                 dec_elem_count(htab);
912                 htab_elem_free(htab, l);
913         }
914 }
915
916 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
917                             void *value, bool onallcpus)
918 {
919         if (!onallcpus) {
920                 /* copy true value_size bytes */
921                 memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
922         } else {
923                 u32 size = round_up(htab->map.value_size, 8);
924                 int off = 0, cpu;
925
926                 for_each_possible_cpu(cpu) {
927                         bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
928                                         value + off, size);
929                         off += size;
930                 }
931         }
932 }
933
934 static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
935                             void *value, bool onallcpus)
936 {
937         /* When not setting the initial value on all cpus, zero-fill element
938          * values for other cpus. Otherwise, bpf program has no way to ensure
939          * known initial values for cpus other than current one
940          * (onallcpus=false always when coming from bpf prog).
941          */
942         if (!onallcpus) {
943                 u32 size = round_up(htab->map.value_size, 8);
944                 int current_cpu = raw_smp_processor_id();
945                 int cpu;
946
947                 for_each_possible_cpu(cpu) {
948                         if (cpu == current_cpu)
949                                 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
950                                                 size);
951                         else
952                                 memset(per_cpu_ptr(pptr, cpu), 0, size);
953                 }
954         } else {
955                 pcpu_copy_value(htab, pptr, value, onallcpus);
956         }
957 }
958
959 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
960 {
961         return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
962                BITS_PER_LONG == 64;
963 }
964
965 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
966                                          void *value, u32 key_size, u32 hash,
967                                          bool percpu, bool onallcpus,
968                                          struct htab_elem *old_elem)
969 {
970         u32 size = htab->map.value_size;
971         bool prealloc = htab_is_prealloc(htab);
972         struct htab_elem *l_new, **pl_new;
973         void __percpu *pptr;
974
975         if (prealloc) {
976                 if (old_elem) {
977                         /* if we're updating the existing element,
978                          * use per-cpu extra elems to avoid freelist_pop/push
979                          */
980                         pl_new = this_cpu_ptr(htab->extra_elems);
981                         l_new = *pl_new;
982                         htab_put_fd_value(htab, old_elem);
983                         *pl_new = old_elem;
984                 } else {
985                         struct pcpu_freelist_node *l;
986
987                         l = __pcpu_freelist_pop(&htab->freelist);
988                         if (!l)
989                                 return ERR_PTR(-E2BIG);
990                         l_new = container_of(l, struct htab_elem, fnode);
991                 }
992         } else {
993                 if (is_map_full(htab))
994                         if (!old_elem)
995                                 /* when map is full and update() is replacing
996                                  * old element, it's ok to allocate, since
997                                  * old element will be freed immediately.
998                                  * Otherwise return an error
999                                  */
1000                                 return ERR_PTR(-E2BIG);
1001                 inc_elem_count(htab);
1002                 l_new = bpf_mem_cache_alloc(&htab->ma);
1003                 if (!l_new) {
1004                         l_new = ERR_PTR(-ENOMEM);
1005                         goto dec_count;
1006                 }
1007                 check_and_init_map_value(&htab->map,
1008                                          l_new->key + round_up(key_size, 8));
1009         }
1010
1011         memcpy(l_new->key, key, key_size);
1012         if (percpu) {
1013                 if (prealloc) {
1014                         pptr = htab_elem_get_ptr(l_new, key_size);
1015                 } else {
1016                         /* alloc_percpu zero-fills */
1017                         pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
1018                         if (!pptr) {
1019                                 bpf_mem_cache_free(&htab->ma, l_new);
1020                                 l_new = ERR_PTR(-ENOMEM);
1021                                 goto dec_count;
1022                         }
1023                         l_new->ptr_to_pptr = pptr;
1024                         pptr = *(void **)pptr;
1025                 }
1026
1027                 pcpu_init_value(htab, pptr, value, onallcpus);
1028
1029                 if (!prealloc)
1030                         htab_elem_set_ptr(l_new, key_size, pptr);
1031         } else if (fd_htab_map_needs_adjust(htab)) {
1032                 size = round_up(size, 8);
1033                 memcpy(l_new->key + round_up(key_size, 8), value, size);
1034         } else {
1035                 copy_map_value(&htab->map,
1036                                l_new->key + round_up(key_size, 8),
1037                                value);
1038         }
1039
1040         l_new->hash = hash;
1041         return l_new;
1042 dec_count:
1043         dec_elem_count(htab);
1044         return l_new;
1045 }
1046
1047 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1048                        u64 map_flags)
1049 {
1050         if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1051                 /* elem already exists */
1052                 return -EEXIST;
1053
1054         if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1055                 /* elem doesn't exist, cannot update it */
1056                 return -ENOENT;
1057
1058         return 0;
1059 }
1060
1061 /* Called from syscall or from eBPF program */
1062 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1063                                 u64 map_flags)
1064 {
1065         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1066         struct htab_elem *l_new = NULL, *l_old;
1067         struct hlist_nulls_head *head;
1068         unsigned long flags;
1069         struct bucket *b;
1070         u32 key_size, hash;
1071         int ret;
1072
1073         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1074                 /* unknown flags */
1075                 return -EINVAL;
1076
1077         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1078                      !rcu_read_lock_bh_held());
1079
1080         key_size = map->key_size;
1081
1082         hash = htab_map_hash(key, key_size, htab->hashrnd);
1083
1084         b = __select_bucket(htab, hash);
1085         head = &b->head;
1086
1087         if (unlikely(map_flags & BPF_F_LOCK)) {
1088                 if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1089                         return -EINVAL;
1090                 /* find an element without taking the bucket lock */
1091                 l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1092                                               htab->n_buckets);
1093                 ret = check_flags(htab, l_old, map_flags);
1094                 if (ret)
1095                         return ret;
1096                 if (l_old) {
1097                         /* grab the element lock and update value in place */
1098                         copy_map_value_locked(map,
1099                                               l_old->key + round_up(key_size, 8),
1100                                               value, false);
1101                         return 0;
1102                 }
1103                 /* fall through, grab the bucket lock and lookup again.
1104                  * 99.9% chance that the element won't be found,
1105                  * but second lookup under lock has to be done.
1106                  */
1107         }
1108
1109         ret = htab_lock_bucket(htab, b, hash, &flags);
1110         if (ret)
1111                 return ret;
1112
1113         l_old = lookup_elem_raw(head, hash, key, key_size);
1114
1115         ret = check_flags(htab, l_old, map_flags);
1116         if (ret)
1117                 goto err;
1118
1119         if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1120                 /* first lookup without the bucket lock didn't find the element,
1121                  * but second lookup with the bucket lock found it.
1122                  * This case is highly unlikely, but has to be dealt with:
1123                  * grab the element lock in addition to the bucket lock
1124                  * and update element in place
1125                  */
1126                 copy_map_value_locked(map,
1127                                       l_old->key + round_up(key_size, 8),
1128                                       value, false);
1129                 ret = 0;
1130                 goto err;
1131         }
1132
1133         l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1134                                 l_old);
1135         if (IS_ERR(l_new)) {
1136                 /* all pre-allocated elements are in use or memory exhausted */
1137                 ret = PTR_ERR(l_new);
1138                 goto err;
1139         }
1140
1141         /* add new element to the head of the list, so that
1142          * concurrent search will find it before old elem
1143          */
1144         hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1145         if (l_old) {
1146                 hlist_nulls_del_rcu(&l_old->hash_node);
1147                 if (!htab_is_prealloc(htab))
1148                         free_htab_elem(htab, l_old);
1149                 else
1150                         check_and_free_fields(htab, l_old);
1151         }
1152         ret = 0;
1153 err:
1154         htab_unlock_bucket(htab, b, hash, flags);
1155         return ret;
1156 }
1157
1158 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1159 {
1160         check_and_free_fields(htab, elem);
1161         bpf_lru_push_free(&htab->lru, &elem->lru_node);
1162 }
1163
1164 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1165                                     u64 map_flags)
1166 {
1167         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1168         struct htab_elem *l_new, *l_old = NULL;
1169         struct hlist_nulls_head *head;
1170         unsigned long flags;
1171         struct bucket *b;
1172         u32 key_size, hash;
1173         int ret;
1174
1175         if (unlikely(map_flags > BPF_EXIST))
1176                 /* unknown flags */
1177                 return -EINVAL;
1178
1179         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1180                      !rcu_read_lock_bh_held());
1181
1182         key_size = map->key_size;
1183
1184         hash = htab_map_hash(key, key_size, htab->hashrnd);
1185
1186         b = __select_bucket(htab, hash);
1187         head = &b->head;
1188
1189         /* For LRU, we need to alloc before taking bucket's
1190          * spinlock because getting free nodes from LRU may need
1191          * to remove older elements from htab and this removal
1192          * operation will need a bucket lock.
1193          */
1194         l_new = prealloc_lru_pop(htab, key, hash);
1195         if (!l_new)
1196                 return -ENOMEM;
1197         copy_map_value(&htab->map,
1198                        l_new->key + round_up(map->key_size, 8), value);
1199
1200         ret = htab_lock_bucket(htab, b, hash, &flags);
1201         if (ret)
1202                 return ret;
1203
1204         l_old = lookup_elem_raw(head, hash, key, key_size);
1205
1206         ret = check_flags(htab, l_old, map_flags);
1207         if (ret)
1208                 goto err;
1209
1210         /* add new element to the head of the list, so that
1211          * concurrent search will find it before old elem
1212          */
1213         hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1214         if (l_old) {
1215                 bpf_lru_node_set_ref(&l_new->lru_node);
1216                 hlist_nulls_del_rcu(&l_old->hash_node);
1217         }
1218         ret = 0;
1219
1220 err:
1221         htab_unlock_bucket(htab, b, hash, flags);
1222
1223         if (ret)
1224                 htab_lru_push_free(htab, l_new);
1225         else if (l_old)
1226                 htab_lru_push_free(htab, l_old);
1227
1228         return ret;
1229 }
1230
1231 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1232                                          void *value, u64 map_flags,
1233                                          bool onallcpus)
1234 {
1235         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1236         struct htab_elem *l_new = NULL, *l_old;
1237         struct hlist_nulls_head *head;
1238         unsigned long flags;
1239         struct bucket *b;
1240         u32 key_size, hash;
1241         int ret;
1242
1243         if (unlikely(map_flags > BPF_EXIST))
1244                 /* unknown flags */
1245                 return -EINVAL;
1246
1247         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1248                      !rcu_read_lock_bh_held());
1249
1250         key_size = map->key_size;
1251
1252         hash = htab_map_hash(key, key_size, htab->hashrnd);
1253
1254         b = __select_bucket(htab, hash);
1255         head = &b->head;
1256
1257         ret = htab_lock_bucket(htab, b, hash, &flags);
1258         if (ret)
1259                 return ret;
1260
1261         l_old = lookup_elem_raw(head, hash, key, key_size);
1262
1263         ret = check_flags(htab, l_old, map_flags);
1264         if (ret)
1265                 goto err;
1266
1267         if (l_old) {
1268                 /* per-cpu hash map can update value in-place */
1269                 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1270                                 value, onallcpus);
1271         } else {
1272                 l_new = alloc_htab_elem(htab, key, value, key_size,
1273                                         hash, true, onallcpus, NULL);
1274                 if (IS_ERR(l_new)) {
1275                         ret = PTR_ERR(l_new);
1276                         goto err;
1277                 }
1278                 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1279         }
1280         ret = 0;
1281 err:
1282         htab_unlock_bucket(htab, b, hash, flags);
1283         return ret;
1284 }
1285
1286 static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1287                                              void *value, u64 map_flags,
1288                                              bool onallcpus)
1289 {
1290         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1291         struct htab_elem *l_new = NULL, *l_old;
1292         struct hlist_nulls_head *head;
1293         unsigned long flags;
1294         struct bucket *b;
1295         u32 key_size, hash;
1296         int ret;
1297
1298         if (unlikely(map_flags > BPF_EXIST))
1299                 /* unknown flags */
1300                 return -EINVAL;
1301
1302         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1303                      !rcu_read_lock_bh_held());
1304
1305         key_size = map->key_size;
1306
1307         hash = htab_map_hash(key, key_size, htab->hashrnd);
1308
1309         b = __select_bucket(htab, hash);
1310         head = &b->head;
1311
1312         /* For LRU, we need to alloc before taking bucket's
1313          * spinlock because LRU's elem alloc may need
1314          * to remove older elem from htab and this removal
1315          * operation will need a bucket lock.
1316          */
1317         if (map_flags != BPF_EXIST) {
1318                 l_new = prealloc_lru_pop(htab, key, hash);
1319                 if (!l_new)
1320                         return -ENOMEM;
1321         }
1322
1323         ret = htab_lock_bucket(htab, b, hash, &flags);
1324         if (ret)
1325                 return ret;
1326
1327         l_old = lookup_elem_raw(head, hash, key, key_size);
1328
1329         ret = check_flags(htab, l_old, map_flags);
1330         if (ret)
1331                 goto err;
1332
1333         if (l_old) {
1334                 bpf_lru_node_set_ref(&l_old->lru_node);
1335
1336                 /* per-cpu hash map can update value in-place */
1337                 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1338                                 value, onallcpus);
1339         } else {
1340                 pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1341                                 value, onallcpus);
1342                 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1343                 l_new = NULL;
1344         }
1345         ret = 0;
1346 err:
1347         htab_unlock_bucket(htab, b, hash, flags);
1348         if (l_new)
1349                 bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1350         return ret;
1351 }
1352
1353 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1354                                        void *value, u64 map_flags)
1355 {
1356         return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
1357 }
1358
1359 static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1360                                            void *value, u64 map_flags)
1361 {
1362         return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1363                                                  false);
1364 }
1365
1366 /* Called from syscall or from eBPF program */
1367 static int htab_map_delete_elem(struct bpf_map *map, void *key)
1368 {
1369         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1370         struct hlist_nulls_head *head;
1371         struct bucket *b;
1372         struct htab_elem *l;
1373         unsigned long flags;
1374         u32 hash, key_size;
1375         int ret;
1376
1377         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1378                      !rcu_read_lock_bh_held());
1379
1380         key_size = map->key_size;
1381
1382         hash = htab_map_hash(key, key_size, htab->hashrnd);
1383         b = __select_bucket(htab, hash);
1384         head = &b->head;
1385
1386         ret = htab_lock_bucket(htab, b, hash, &flags);
1387         if (ret)
1388                 return ret;
1389
1390         l = lookup_elem_raw(head, hash, key, key_size);
1391
1392         if (l) {
1393                 hlist_nulls_del_rcu(&l->hash_node);
1394                 free_htab_elem(htab, l);
1395         } else {
1396                 ret = -ENOENT;
1397         }
1398
1399         htab_unlock_bucket(htab, b, hash, flags);
1400         return ret;
1401 }
1402
1403 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1404 {
1405         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1406         struct hlist_nulls_head *head;
1407         struct bucket *b;
1408         struct htab_elem *l;
1409         unsigned long flags;
1410         u32 hash, key_size;
1411         int ret;
1412
1413         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1414                      !rcu_read_lock_bh_held());
1415
1416         key_size = map->key_size;
1417
1418         hash = htab_map_hash(key, key_size, htab->hashrnd);
1419         b = __select_bucket(htab, hash);
1420         head = &b->head;
1421
1422         ret = htab_lock_bucket(htab, b, hash, &flags);
1423         if (ret)
1424                 return ret;
1425
1426         l = lookup_elem_raw(head, hash, key, key_size);
1427
1428         if (l)
1429                 hlist_nulls_del_rcu(&l->hash_node);
1430         else
1431                 ret = -ENOENT;
1432
1433         htab_unlock_bucket(htab, b, hash, flags);
1434         if (l)
1435                 htab_lru_push_free(htab, l);
1436         return ret;
1437 }
1438
1439 static void delete_all_elements(struct bpf_htab *htab)
1440 {
1441         int i;
1442
1443         /* It's called from a worker thread, so disable migration here,
1444          * since bpf_mem_cache_free() relies on that.
1445          */
1446         migrate_disable();
1447         for (i = 0; i < htab->n_buckets; i++) {
1448                 struct hlist_nulls_head *head = select_bucket(htab, i);
1449                 struct hlist_nulls_node *n;
1450                 struct htab_elem *l;
1451
1452                 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1453                         hlist_nulls_del_rcu(&l->hash_node);
1454                         htab_elem_free(htab, l);
1455                 }
1456         }
1457         migrate_enable();
1458 }
1459
1460 static void htab_free_malloced_timers(struct bpf_htab *htab)
1461 {
1462         int i;
1463
1464         rcu_read_lock();
1465         for (i = 0; i < htab->n_buckets; i++) {
1466                 struct hlist_nulls_head *head = select_bucket(htab, i);
1467                 struct hlist_nulls_node *n;
1468                 struct htab_elem *l;
1469
1470                 hlist_nulls_for_each_entry(l, n, head, hash_node) {
1471                         /* We only free timer on uref dropping to zero */
1472                         bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
1473                 }
1474                 cond_resched_rcu();
1475         }
1476         rcu_read_unlock();
1477 }
1478
1479 static void htab_map_free_timers(struct bpf_map *map)
1480 {
1481         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1482
1483         /* We only free timer on uref dropping to zero */
1484         if (!btf_record_has_field(htab->map.record, BPF_TIMER))
1485                 return;
1486         if (!htab_is_prealloc(htab))
1487                 htab_free_malloced_timers(htab);
1488         else
1489                 htab_free_prealloced_timers(htab);
1490 }
1491
1492 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1493 static void htab_map_free(struct bpf_map *map)
1494 {
1495         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1496         int i;
1497
1498         /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1499          * bpf_free_used_maps() is called after bpf prog is no longer executing.
1500          * There is no need to synchronize_rcu() here to protect map elements.
1501          */
1502
1503         /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
1504          * underneath and is reponsible for waiting for callbacks to finish
1505          * during bpf_mem_alloc_destroy().
1506          */
1507         if (!htab_is_prealloc(htab)) {
1508                 delete_all_elements(htab);
1509         } else {
1510                 htab_free_prealloced_fields(htab);
1511                 prealloc_destroy(htab);
1512         }
1513
1514         free_percpu(htab->extra_elems);
1515         bpf_map_area_free(htab->buckets);
1516         bpf_mem_alloc_destroy(&htab->pcpu_ma);
1517         bpf_mem_alloc_destroy(&htab->ma);
1518         if (htab->use_percpu_counter)
1519                 percpu_counter_destroy(&htab->pcount);
1520         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
1521                 free_percpu(htab->map_locked[i]);
1522         lockdep_unregister_key(&htab->lockdep_key);
1523         bpf_map_area_free(htab);
1524 }
1525
1526 static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1527                                    struct seq_file *m)
1528 {
1529         void *value;
1530
1531         rcu_read_lock();
1532
1533         value = htab_map_lookup_elem(map, key);
1534         if (!value) {
1535                 rcu_read_unlock();
1536                 return;
1537         }
1538
1539         btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1540         seq_puts(m, ": ");
1541         btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1542         seq_puts(m, "\n");
1543
1544         rcu_read_unlock();
1545 }
1546
1547 static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1548                                              void *value, bool is_lru_map,
1549                                              bool is_percpu, u64 flags)
1550 {
1551         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1552         struct hlist_nulls_head *head;
1553         unsigned long bflags;
1554         struct htab_elem *l;
1555         u32 hash, key_size;
1556         struct bucket *b;
1557         int ret;
1558
1559         key_size = map->key_size;
1560
1561         hash = htab_map_hash(key, key_size, htab->hashrnd);
1562         b = __select_bucket(htab, hash);
1563         head = &b->head;
1564
1565         ret = htab_lock_bucket(htab, b, hash, &bflags);
1566         if (ret)
1567                 return ret;
1568
1569         l = lookup_elem_raw(head, hash, key, key_size);
1570         if (!l) {
1571                 ret = -ENOENT;
1572         } else {
1573                 if (is_percpu) {
1574                         u32 roundup_value_size = round_up(map->value_size, 8);
1575                         void __percpu *pptr;
1576                         int off = 0, cpu;
1577
1578                         pptr = htab_elem_get_ptr(l, key_size);
1579                         for_each_possible_cpu(cpu) {
1580                                 bpf_long_memcpy(value + off,
1581                                                 per_cpu_ptr(pptr, cpu),
1582                                                 roundup_value_size);
1583                                 off += roundup_value_size;
1584                         }
1585                 } else {
1586                         u32 roundup_key_size = round_up(map->key_size, 8);
1587
1588                         if (flags & BPF_F_LOCK)
1589                                 copy_map_value_locked(map, value, l->key +
1590                                                       roundup_key_size,
1591                                                       true);
1592                         else
1593                                 copy_map_value(map, value, l->key +
1594                                                roundup_key_size);
1595                         check_and_init_map_value(map, value);
1596                 }
1597
1598                 hlist_nulls_del_rcu(&l->hash_node);
1599                 if (!is_lru_map)
1600                         free_htab_elem(htab, l);
1601         }
1602
1603         htab_unlock_bucket(htab, b, hash, bflags);
1604
1605         if (is_lru_map && l)
1606                 htab_lru_push_free(htab, l);
1607
1608         return ret;
1609 }
1610
1611 static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1612                                            void *value, u64 flags)
1613 {
1614         return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1615                                                  flags);
1616 }
1617
1618 static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1619                                                   void *key, void *value,
1620                                                   u64 flags)
1621 {
1622         return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1623                                                  flags);
1624 }
1625
1626 static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1627                                                void *value, u64 flags)
1628 {
1629         return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1630                                                  flags);
1631 }
1632
1633 static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1634                                                       void *key, void *value,
1635                                                       u64 flags)
1636 {
1637         return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1638                                                  flags);
1639 }
1640
1641 static int
1642 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
1643                                    const union bpf_attr *attr,
1644                                    union bpf_attr __user *uattr,
1645                                    bool do_delete, bool is_lru_map,
1646                                    bool is_percpu)
1647 {
1648         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1649         u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
1650         void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1651         void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1652         void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1653         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1654         u32 batch, max_count, size, bucket_size, map_id;
1655         struct htab_elem *node_to_free = NULL;
1656         u64 elem_map_flags, map_flags;
1657         struct hlist_nulls_head *head;
1658         struct hlist_nulls_node *n;
1659         unsigned long flags = 0;
1660         bool locked = false;
1661         struct htab_elem *l;
1662         struct bucket *b;
1663         int ret = 0;
1664
1665         elem_map_flags = attr->batch.elem_flags;
1666         if ((elem_map_flags & ~BPF_F_LOCK) ||
1667             ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1668                 return -EINVAL;
1669
1670         map_flags = attr->batch.flags;
1671         if (map_flags)
1672                 return -EINVAL;
1673
1674         max_count = attr->batch.count;
1675         if (!max_count)
1676                 return 0;
1677
1678         if (put_user(0, &uattr->batch.count))
1679                 return -EFAULT;
1680
1681         batch = 0;
1682         if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1683                 return -EFAULT;
1684
1685         if (batch >= htab->n_buckets)
1686                 return -ENOENT;
1687
1688         key_size = htab->map.key_size;
1689         roundup_key_size = round_up(htab->map.key_size, 8);
1690         value_size = htab->map.value_size;
1691         size = round_up(value_size, 8);
1692         if (is_percpu)
1693                 value_size = size * num_possible_cpus();
1694         total = 0;
1695         /* while experimenting with hash tables with sizes ranging from 10 to
1696          * 1000, it was observed that a bucket can have up to 5 entries.
1697          */
1698         bucket_size = 5;
1699
1700 alloc:
1701         /* We cannot do copy_from_user or copy_to_user inside
1702          * the rcu_read_lock. Allocate enough space here.
1703          */
1704         keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1705         values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1706         if (!keys || !values) {
1707                 ret = -ENOMEM;
1708                 goto after_loop;
1709         }
1710
1711 again:
1712         bpf_disable_instrumentation();
1713         rcu_read_lock();
1714 again_nocopy:
1715         dst_key = keys;
1716         dst_val = values;
1717         b = &htab->buckets[batch];
1718         head = &b->head;
1719         /* do not grab the lock unless need it (bucket_cnt > 0). */
1720         if (locked) {
1721                 ret = htab_lock_bucket(htab, b, batch, &flags);
1722                 if (ret) {
1723                         rcu_read_unlock();
1724                         bpf_enable_instrumentation();
1725                         goto after_loop;
1726                 }
1727         }
1728
1729         bucket_cnt = 0;
1730         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1731                 bucket_cnt++;
1732
1733         if (bucket_cnt && !locked) {
1734                 locked = true;
1735                 goto again_nocopy;
1736         }
1737
1738         if (bucket_cnt > (max_count - total)) {
1739                 if (total == 0)
1740                         ret = -ENOSPC;
1741                 /* Note that since bucket_cnt > 0 here, it is implicit
1742                  * that the locked was grabbed, so release it.
1743                  */
1744                 htab_unlock_bucket(htab, b, batch, flags);
1745                 rcu_read_unlock();
1746                 bpf_enable_instrumentation();
1747                 goto after_loop;
1748         }
1749
1750         if (bucket_cnt > bucket_size) {
1751                 bucket_size = bucket_cnt;
1752                 /* Note that since bucket_cnt > 0 here, it is implicit
1753                  * that the locked was grabbed, so release it.
1754                  */
1755                 htab_unlock_bucket(htab, b, batch, flags);
1756                 rcu_read_unlock();
1757                 bpf_enable_instrumentation();
1758                 kvfree(keys);
1759                 kvfree(values);
1760                 goto alloc;
1761         }
1762
1763         /* Next block is only safe to run if you have grabbed the lock */
1764         if (!locked)
1765                 goto next_batch;
1766
1767         hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1768                 memcpy(dst_key, l->key, key_size);
1769
1770                 if (is_percpu) {
1771                         int off = 0, cpu;
1772                         void __percpu *pptr;
1773
1774                         pptr = htab_elem_get_ptr(l, map->key_size);
1775                         for_each_possible_cpu(cpu) {
1776                                 bpf_long_memcpy(dst_val + off,
1777                                                 per_cpu_ptr(pptr, cpu), size);
1778                                 off += size;
1779                         }
1780                 } else {
1781                         value = l->key + roundup_key_size;
1782                         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
1783                                 struct bpf_map **inner_map = value;
1784
1785                                  /* Actual value is the id of the inner map */
1786                                 map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
1787                                 value = &map_id;
1788                         }
1789
1790                         if (elem_map_flags & BPF_F_LOCK)
1791                                 copy_map_value_locked(map, dst_val, value,
1792                                                       true);
1793                         else
1794                                 copy_map_value(map, dst_val, value);
1795                         check_and_init_map_value(map, dst_val);
1796                 }
1797                 if (do_delete) {
1798                         hlist_nulls_del_rcu(&l->hash_node);
1799
1800                         /* bpf_lru_push_free() will acquire lru_lock, which
1801                          * may cause deadlock. See comments in function
1802                          * prealloc_lru_pop(). Let us do bpf_lru_push_free()
1803                          * after releasing the bucket lock.
1804                          */
1805                         if (is_lru_map) {
1806                                 l->batch_flink = node_to_free;
1807                                 node_to_free = l;
1808                         } else {
1809                                 free_htab_elem(htab, l);
1810                         }
1811                 }
1812                 dst_key += key_size;
1813                 dst_val += value_size;
1814         }
1815
1816         htab_unlock_bucket(htab, b, batch, flags);
1817         locked = false;
1818
1819         while (node_to_free) {
1820                 l = node_to_free;
1821                 node_to_free = node_to_free->batch_flink;
1822                 htab_lru_push_free(htab, l);
1823         }
1824
1825 next_batch:
1826         /* If we are not copying data, we can go to next bucket and avoid
1827          * unlocking the rcu.
1828          */
1829         if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1830                 batch++;
1831                 goto again_nocopy;
1832         }
1833
1834         rcu_read_unlock();
1835         bpf_enable_instrumentation();
1836         if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1837             key_size * bucket_cnt) ||
1838             copy_to_user(uvalues + total * value_size, values,
1839             value_size * bucket_cnt))) {
1840                 ret = -EFAULT;
1841                 goto after_loop;
1842         }
1843
1844         total += bucket_cnt;
1845         batch++;
1846         if (batch >= htab->n_buckets) {
1847                 ret = -ENOENT;
1848                 goto after_loop;
1849         }
1850         goto again;
1851
1852 after_loop:
1853         if (ret == -EFAULT)
1854                 goto out;
1855
1856         /* copy # of entries and next batch */
1857         ubatch = u64_to_user_ptr(attr->batch.out_batch);
1858         if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1859             put_user(total, &uattr->batch.count))
1860                 ret = -EFAULT;
1861
1862 out:
1863         kvfree(keys);
1864         kvfree(values);
1865         return ret;
1866 }
1867
1868 static int
1869 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1870                              union bpf_attr __user *uattr)
1871 {
1872         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1873                                                   false, true);
1874 }
1875
1876 static int
1877 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1878                                         const union bpf_attr *attr,
1879                                         union bpf_attr __user *uattr)
1880 {
1881         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1882                                                   false, true);
1883 }
1884
1885 static int
1886 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1887                       union bpf_attr __user *uattr)
1888 {
1889         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1890                                                   false, false);
1891 }
1892
1893 static int
1894 htab_map_lookup_and_delete_batch(struct bpf_map *map,
1895                                  const union bpf_attr *attr,
1896                                  union bpf_attr __user *uattr)
1897 {
1898         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1899                                                   false, false);
1900 }
1901
1902 static int
1903 htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
1904                                  const union bpf_attr *attr,
1905                                  union bpf_attr __user *uattr)
1906 {
1907         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1908                                                   true, true);
1909 }
1910
1911 static int
1912 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1913                                             const union bpf_attr *attr,
1914                                             union bpf_attr __user *uattr)
1915 {
1916         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1917                                                   true, true);
1918 }
1919
1920 static int
1921 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1922                           union bpf_attr __user *uattr)
1923 {
1924         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1925                                                   true, false);
1926 }
1927
1928 static int
1929 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
1930                                      const union bpf_attr *attr,
1931                                      union bpf_attr __user *uattr)
1932 {
1933         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1934                                                   true, false);
1935 }
1936
1937 struct bpf_iter_seq_hash_map_info {
1938         struct bpf_map *map;
1939         struct bpf_htab *htab;
1940         void *percpu_value_buf; // non-zero means percpu hash
1941         u32 bucket_id;
1942         u32 skip_elems;
1943 };
1944
1945 static struct htab_elem *
1946 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
1947                            struct htab_elem *prev_elem)
1948 {
1949         const struct bpf_htab *htab = info->htab;
1950         u32 skip_elems = info->skip_elems;
1951         u32 bucket_id = info->bucket_id;
1952         struct hlist_nulls_head *head;
1953         struct hlist_nulls_node *n;
1954         struct htab_elem *elem;
1955         struct bucket *b;
1956         u32 i, count;
1957
1958         if (bucket_id >= htab->n_buckets)
1959                 return NULL;
1960
1961         /* try to find next elem in the same bucket */
1962         if (prev_elem) {
1963                 /* no update/deletion on this bucket, prev_elem should be still valid
1964                  * and we won't skip elements.
1965                  */
1966                 n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
1967                 elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
1968                 if (elem)
1969                         return elem;
1970
1971                 /* not found, unlock and go to the next bucket */
1972                 b = &htab->buckets[bucket_id++];
1973                 rcu_read_unlock();
1974                 skip_elems = 0;
1975         }
1976
1977         for (i = bucket_id; i < htab->n_buckets; i++) {
1978                 b = &htab->buckets[i];
1979                 rcu_read_lock();
1980
1981                 count = 0;
1982                 head = &b->head;
1983                 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
1984                         if (count >= skip_elems) {
1985                                 info->bucket_id = i;
1986                                 info->skip_elems = count;
1987                                 return elem;
1988                         }
1989                         count++;
1990                 }
1991
1992                 rcu_read_unlock();
1993                 skip_elems = 0;
1994         }
1995
1996         info->bucket_id = i;
1997         info->skip_elems = 0;
1998         return NULL;
1999 }
2000
2001 static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
2002 {
2003         struct bpf_iter_seq_hash_map_info *info = seq->private;
2004         struct htab_elem *elem;
2005
2006         elem = bpf_hash_map_seq_find_next(info, NULL);
2007         if (!elem)
2008                 return NULL;
2009
2010         if (*pos == 0)
2011                 ++*pos;
2012         return elem;
2013 }
2014
2015 static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2016 {
2017         struct bpf_iter_seq_hash_map_info *info = seq->private;
2018
2019         ++*pos;
2020         ++info->skip_elems;
2021         return bpf_hash_map_seq_find_next(info, v);
2022 }
2023
2024 static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
2025 {
2026         struct bpf_iter_seq_hash_map_info *info = seq->private;
2027         u32 roundup_key_size, roundup_value_size;
2028         struct bpf_iter__bpf_map_elem ctx = {};
2029         struct bpf_map *map = info->map;
2030         struct bpf_iter_meta meta;
2031         int ret = 0, off = 0, cpu;
2032         struct bpf_prog *prog;
2033         void __percpu *pptr;
2034
2035         meta.seq = seq;
2036         prog = bpf_iter_get_info(&meta, elem == NULL);
2037         if (prog) {
2038                 ctx.meta = &meta;
2039                 ctx.map = info->map;
2040                 if (elem) {
2041                         roundup_key_size = round_up(map->key_size, 8);
2042                         ctx.key = elem->key;
2043                         if (!info->percpu_value_buf) {
2044                                 ctx.value = elem->key + roundup_key_size;
2045                         } else {
2046                                 roundup_value_size = round_up(map->value_size, 8);
2047                                 pptr = htab_elem_get_ptr(elem, map->key_size);
2048                                 for_each_possible_cpu(cpu) {
2049                                         bpf_long_memcpy(info->percpu_value_buf + off,
2050                                                         per_cpu_ptr(pptr, cpu),
2051                                                         roundup_value_size);
2052                                         off += roundup_value_size;
2053                                 }
2054                                 ctx.value = info->percpu_value_buf;
2055                         }
2056                 }
2057                 ret = bpf_iter_run_prog(prog, &ctx);
2058         }
2059
2060         return ret;
2061 }
2062
2063 static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
2064 {
2065         return __bpf_hash_map_seq_show(seq, v);
2066 }
2067
2068 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
2069 {
2070         if (!v)
2071                 (void)__bpf_hash_map_seq_show(seq, NULL);
2072         else
2073                 rcu_read_unlock();
2074 }
2075
2076 static int bpf_iter_init_hash_map(void *priv_data,
2077                                   struct bpf_iter_aux_info *aux)
2078 {
2079         struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2080         struct bpf_map *map = aux->map;
2081         void *value_buf;
2082         u32 buf_size;
2083
2084         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2085             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2086                 buf_size = round_up(map->value_size, 8) * num_possible_cpus();
2087                 value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2088                 if (!value_buf)
2089                         return -ENOMEM;
2090
2091                 seq_info->percpu_value_buf = value_buf;
2092         }
2093
2094         bpf_map_inc_with_uref(map);
2095         seq_info->map = map;
2096         seq_info->htab = container_of(map, struct bpf_htab, map);
2097         return 0;
2098 }
2099
2100 static void bpf_iter_fini_hash_map(void *priv_data)
2101 {
2102         struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2103
2104         bpf_map_put_with_uref(seq_info->map);
2105         kfree(seq_info->percpu_value_buf);
2106 }
2107
2108 static const struct seq_operations bpf_hash_map_seq_ops = {
2109         .start  = bpf_hash_map_seq_start,
2110         .next   = bpf_hash_map_seq_next,
2111         .stop   = bpf_hash_map_seq_stop,
2112         .show   = bpf_hash_map_seq_show,
2113 };
2114
2115 static const struct bpf_iter_seq_info iter_seq_info = {
2116         .seq_ops                = &bpf_hash_map_seq_ops,
2117         .init_seq_private       = bpf_iter_init_hash_map,
2118         .fini_seq_private       = bpf_iter_fini_hash_map,
2119         .seq_priv_size          = sizeof(struct bpf_iter_seq_hash_map_info),
2120 };
2121
2122 static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
2123                                   void *callback_ctx, u64 flags)
2124 {
2125         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2126         struct hlist_nulls_head *head;
2127         struct hlist_nulls_node *n;
2128         struct htab_elem *elem;
2129         u32 roundup_key_size;
2130         int i, num_elems = 0;
2131         void __percpu *pptr;
2132         struct bucket *b;
2133         void *key, *val;
2134         bool is_percpu;
2135         u64 ret = 0;
2136
2137         if (flags != 0)
2138                 return -EINVAL;
2139
2140         is_percpu = htab_is_percpu(htab);
2141
2142         roundup_key_size = round_up(map->key_size, 8);
2143         /* disable migration so percpu value prepared here will be the
2144          * same as the one seen by the bpf program with bpf_map_lookup_elem().
2145          */
2146         if (is_percpu)
2147                 migrate_disable();
2148         for (i = 0; i < htab->n_buckets; i++) {
2149                 b = &htab->buckets[i];
2150                 rcu_read_lock();
2151                 head = &b->head;
2152                 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
2153                         key = elem->key;
2154                         if (is_percpu) {
2155                                 /* current cpu value for percpu map */
2156                                 pptr = htab_elem_get_ptr(elem, map->key_size);
2157                                 val = this_cpu_ptr(pptr);
2158                         } else {
2159                                 val = elem->key + roundup_key_size;
2160                         }
2161                         num_elems++;
2162                         ret = callback_fn((u64)(long)map, (u64)(long)key,
2163                                           (u64)(long)val, (u64)(long)callback_ctx, 0);
2164                         /* return value: 0 - continue, 1 - stop and return */
2165                         if (ret) {
2166                                 rcu_read_unlock();
2167                                 goto out;
2168                         }
2169                 }
2170                 rcu_read_unlock();
2171         }
2172 out:
2173         if (is_percpu)
2174                 migrate_enable();
2175         return num_elems;
2176 }
2177
2178 BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
2179 const struct bpf_map_ops htab_map_ops = {
2180         .map_meta_equal = bpf_map_meta_equal,
2181         .map_alloc_check = htab_map_alloc_check,
2182         .map_alloc = htab_map_alloc,
2183         .map_free = htab_map_free,
2184         .map_get_next_key = htab_map_get_next_key,
2185         .map_release_uref = htab_map_free_timers,
2186         .map_lookup_elem = htab_map_lookup_elem,
2187         .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
2188         .map_update_elem = htab_map_update_elem,
2189         .map_delete_elem = htab_map_delete_elem,
2190         .map_gen_lookup = htab_map_gen_lookup,
2191         .map_seq_show_elem = htab_map_seq_show_elem,
2192         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2193         .map_for_each_callback = bpf_for_each_hash_elem,
2194         BATCH_OPS(htab),
2195         .map_btf_id = &htab_map_btf_ids[0],
2196         .iter_seq_info = &iter_seq_info,
2197 };
2198
2199 const struct bpf_map_ops htab_lru_map_ops = {
2200         .map_meta_equal = bpf_map_meta_equal,
2201         .map_alloc_check = htab_map_alloc_check,
2202         .map_alloc = htab_map_alloc,
2203         .map_free = htab_map_free,
2204         .map_get_next_key = htab_map_get_next_key,
2205         .map_release_uref = htab_map_free_timers,
2206         .map_lookup_elem = htab_lru_map_lookup_elem,
2207         .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
2208         .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
2209         .map_update_elem = htab_lru_map_update_elem,
2210         .map_delete_elem = htab_lru_map_delete_elem,
2211         .map_gen_lookup = htab_lru_map_gen_lookup,
2212         .map_seq_show_elem = htab_map_seq_show_elem,
2213         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2214         .map_for_each_callback = bpf_for_each_hash_elem,
2215         BATCH_OPS(htab_lru),
2216         .map_btf_id = &htab_map_btf_ids[0],
2217         .iter_seq_info = &iter_seq_info,
2218 };
2219
2220 /* Called from eBPF program */
2221 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2222 {
2223         struct htab_elem *l = __htab_map_lookup_elem(map, key);
2224
2225         if (l)
2226                 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2227         else
2228                 return NULL;
2229 }
2230
2231 static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2232 {
2233         struct htab_elem *l;
2234
2235         if (cpu >= nr_cpu_ids)
2236                 return NULL;
2237
2238         l = __htab_map_lookup_elem(map, key);
2239         if (l)
2240                 return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2241         else
2242                 return NULL;
2243 }
2244
2245 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2246 {
2247         struct htab_elem *l = __htab_map_lookup_elem(map, key);
2248
2249         if (l) {
2250                 bpf_lru_node_set_ref(&l->lru_node);
2251                 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2252         }
2253
2254         return NULL;
2255 }
2256
2257 static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2258 {
2259         struct htab_elem *l;
2260
2261         if (cpu >= nr_cpu_ids)
2262                 return NULL;
2263
2264         l = __htab_map_lookup_elem(map, key);
2265         if (l) {
2266                 bpf_lru_node_set_ref(&l->lru_node);
2267                 return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2268         }
2269
2270         return NULL;
2271 }
2272
2273 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
2274 {
2275         struct htab_elem *l;
2276         void __percpu *pptr;
2277         int ret = -ENOENT;
2278         int cpu, off = 0;
2279         u32 size;
2280
2281         /* per_cpu areas are zero-filled and bpf programs can only
2282          * access 'value_size' of them, so copying rounded areas
2283          * will not leak any kernel data
2284          */
2285         size = round_up(map->value_size, 8);
2286         rcu_read_lock();
2287         l = __htab_map_lookup_elem(map, key);
2288         if (!l)
2289                 goto out;
2290         /* We do not mark LRU map element here in order to not mess up
2291          * eviction heuristics when user space does a map walk.
2292          */
2293         pptr = htab_elem_get_ptr(l, map->key_size);
2294         for_each_possible_cpu(cpu) {
2295                 bpf_long_memcpy(value + off,
2296                                 per_cpu_ptr(pptr, cpu), size);
2297                 off += size;
2298         }
2299         ret = 0;
2300 out:
2301         rcu_read_unlock();
2302         return ret;
2303 }
2304
2305 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
2306                            u64 map_flags)
2307 {
2308         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2309         int ret;
2310
2311         rcu_read_lock();
2312         if (htab_is_lru(htab))
2313                 ret = __htab_lru_percpu_map_update_elem(map, key, value,
2314                                                         map_flags, true);
2315         else
2316                 ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
2317                                                     true);
2318         rcu_read_unlock();
2319
2320         return ret;
2321 }
2322
2323 static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
2324                                           struct seq_file *m)
2325 {
2326         struct htab_elem *l;
2327         void __percpu *pptr;
2328         int cpu;
2329
2330         rcu_read_lock();
2331
2332         l = __htab_map_lookup_elem(map, key);
2333         if (!l) {
2334                 rcu_read_unlock();
2335                 return;
2336         }
2337
2338         btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
2339         seq_puts(m, ": {\n");
2340         pptr = htab_elem_get_ptr(l, map->key_size);
2341         for_each_possible_cpu(cpu) {
2342                 seq_printf(m, "\tcpu%d: ", cpu);
2343                 btf_type_seq_show(map->btf, map->btf_value_type_id,
2344                                   per_cpu_ptr(pptr, cpu), m);
2345                 seq_puts(m, "\n");
2346         }
2347         seq_puts(m, "}\n");
2348
2349         rcu_read_unlock();
2350 }
2351
2352 const struct bpf_map_ops htab_percpu_map_ops = {
2353         .map_meta_equal = bpf_map_meta_equal,
2354         .map_alloc_check = htab_map_alloc_check,
2355         .map_alloc = htab_map_alloc,
2356         .map_free = htab_map_free,
2357         .map_get_next_key = htab_map_get_next_key,
2358         .map_lookup_elem = htab_percpu_map_lookup_elem,
2359         .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
2360         .map_update_elem = htab_percpu_map_update_elem,
2361         .map_delete_elem = htab_map_delete_elem,
2362         .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
2363         .map_seq_show_elem = htab_percpu_map_seq_show_elem,
2364         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2365         .map_for_each_callback = bpf_for_each_hash_elem,
2366         BATCH_OPS(htab_percpu),
2367         .map_btf_id = &htab_map_btf_ids[0],
2368         .iter_seq_info = &iter_seq_info,
2369 };
2370
2371 const struct bpf_map_ops htab_lru_percpu_map_ops = {
2372         .map_meta_equal = bpf_map_meta_equal,
2373         .map_alloc_check = htab_map_alloc_check,
2374         .map_alloc = htab_map_alloc,
2375         .map_free = htab_map_free,
2376         .map_get_next_key = htab_map_get_next_key,
2377         .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
2378         .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
2379         .map_update_elem = htab_lru_percpu_map_update_elem,
2380         .map_delete_elem = htab_lru_map_delete_elem,
2381         .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
2382         .map_seq_show_elem = htab_percpu_map_seq_show_elem,
2383         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2384         .map_for_each_callback = bpf_for_each_hash_elem,
2385         BATCH_OPS(htab_lru_percpu),
2386         .map_btf_id = &htab_map_btf_ids[0],
2387         .iter_seq_info = &iter_seq_info,
2388 };
2389
2390 static int fd_htab_map_alloc_check(union bpf_attr *attr)
2391 {
2392         if (attr->value_size != sizeof(u32))
2393                 return -EINVAL;
2394         return htab_map_alloc_check(attr);
2395 }
2396
2397 static void fd_htab_map_free(struct bpf_map *map)
2398 {
2399         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2400         struct hlist_nulls_node *n;
2401         struct hlist_nulls_head *head;
2402         struct htab_elem *l;
2403         int i;
2404
2405         for (i = 0; i < htab->n_buckets; i++) {
2406                 head = select_bucket(htab, i);
2407
2408                 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
2409                         void *ptr = fd_htab_map_get_ptr(map, l);
2410
2411                         map->ops->map_fd_put_ptr(ptr);
2412                 }
2413         }
2414
2415         htab_map_free(map);
2416 }
2417
2418 /* only called from syscall */
2419 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
2420 {
2421         void **ptr;
2422         int ret = 0;
2423
2424         if (!map->ops->map_fd_sys_lookup_elem)
2425                 return -ENOTSUPP;
2426
2427         rcu_read_lock();
2428         ptr = htab_map_lookup_elem(map, key);
2429         if (ptr)
2430                 *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
2431         else
2432                 ret = -ENOENT;
2433         rcu_read_unlock();
2434
2435         return ret;
2436 }
2437
2438 /* only called from syscall */
2439 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
2440                                 void *key, void *value, u64 map_flags)
2441 {
2442         void *ptr;
2443         int ret;
2444         u32 ufd = *(u32 *)value;
2445
2446         ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
2447         if (IS_ERR(ptr))
2448                 return PTR_ERR(ptr);
2449
2450         ret = htab_map_update_elem(map, key, &ptr, map_flags);
2451         if (ret)
2452                 map->ops->map_fd_put_ptr(ptr);
2453
2454         return ret;
2455 }
2456
2457 static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
2458 {
2459         struct bpf_map *map, *inner_map_meta;
2460
2461         inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
2462         if (IS_ERR(inner_map_meta))
2463                 return inner_map_meta;
2464
2465         map = htab_map_alloc(attr);
2466         if (IS_ERR(map)) {
2467                 bpf_map_meta_free(inner_map_meta);
2468                 return map;
2469         }
2470
2471         map->inner_map_meta = inner_map_meta;
2472
2473         return map;
2474 }
2475
2476 static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
2477 {
2478         struct bpf_map **inner_map  = htab_map_lookup_elem(map, key);
2479
2480         if (!inner_map)
2481                 return NULL;
2482
2483         return READ_ONCE(*inner_map);
2484 }
2485
2486 static int htab_of_map_gen_lookup(struct bpf_map *map,
2487                                   struct bpf_insn *insn_buf)
2488 {
2489         struct bpf_insn *insn = insn_buf;
2490         const int ret = BPF_REG_0;
2491
2492         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2493                      (void *(*)(struct bpf_map *map, void *key))NULL));
2494         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2495         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
2496         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
2497                                 offsetof(struct htab_elem, key) +
2498                                 round_up(map->key_size, 8));
2499         *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
2500
2501         return insn - insn_buf;
2502 }
2503
2504 static void htab_of_map_free(struct bpf_map *map)
2505 {
2506         bpf_map_meta_free(map->inner_map_meta);
2507         fd_htab_map_free(map);
2508 }
2509
2510 const struct bpf_map_ops htab_of_maps_map_ops = {
2511         .map_alloc_check = fd_htab_map_alloc_check,
2512         .map_alloc = htab_of_map_alloc,
2513         .map_free = htab_of_map_free,
2514         .map_get_next_key = htab_map_get_next_key,
2515         .map_lookup_elem = htab_of_map_lookup_elem,
2516         .map_delete_elem = htab_map_delete_elem,
2517         .map_fd_get_ptr = bpf_map_fd_get_ptr,
2518         .map_fd_put_ptr = bpf_map_fd_put_ptr,
2519         .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
2520         .map_gen_lookup = htab_of_map_gen_lookup,
2521         .map_check_btf = map_check_no_btf,
2522         BATCH_OPS(htab),
2523         .map_btf_id = &htab_map_btf_ids[0],
2524 };
This page took 0.175993 seconds and 4 git commands to generate.