kernel/bpf/hashtab.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
   3  * Copyright (c) 2016 Facebook
   4  */
   5 #include <linux/bpf.h>
   6 #include <linux/btf.h>
   7 #include <linux/jhash.h>
   8 #include <linux/filter.h>
   9 #include <linux/rculist_nulls.h>
  10 #include <linux/random.h>
  11 #include <uapi/linux/btf.h>
  12 #include <linux/rcupdate_trace.h>
  13 #include <linux/btf_ids.h>
  14 #include "percpu_freelist.h"
  15 #include "bpf_lru_list.h"
  16 #include "map_in_map.h"
  17 #include <linux/bpf_mem_alloc.h>
  18
  19 #define HTAB_CREATE_FLAG_MASK                                           \
  20         (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |    \
  21          BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
  22
  23 #define BATCH_OPS(_name)                        \
  24         .map_lookup_batch =                     \
  25         _name##_map_lookup_batch,               \
  26         .map_lookup_and_delete_batch =          \
  27         _name##_map_lookup_and_delete_batch,    \
  28         .map_update_batch =                     \
  29         generic_map_update_batch,               \
  30         .map_delete_batch =                     \
  31         generic_map_delete_batch
  32
  33 /*
  34  * The bucket lock has two protection scopes:
  35  *
  36  * 1) Serializing concurrent operations from BPF programs on different
  37  *    CPUs
  38  *
  39  * 2) Serializing concurrent operations from BPF programs and sys_bpf()
  40  *
  41  * BPF programs can execute in any context including perf, kprobes and
  42  * tracing. As there are almost no limits where perf, kprobes and tracing
  43  * can be invoked from the lock operations need to be protected against
  44  * deadlocks. Deadlocks can be caused by recursion and by an invocation in
  45  * the lock held section when functions which acquire this lock are invoked
  46  * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
  47  * variable bpf_prog_active, which prevents BPF programs attached to perf
  48  * events, kprobes and tracing to be invoked before the prior invocation
  49  * from one of these contexts completed. sys_bpf() uses the same mechanism
  50  * by pinning the task to the current CPU and incrementing the recursion
  51  * protection across the map operation.
  52  *
  53  * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
  54  * operations like memory allocations (even with GFP_ATOMIC) from atomic
  55  * contexts. This is required because even with GFP_ATOMIC the memory
  56  * allocator calls into code paths which acquire locks with long held lock
  57  * sections. To ensure the deterministic behaviour these locks are regular
  58  * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
  59  * true atomic contexts on an RT kernel are the low level hardware
  60  * handling, scheduling, low level interrupt handling, NMIs etc. None of
  61  * these contexts should ever do memory allocations.
  62  *
  63  * As regular device interrupt handlers and soft interrupts are forced into
  64  * thread context, the existing code which does
  65  *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
  66  * just works.
  67  *
  68  * In theory the BPF locks could be converted to regular spinlocks as well,
  69  * but the bucket locks and percpu_freelist locks can be taken from
  70  * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
  71  * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
  72  * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
  73  * because there is no memory allocation within the lock held sections. However
  74  * after hash map was fully converted to use bpf_mem_alloc, there will be
  75  * non-synchronous memory allocation for non-preallocated hash map, so it is
  76  * safe to always use raw spinlock for bucket lock.
  77  */
  78 struct bucket {
  79         struct hlist_nulls_head head;
  80         raw_spinlock_t raw_lock;
  81 };
  82
  83 #define HASHTAB_MAP_LOCK_COUNT 8
  84 #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
  85
  86 struct bpf_htab {
  87         struct bpf_map map;
  88         struct bpf_mem_alloc ma;
  89         struct bpf_mem_alloc pcpu_ma;
  90         struct bucket *buckets;
  91         void *elems;
  92         union {
  93                 struct pcpu_freelist freelist;
  94                 struct bpf_lru lru;
  95         };
  96         struct htab_elem *__percpu *extra_elems;
  97         /* number of elements in non-preallocated hashtable are kept
  98          * in either pcount or count
  99          */
 100         struct percpu_counter pcount;
 101         atomic_t count;
 102         bool use_percpu_counter;
 103         u32 n_buckets;  /* number of hash buckets */
 104         u32 elem_size;  /* size of each element in bytes */
 105         u32 hashrnd;
 106         struct lock_class_key lockdep_key;
 107         int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
 108 };
 109
 110 /* each htab element is struct htab_elem + key + value */
 111 struct htab_elem {
 112         union {
 113                 struct hlist_nulls_node hash_node;
 114                 struct {
 115                         void *padding;
 116                         union {
 117                                 struct pcpu_freelist_node fnode;
 118                                 struct htab_elem *batch_flink;
 119                         };
 120                 };
 121         };
 122         union {
 123                 /* pointer to per-cpu pointer */
 124                 void *ptr_to_pptr;
 125                 struct bpf_lru_node lru_node;
 126         };
 127         u32 hash;
 128         char key[] __aligned(8);
 129 };
 130
 131 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
 132 {
 133         return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 134 }
 135
 136 static void htab_init_buckets(struct bpf_htab *htab)
 137 {
 138         unsigned int i;
 139
 140         for (i = 0; i < htab->n_buckets; i++) {
 141                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 142                 raw_spin_lock_init(&htab->buckets[i].raw_lock);
 143                 lockdep_set_class(&htab->buckets[i].raw_lock,
 144                                           &htab->lockdep_key);
 145                 cond_resched();
 146         }
 147 }
 148
 149 static inline int htab_lock_bucket(const struct bpf_htab *htab,
 150                                    struct bucket *b, u32 hash,
 151                                    unsigned long *pflags)
 152 {
 153         unsigned long flags;
 154
 155         hash = hash & HASHTAB_MAP_LOCK_MASK;
 156
 157         preempt_disable();
 158         if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
 159                 __this_cpu_dec(*(htab->map_locked[hash]));
 160                 preempt_enable();
 161                 return -EBUSY;
 162         }
 163
 164         raw_spin_lock_irqsave(&b->raw_lock, flags);
 165         *pflags = flags;
 166
 167         return 0;
 168 }
 169
 170 static inline void htab_unlock_bucket(const struct bpf_htab *htab,
 171                                       struct bucket *b, u32 hash,
 172                                       unsigned long flags)
 173 {
 174         hash = hash & HASHTAB_MAP_LOCK_MASK;
 175         raw_spin_unlock_irqrestore(&b->raw_lock, flags);
 176         __this_cpu_dec(*(htab->map_locked[hash]));
 177         preempt_enable();
 178 }
 179
 180 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
 181
 182 static bool htab_is_lru(const struct bpf_htab *htab)
 183 {
 184         return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
 185                 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 186 }
 187
 188 static bool htab_is_percpu(const struct bpf_htab *htab)
 189 {
 190         return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 191                 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 192 }
 193
 194 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 195                                      void __percpu *pptr)
 196 {
 197         *(void __percpu **)(l->key + key_size) = pptr;
 198 }
 199
 200 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
 201 {
 202         return *(void __percpu **)(l->key + key_size);
 203 }
 204
 205 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
 206 {
 207         return *(void **)(l->key + roundup(map->key_size, 8));
 208 }
 209
 210 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
 211 {
 212         return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
 213 }
 214
 215 static bool htab_has_extra_elems(struct bpf_htab *htab)
 216 {
 217         return !htab_is_percpu(htab) && !htab_is_lru(htab);
 218 }
 219
 220 static void htab_free_prealloced_timers(struct bpf_htab *htab)
 221 {
 222         u32 num_entries = htab->map.max_entries;
 223         int i;
 224
 225         if (!btf_record_has_field(htab->map.record, BPF_TIMER))
 226                 return;
 227         if (htab_has_extra_elems(htab))
 228                 num_entries += num_possible_cpus();
 229
 230         for (i = 0; i < num_entries; i++) {
 231                 struct htab_elem *elem;
 232
 233                 elem = get_htab_elem(htab, i);
 234                 bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
 235                 cond_resched();
 236         }
 237 }
 238
 239 static void htab_free_prealloced_fields(struct bpf_htab *htab)
 240 {
 241         u32 num_entries = htab->map.max_entries;
 242         int i;
 243
 244         if (IS_ERR_OR_NULL(htab->map.record))
 245                 return;
 246         if (htab_has_extra_elems(htab))
 247                 num_entries += num_possible_cpus();
 248         for (i = 0; i < num_entries; i++) {
 249                 struct htab_elem *elem;
 250
 251                 elem = get_htab_elem(htab, i);
 252                 bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
 253                 cond_resched();
 254         }
 255 }
 256
 257 static void htab_free_elems(struct bpf_htab *htab)
 258 {
 259         int i;
 260
 261         if (!htab_is_percpu(htab))
 262                 goto free_elems;
 263
 264         for (i = 0; i < htab->map.max_entries; i++) {
 265                 void __percpu *pptr;
 266
 267                 pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
 268                                          htab->map.key_size);
 269                 free_percpu(pptr);
 270                 cond_resched();
 271         }
 272 free_elems:
 273         bpf_map_area_free(htab->elems);
 274 }
 275
 276 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 277  * (bucket_lock). If both locks need to be acquired together, the lock
 278  * order is always lru_lock -> bucket_lock and this only happens in
 279  * bpf_lru_list.c logic. For example, certain code path of
 280  * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
 281  * will acquire lru_lock first followed by acquiring bucket_lock.
 282  *
 283  * In hashtab.c, to avoid deadlock, lock acquisition of
 284  * bucket_lock followed by lru_lock is not allowed. In such cases,
 285  * bucket_lock needs to be released first before acquiring lru_lock.
 286  */
 287 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 288                                           u32 hash)
 289 {
 290         struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
 291         struct htab_elem *l;
 292
 293         if (node) {
 294                 l = container_of(node, struct htab_elem, lru_node);
 295                 memcpy(l->key, key, htab->map.key_size);
 296                 return l;
 297         }
 298
 299         return NULL;
 300 }
 301
 302 static int prealloc_init(struct bpf_htab *htab)
 303 {
 304         u32 num_entries = htab->map.max_entries;
 305         int err = -ENOMEM, i;
 306
 307         if (htab_has_extra_elems(htab))
 308                 num_entries += num_possible_cpus();
 309
 310         htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
 311                                          htab->map.numa_node);
 312         if (!htab->elems)
 313                 return -ENOMEM;
 314
 315         if (!htab_is_percpu(htab))
 316                 goto skip_percpu_elems;
 317
 318         for (i = 0; i < num_entries; i++) {
 319                 u32 size = round_up(htab->map.value_size, 8);
 320                 void __percpu *pptr;
 321
 322                 pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
 323                                             GFP_USER | __GFP_NOWARN);
 324                 if (!pptr)
 325                         goto free_elems;
 326                 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
 327                                   pptr);
 328                 cond_resched();
 329         }
 330
 331 skip_percpu_elems:
 332         if (htab_is_lru(htab))
 333                 err = bpf_lru_init(&htab->lru,
 334                                    htab->map.map_flags & BPF_F_NO_COMMON_LRU,
 335                                    offsetof(struct htab_elem, hash) -
 336                                    offsetof(struct htab_elem, lru_node),
 337                                    htab_lru_map_delete_node,
 338                                    htab);
 339         else
 340                 err = pcpu_freelist_init(&htab->freelist);
 341
 342         if (err)
 343                 goto free_elems;
 344
 345         if (htab_is_lru(htab))
 346                 bpf_lru_populate(&htab->lru, htab->elems,
 347                                  offsetof(struct htab_elem, lru_node),
 348                                  htab->elem_size, num_entries);
 349         else
 350                 pcpu_freelist_populate(&htab->freelist,
 351                                        htab->elems + offsetof(struct htab_elem, fnode),
 352                                        htab->elem_size, num_entries);
 353
 354         return 0;
 355
 356 free_elems:
 357         htab_free_elems(htab);
 358         return err;
 359 }
 360
 361 static void prealloc_destroy(struct bpf_htab *htab)
 362 {
 363         htab_free_elems(htab);
 364
 365         if (htab_is_lru(htab))
 366                 bpf_lru_destroy(&htab->lru);
 367         else
 368                 pcpu_freelist_destroy(&htab->freelist);
 369 }
 370
 371 static int alloc_extra_elems(struct bpf_htab *htab)
 372 {
 373         struct htab_elem *__percpu *pptr, *l_new;
 374         struct pcpu_freelist_node *l;
 375         int cpu;
 376
 377         pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
 378                                     GFP_USER | __GFP_NOWARN);
 379         if (!pptr)
 380                 return -ENOMEM;
 381
 382         for_each_possible_cpu(cpu) {
 383                 l = pcpu_freelist_pop(&htab->freelist);
 384                 /* pop will succeed, since prealloc_init()
 385                  * preallocated extra num_possible_cpus elements
 386                  */
 387                 l_new = container_of(l, struct htab_elem, fnode);
 388                 *per_cpu_ptr(pptr, cpu) = l_new;
 389         }
 390         htab->extra_elems = pptr;
 391         return 0;
 392 }
 393
 394 /* Called from syscall */
 395 static int htab_map_alloc_check(union bpf_attr *attr)
 396 {
 397         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 398                        attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 399         bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
 400                     attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 401         /* percpu_lru means each cpu has its own LRU list.
 402          * it is different from BPF_MAP_TYPE_PERCPU_HASH where
 403          * the map's value itself is percpu.  percpu_lru has
 404          * nothing to do with the map's value.
 405          */
 406         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 407         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 408         bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
 409         int numa_node = bpf_map_attr_numa_node(attr);
 410
 411         BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
 412                      offsetof(struct htab_elem, hash_node.pprev));
 413
 414         if (lru && !bpf_capable())
 415                 /* LRU implementation is much complicated than other
 416                  * maps.  Hence, limit to CAP_BPF.
 417                  */
 418                 return -EPERM;
 419
 420         if (zero_seed && !capable(CAP_SYS_ADMIN))
 421                 /* Guard against local DoS, and discourage production use. */
 422                 return -EPERM;
 423
 424         if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
 425             !bpf_map_flags_access_ok(attr->map_flags))
 426                 return -EINVAL;
 427
 428         if (!lru && percpu_lru)
 429                 return -EINVAL;
 430
 431         if (lru && !prealloc)
 432                 return -ENOTSUPP;
 433
 434         if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
 435                 return -EINVAL;
 436
 437         /* check sanity of attributes.
 438          * value_size == 0 may be allowed in the future to use map as a set
 439          */
 440         if (attr->max_entries == 0 || attr->key_size == 0 ||
 441             attr->value_size == 0)
 442                 return -EINVAL;
 443
 444         if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
 445            sizeof(struct htab_elem))
 446                 /* if key_size + value_size is bigger, the user space won't be
 447                  * able to access the elements via bpf syscall. This check
 448                  * also makes sure that the elem_size doesn't overflow and it's
 449                  * kmalloc-able later in htab_map_update_elem()
 450                  */
 451                 return -E2BIG;
 452
 453         return 0;
 454 }
 455
 456 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 457 {
 458         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 459                        attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 460         bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
 461                     attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 462         /* percpu_lru means each cpu has its own LRU list.
 463          * it is different from BPF_MAP_TYPE_PERCPU_HASH where
 464          * the map's value itself is percpu.  percpu_lru has
 465          * nothing to do with the map's value.
 466          */
 467         bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 468         bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 469         struct bpf_htab *htab;
 470         int err, i;
 471
 472         htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
 473         if (!htab)
 474                 return ERR_PTR(-ENOMEM);
 475
 476         lockdep_register_key(&htab->lockdep_key);
 477
 478         bpf_map_init_from_attr(&htab->map, attr);
 479
 480         if (percpu_lru) {
 481                 /* ensure each CPU's lru list has >=1 elements.
 482                  * since we are at it, make each lru list has the same
 483                  * number of elements.
 484                  */
 485                 htab->map.max_entries = roundup(attr->max_entries,
 486                                                 num_possible_cpus());
 487                 if (htab->map.max_entries < attr->max_entries)
 488                         htab->map.max_entries = rounddown(attr->max_entries,
 489                                                           num_possible_cpus());
 490         }
 491
 492         /* hash table size must be power of 2 */
 493         htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
 494
 495         htab->elem_size = sizeof(struct htab_elem) +
 496                           round_up(htab->map.key_size, 8);
 497         if (percpu)
 498                 htab->elem_size += sizeof(void *);
 499         else
 500                 htab->elem_size += round_up(htab->map.value_size, 8);
 501
 502         err = -E2BIG;
 503         /* prevent zero size kmalloc and check for u32 overflow */
 504         if (htab->n_buckets == 0 ||
 505             htab->n_buckets > U32_MAX / sizeof(struct bucket))
 506                 goto free_htab;
 507
 508         err = -ENOMEM;
 509         htab->buckets = bpf_map_area_alloc(htab->n_buckets *
 510                                            sizeof(struct bucket),
 511                                            htab->map.numa_node);
 512         if (!htab->buckets)
 513                 goto free_htab;
 514
 515         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
 516                 htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
 517                                                            sizeof(int),
 518                                                            sizeof(int),
 519                                                            GFP_USER);
 520                 if (!htab->map_locked[i])
 521                         goto free_map_locked;
 522         }
 523
 524         if (htab->map.map_flags & BPF_F_ZERO_SEED)
 525                 htab->hashrnd = 0;
 526         else
 527                 htab->hashrnd = get_random_u32();
 528
 529         htab_init_buckets(htab);
 530
 531 /* compute_batch_value() computes batch value as num_online_cpus() * 2
 532  * and __percpu_counter_compare() needs
 533  * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
 534  * for percpu_counter to be faster than atomic_t. In practice the average bpf
 535  * hash map size is 10k, which means that a system with 64 cpus will fill
 536  * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
 537  * define our own batch count as 32 then 10k hash map can be filled up to 80%:
 538  * 10k - 8k > 32 _batch_ * 64 _cpus_
 539  * and __percpu_counter_compare() will still be fast. At that point hash map
 540  * collisions will dominate its performance anyway. Assume that hash map filled
 541  * to 50+% isn't going to be O(1) and use the following formula to choose
 542  * between percpu_counter and atomic_t.
 543  */
 544 #define PERCPU_COUNTER_BATCH 32
 545         if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
 546                 htab->use_percpu_counter = true;
 547
 548         if (htab->use_percpu_counter) {
 549                 err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
 550                 if (err)
 551                         goto free_map_locked;
 552         }
 553
 554         if (prealloc) {
 555                 err = prealloc_init(htab);
 556                 if (err)
 557                         goto free_map_locked;
 558
 559                 if (!percpu && !lru) {
 560                         /* lru itself can remove the least used element, so
 561                          * there is no need for an extra elem during map_update.
 562                          */
 563                         err = alloc_extra_elems(htab);
 564                         if (err)
 565                                 goto free_prealloc;
 566                 }
 567         } else {
 568                 err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
 569                 if (err)
 570                         goto free_map_locked;
 571                 if (percpu) {
 572                         err = bpf_mem_alloc_init(&htab->pcpu_ma,
 573                                                  round_up(htab->map.value_size, 8), true);
 574                         if (err)
 575                                 goto free_map_locked;
 576                 }
 577         }
 578
 579         return &htab->map;
 580
 581 free_prealloc:
 582         prealloc_destroy(htab);
 583 free_map_locked:
 584         if (htab->use_percpu_counter)
 585                 percpu_counter_destroy(&htab->pcount);
 586         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
 587                 free_percpu(htab->map_locked[i]);
 588         bpf_map_area_free(htab->buckets);
 589         bpf_mem_alloc_destroy(&htab->pcpu_ma);
 590         bpf_mem_alloc_destroy(&htab->ma);
 591 free_htab:
 592         lockdep_unregister_key(&htab->lockdep_key);
 593         bpf_map_area_free(htab);
 594         return ERR_PTR(err);
 595 }
 596
 597 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
 598 {
 599         return jhash(key, key_len, hashrnd);
 600 }
 601
 602 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
 603 {
 604         return &htab->buckets[hash & (htab->n_buckets - 1)];
 605 }
 606
 607 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
 608 {
 609         return &__select_bucket(htab, hash)->head;
 610 }
 611
 612 /* this lookup function can only be called with bucket lock taken */
 613 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
 614                                          void *key, u32 key_size)
 615 {
 616         struct hlist_nulls_node *n;
 617         struct htab_elem *l;
 618
 619         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 620                 if (l->hash == hash && !memcmp(&l->key, key, key_size))
 621                         return l;
 622
 623         return NULL;
 624 }
 625
 626 /* can be called without bucket lock. it will repeat the loop in
 627  * the unlikely event when elements moved from one bucket into another
 628  * while link list is being walked
 629  */
 630 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
 631                                                u32 hash, void *key,
 632                                                u32 key_size, u32 n_buckets)
 633 {
 634         struct hlist_nulls_node *n;
 635         struct htab_elem *l;
 636
 637 again:
 638         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 639                 if (l->hash == hash && !memcmp(&l->key, key, key_size))
 640                         return l;
 641
 642         if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
 643                 goto again;
 644
 645         return NULL;
 646 }
 647
 648 /* Called from syscall or from eBPF program directly, so
 649  * arguments have to match bpf_map_lookup_elem() exactly.
 650  * The return value is adjusted by BPF instructions
 651  * in htab_map_gen_lookup().
 652  */
 653 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 654 {
 655         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 656         struct hlist_nulls_head *head;
 657         struct htab_elem *l;
 658         u32 hash, key_size;
 659
 660         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
 661                      !rcu_read_lock_bh_held());
 662
 663         key_size = map->key_size;
 664
 665         hash = htab_map_hash(key, key_size, htab->hashrnd);
 666
 667         head = select_bucket(htab, hash);
 668
 669         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 670
 671         return l;
 672 }
 673
 674 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
 675 {
 676         struct htab_elem *l = __htab_map_lookup_elem(map, key);
 677
 678         if (l)
 679                 return l->key + round_up(map->key_size, 8);
 680
 681         return NULL;
 682 }
 683
 684 /* inline bpf_map_lookup_elem() call.
 685  * Instead of:
 686  * bpf_prog
 687  *   bpf_map_lookup_elem
 688  *     map->ops->map_lookup_elem
 689  *       htab_map_lookup_elem
 690  *         __htab_map_lookup_elem
 691  * do:
 692  * bpf_prog
 693  *   __htab_map_lookup_elem
 694  */
 695 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 696 {
 697         struct bpf_insn *insn = insn_buf;
 698         const int ret = BPF_REG_0;
 699
 700         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
 701                      (void *(*)(struct bpf_map *map, void *key))NULL));
 702         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
 703         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
 704         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 705                                 offsetof(struct htab_elem, key) +
 706                                 round_up(map->key_size, 8));
 707         return insn - insn_buf;
 708 }
 709
 710 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
 711                                                         void *key, const bool mark)
 712 {
 713         struct htab_elem *l = __htab_map_lookup_elem(map, key);
 714
 715         if (l) {
 716                 if (mark)
 717                         bpf_lru_node_set_ref(&l->lru_node);
 718                 return l->key + round_up(map->key_size, 8);
 719         }
 720
 721         return NULL;
 722 }
 723
 724 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
 725 {
 726         return __htab_lru_map_lookup_elem(map, key, true);
 727 }
 728
 729 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
 730 {
 731         return __htab_lru_map_lookup_elem(map, key, false);
 732 }
 733
 734 static int htab_lru_map_gen_lookup(struct bpf_map *map,
 735                                    struct bpf_insn *insn_buf)
 736 {
 737         struct bpf_insn *insn = insn_buf;
 738         const int ret = BPF_REG_0;
 739         const int ref_reg = BPF_REG_1;
 740
 741         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
 742                      (void *(*)(struct bpf_map *map, void *key))NULL));
 743         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
 744         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
 745         *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
 746                               offsetof(struct htab_elem, lru_node) +
 747                               offsetof(struct bpf_lru_node, ref));
 748         *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
 749         *insn++ = BPF_ST_MEM(BPF_B, ret,
 750                              offsetof(struct htab_elem, lru_node) +
 751                              offsetof(struct bpf_lru_node, ref),
 752                              1);
 753         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 754                                 offsetof(struct htab_elem, key) +
 755                                 round_up(map->key_size, 8));
 756         return insn - insn_buf;
 757 }
 758
 759 static void check_and_free_fields(struct bpf_htab *htab,
 760                                   struct htab_elem *elem)
 761 {
 762         void *map_value = elem->key + round_up(htab->map.key_size, 8);
 763
 764         bpf_obj_free_fields(htab->map.record, map_value);
 765 }
 766
 767 /* It is called from the bpf_lru_list when the LRU needs to delete
 768  * older elements from the htab.
 769  */
 770 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 771 {
 772         struct bpf_htab *htab = arg;
 773         struct htab_elem *l = NULL, *tgt_l;
 774         struct hlist_nulls_head *head;
 775         struct hlist_nulls_node *n;
 776         unsigned long flags;
 777         struct bucket *b;
 778         int ret;
 779
 780         tgt_l = container_of(node, struct htab_elem, lru_node);
 781         b = __select_bucket(htab, tgt_l->hash);
 782         head = &b->head;
 783
 784         ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
 785         if (ret)
 786                 return false;
 787
 788         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 789                 if (l == tgt_l) {
 790                         hlist_nulls_del_rcu(&l->hash_node);
 791                         check_and_free_fields(htab, l);
 792                         break;
 793                 }
 794
 795         htab_unlock_bucket(htab, b, tgt_l->hash, flags);
 796
 797         return l == tgt_l;
 798 }
 799
 800 /* Called from syscall */
 801 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 802 {
 803         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 804         struct hlist_nulls_head *head;
 805         struct htab_elem *l, *next_l;
 806         u32 hash, key_size;
 807         int i = 0;
 808
 809         WARN_ON_ONCE(!rcu_read_lock_held());
 810
 811         key_size = map->key_size;
 812
 813         if (!key)
 814                 goto find_first_elem;
 815
 816         hash = htab_map_hash(key, key_size, htab->hashrnd);
 817
 818         head = select_bucket(htab, hash);
 819
 820         /* lookup the key */
 821         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 822
 823         if (!l)
 824                 goto find_first_elem;
 825
 826         /* key was found, get next key in the same bucket */
 827         next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
 828                                   struct htab_elem, hash_node);
 829
 830         if (next_l) {
 831                 /* if next elem in this hash list is non-zero, just return it */
 832                 memcpy(next_key, next_l->key, key_size);
 833                 return 0;
 834         }
 835
 836         /* no more elements in this hash list, go to the next bucket */
 837         i = hash & (htab->n_buckets - 1);
 838         i++;
 839
 840 find_first_elem:
 841         /* iterate over buckets */
 842         for (; i < htab->n_buckets; i++) {
 843                 head = select_bucket(htab, i);
 844
 845                 /* pick first element in the bucket */
 846                 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
 847                                           struct htab_elem, hash_node);
 848                 if (next_l) {
 849                         /* if it's not empty, just return it */
 850                         memcpy(next_key, next_l->key, key_size);
 851                         return 0;
 852                 }
 853         }
 854
 855         /* iterated over all buckets and all elements */
 856         return -ENOENT;
 857 }
 858
 859 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 860 {
 861         if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 862                 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
 863         check_and_free_fields(htab, l);
 864         bpf_mem_cache_free(&htab->ma, l);
 865 }
 866
 867 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
 868 {
 869         struct bpf_map *map = &htab->map;
 870         void *ptr;
 871
 872         if (map->ops->map_fd_put_ptr) {
 873                 ptr = fd_htab_map_get_ptr(map, l);
 874                 map->ops->map_fd_put_ptr(ptr);
 875         }
 876 }
 877
 878 static bool is_map_full(struct bpf_htab *htab)
 879 {
 880         if (htab->use_percpu_counter)
 881                 return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
 882                                                 PERCPU_COUNTER_BATCH) >= 0;
 883         return atomic_read(&htab->count) >= htab->map.max_entries;
 884 }
 885
 886 static void inc_elem_count(struct bpf_htab *htab)
 887 {
 888         if (htab->use_percpu_counter)
 889                 percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
 890         else
 891                 atomic_inc(&htab->count);
 892 }
 893
 894 static void dec_elem_count(struct bpf_htab *htab)
 895 {
 896         if (htab->use_percpu_counter)
 897                 percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
 898         else
 899                 atomic_dec(&htab->count);
 900 }
 901
 902
 903 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 904 {
 905         htab_put_fd_value(htab, l);
 906
 907         if (htab_is_prealloc(htab)) {
 908                 check_and_free_fields(htab, l);
 909                 __pcpu_freelist_push(&htab->freelist, &l->fnode);
 910         } else {
 911                 dec_elem_count(htab);
 912                 htab_elem_free(htab, l);
 913         }
 914 }
 915
 916 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 917                             void *value, bool onallcpus)
 918 {
 919         if (!onallcpus) {
 920                 /* copy true value_size bytes */
 921                 memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
 922         } else {
 923                 u32 size = round_up(htab->map.value_size, 8);
 924                 int off = 0, cpu;
 925
 926                 for_each_possible_cpu(cpu) {
 927                         bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
 928                                         value + off, size);
 929                         off += size;
 930                 }
 931         }
 932 }
 933
 934 static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
 935                             void *value, bool onallcpus)
 936 {
 937         /* When not setting the initial value on all cpus, zero-fill element
 938          * values for other cpus. Otherwise, bpf program has no way to ensure
 939          * known initial values for cpus other than current one
 940          * (onallcpus=false always when coming from bpf prog).
 941          */
 942         if (!onallcpus) {
 943                 u32 size = round_up(htab->map.value_size, 8);
 944                 int current_cpu = raw_smp_processor_id();
 945                 int cpu;
 946
 947                 for_each_possible_cpu(cpu) {
 948                         if (cpu == current_cpu)
 949                                 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
 950                                                 size);
 951                         else
 952                                 memset(per_cpu_ptr(pptr, cpu), 0, size);
 953                 }
 954         } else {
 955                 pcpu_copy_value(htab, pptr, value, onallcpus);
 956         }
 957 }
 958
 959 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
 960 {
 961         return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
 962                BITS_PER_LONG == 64;
 963 }
 964
 965 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 966                                          void *value, u32 key_size, u32 hash,
 967                                          bool percpu, bool onallcpus,
 968                                          struct htab_elem *old_elem)
 969 {
 970         u32 size = htab->map.value_size;
 971         bool prealloc = htab_is_prealloc(htab);
 972         struct htab_elem *l_new, **pl_new;
 973         void __percpu *pptr;
 974
 975         if (prealloc) {
 976                 if (old_elem) {
 977                         /* if we're updating the existing element,
 978                          * use per-cpu extra elems to avoid freelist_pop/push
 979                          */
 980                         pl_new = this_cpu_ptr(htab->extra_elems);
 981                         l_new = *pl_new;
 982                         htab_put_fd_value(htab, old_elem);
 983                         *pl_new = old_elem;
 984                 } else {
 985                         struct pcpu_freelist_node *l;
 986
 987                         l = __pcpu_freelist_pop(&htab->freelist);
 988                         if (!l)
 989                                 return ERR_PTR(-E2BIG);
 990                         l_new = container_of(l, struct htab_elem, fnode);
 991                 }
 992         } else {
 993                 if (is_map_full(htab))
 994                         if (!old_elem)
 995                                 /* when map is full and update() is replacing
 996                                  * old element, it's ok to allocate, since
 997                                  * old element will be freed immediately.
 998                                  * Otherwise return an error
 999                                  */
1000                                 return ERR_PTR(-E2BIG);
1001                 inc_elem_count(htab);
1002                 l_new = bpf_mem_cache_alloc(&htab->ma);
1003                 if (!l_new) {
1004                         l_new = ERR_PTR(-ENOMEM);
1005                         goto dec_count;
1006                 }
1007                 check_and_init_map_value(&htab->map,
1008                                          l_new->key + round_up(key_size, 8));
1009         }
1010
1011         memcpy(l_new->key, key, key_size);
1012         if (percpu) {
1013                 if (prealloc) {
1014                         pptr = htab_elem_get_ptr(l_new, key_size);
1015                 } else {
1016                         /* alloc_percpu zero-fills */
1017                         pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
1018                         if (!pptr) {
1019                                 bpf_mem_cache_free(&htab->ma, l_new);
1020                                 l_new = ERR_PTR(-ENOMEM);
1021                                 goto dec_count;
1022                         }
1023                         l_new->ptr_to_pptr = pptr;
1024                         pptr = *(void **)pptr;
1025                 }
1026
1027                 pcpu_init_value(htab, pptr, value, onallcpus);
1028
1029                 if (!prealloc)
1030                         htab_elem_set_ptr(l_new, key_size, pptr);
1031         } else if (fd_htab_map_needs_adjust(htab)) {
1032                 size = round_up(size, 8);
1033                 memcpy(l_new->key + round_up(key_size, 8), value, size);
1034         } else {
1035                 copy_map_value(&htab->map,
1036                                l_new->key + round_up(key_size, 8),
1037                                value);
1038         }
1039
1040         l_new->hash = hash;
1041         return l_new;
1042 dec_count:
1043         dec_elem_count(htab);
1044         return l_new;
1045 }
1046
1047 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1048                        u64 map_flags)
1049 {
1050         if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1051                 /* elem already exists */
1052                 return -EEXIST;
1053
1054         if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1055                 /* elem doesn't exist, cannot update it */
1056                 return -ENOENT;
1057
1058         return 0;
1059 }
1060
1061 /* Called from syscall or from eBPF program */
1062 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1063                                 u64 map_flags)
1064 {
1065         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1066         struct htab_elem *l_new = NULL, *l_old;
1067         struct hlist_nulls_head *head;
1068         unsigned long flags;
1069         struct bucket *b;
1070         u32 key_size, hash;
1071         int ret;
1072
1073         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1074                 /* unknown flags */
1075                 return -EINVAL;
1076
1077         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1078                      !rcu_read_lock_bh_held());
1079
1080         key_size = map->key_size;
1081
1082         hash = htab_map_hash(key, key_size, htab->hashrnd);
1083
1084         b = __select_bucket(htab, hash);
1085         head = &b->head;
1086
1087         if (unlikely(map_flags & BPF_F_LOCK)) {
1088                 if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1089                         return -EINVAL;
1090                 /* find an element without taking the bucket lock */
1091                 l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1092                                               htab->n_buckets);
1093                 ret = check_flags(htab, l_old, map_flags);
1094                 if (ret)
1095                         return ret;
1096                 if (l_old) {
1097                         /* grab the element lock and update value in place */
1098                         copy_map_value_locked(map,
1099                                               l_old->key + round_up(key_size, 8),
1100                                               value, false);
1101                         return 0;
1102                 }
1103                 /* fall through, grab the bucket lock and lookup again.
1104                  * 99.9% chance that the element won't be found,
1105                  * but second lookup under lock has to be done.
1106                  */
1107         }
1108
1109         ret = htab_lock_bucket(htab, b, hash, &flags);
1110         if (ret)
1111                 return ret;
1112
1113         l_old = lookup_elem_raw(head, hash, key, key_size);
1114
1115         ret = check_flags(htab, l_old, map_flags);
1116         if (ret)
1117                 goto err;
1118
1119         if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1120                 /* first lookup without the bucket lock didn't find the element,
1121                  * but second lookup with the bucket lock found it.
1122                  * This case is highly unlikely, but has to be dealt with:
1123                  * grab the element lock in addition to the bucket lock
1124                  * and update element in place
1125                  */
1126                 copy_map_value_locked(map,
1127                                       l_old->key + round_up(key_size, 8),
1128                                       value, false);
1129                 ret = 0;
1130                 goto err;
1131         }
1132
1133         l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1134                                 l_old);
1135         if (IS_ERR(l_new)) {
1136                 /* all pre-allocated elements are in use or memory exhausted */
1137                 ret = PTR_ERR(l_new);
1138                 goto err;
1139         }
1140
1141         /* add new element to the head of the list, so that
1142          * concurrent search will find it before old elem
1143          */
1144         hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1145         if (l_old) {
1146                 hlist_nulls_del_rcu(&l_old->hash_node);
1147                 if (!htab_is_prealloc(htab))
1148                         free_htab_elem(htab, l_old);
1149                 else
1150                         check_and_free_fields(htab, l_old);
1151         }
1152         ret = 0;
1153 err:
1154         htab_unlock_bucket(htab, b, hash, flags);
1155         return ret;
1156 }
1157
1158 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1159 {
1160         check_and_free_fields(htab, elem);
1161         bpf_lru_push_free(&htab->lru, &elem->lru_node);
1162 }
1163
1164 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1165                                     u64 map_flags)
1166 {
1167         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1168         struct htab_elem *l_new, *l_old = NULL;
1169         struct hlist_nulls_head *head;
1170         unsigned long flags;
1171         struct bucket *b;
1172         u32 key_size, hash;
1173         int ret;
1174
1175         if (unlikely(map_flags > BPF_EXIST))
1176                 /* unknown flags */
1177                 return -EINVAL;
1178
1179         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1180                      !rcu_read_lock_bh_held());
1181
1182         key_size = map->key_size;
1183
1184         hash = htab_map_hash(key, key_size, htab->hashrnd);
1185
1186         b = __select_bucket(htab, hash);
1187         head = &b->head;
1188
1189         /* For LRU, we need to alloc before taking bucket's
1190          * spinlock because getting free nodes from LRU may need
1191          * to remove older elements from htab and this removal
1192          * operation will need a bucket lock.
1193          */
1194         l_new = prealloc_lru_pop(htab, key, hash);
1195         if (!l_new)
1196                 return -ENOMEM;
1197         copy_map_value(&htab->map,
1198                        l_new->key + round_up(map->key_size, 8), value);
1199
1200         ret = htab_lock_bucket(htab, b, hash, &flags);
1201         if (ret)
1202                 return ret;
1203
1204         l_old = lookup_elem_raw(head, hash, key, key_size);
1205
1206         ret = check_flags(htab, l_old, map_flags);
1207         if (ret)
1208                 goto err;
1209
1210         /* add new element to the head of the list, so that
1211          * concurrent search will find it before old elem
1212          */
1213         hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1214         if (l_old) {
1215                 bpf_lru_node_set_ref(&l_new->lru_node);
1216                 hlist_nulls_del_rcu(&l_old->hash_node);
1217         }
1218         ret = 0;
1219
1220 err:
1221         htab_unlock_bucket(htab, b, hash, flags);
1222
1223         if (ret)
1224                 htab_lru_push_free(htab, l_new);
1225         else if (l_old)
1226                 htab_lru_push_free(htab, l_old);
1227
1228         return ret;
1229 }
1230
1231 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1232                                          void *value, u64 map_flags,
1233                                          bool onallcpus)
1234 {
1235         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1236         struct htab_elem *l_new = NULL, *l_old;
1237         struct hlist_nulls_head *head;
1238         unsigned long flags;
1239         struct bucket *b;
1240         u32 key_size, hash;
1241         int ret;
1242
1243         if (unlikely(map_flags > BPF_EXIST))
1244                 /* unknown flags */
1245                 return -EINVAL;
1246
1247         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1248                      !rcu_read_lock_bh_held());
1249
1250         key_size = map->key_size;
1251
1252         hash = htab_map_hash(key, key_size, htab->hashrnd);
1253
1254         b = __select_bucket(htab, hash);
1255         head = &b->head;
1256
1257         ret = htab_lock_bucket(htab, b, hash, &flags);
1258         if (ret)
1259                 return ret;
1260
1261         l_old = lookup_elem_raw(head, hash, key, key_size);
1262
1263         ret = check_flags(htab, l_old, map_flags);
1264         if (ret)
1265                 goto err;
1266
1267         if (l_old) {
1268                 /* per-cpu hash map can update value in-place */
1269                 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1270                                 value, onallcpus);
1271         } else {
1272                 l_new = alloc_htab_elem(htab, key, value, key_size,
1273                                         hash, true, onallcpus, NULL);
1274                 if (IS_ERR(l_new)) {
1275                         ret = PTR_ERR(l_new);
1276                         goto err;
1277                 }
1278                 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1279         }
1280         ret = 0;
1281 err:
1282         htab_unlock_bucket(htab, b, hash, flags);
1283         return ret;
1284 }
1285
1286 static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1287                                              void *value, u64 map_flags,
1288                                              bool onallcpus)
1289 {
1290         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1291         struct htab_elem *l_new = NULL, *l_old;
1292         struct hlist_nulls_head *head;
1293         unsigned long flags;
1294         struct bucket *b;
1295         u32 key_size, hash;
1296         int ret;
1297
1298         if (unlikely(map_flags > BPF_EXIST))
1299                 /* unknown flags */
1300                 return -EINVAL;
1301
1302         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1303                      !rcu_read_lock_bh_held());
1304
1305         key_size = map->key_size;
1306
1307         hash = htab_map_hash(key, key_size, htab->hashrnd);
1308
1309         b = __select_bucket(htab, hash);
1310         head = &b->head;
1311
1312         /* For LRU, we need to alloc before taking bucket's
1313          * spinlock because LRU's elem alloc may need
1314          * to remove older elem from htab and this removal
1315          * operation will need a bucket lock.
1316          */
1317         if (map_flags != BPF_EXIST) {
1318                 l_new = prealloc_lru_pop(htab, key, hash);
1319                 if (!l_new)
1320                         return -ENOMEM;
1321         }
1322
1323         ret = htab_lock_bucket(htab, b, hash, &flags);
1324         if (ret)
1325                 return ret;
1326
1327         l_old = lookup_elem_raw(head, hash, key, key_size);
1328
1329         ret = check_flags(htab, l_old, map_flags);
1330         if (ret)
1331                 goto err;
1332
1333         if (l_old) {
1334                 bpf_lru_node_set_ref(&l_old->lru_node);
1335
1336                 /* per-cpu hash map can update value in-place */
1337                 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1338                                 value, onallcpus);
1339         } else {
1340                 pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1341                                 value, onallcpus);
1342                 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1343                 l_new = NULL;
1344         }
1345         ret = 0;
1346 err:
1347         htab_unlock_bucket(htab, b, hash, flags);
1348         if (l_new)
1349                 bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1350         return ret;
1351 }
1352
1353 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1354                                        void *value, u64 map_flags)
1355 {
1356         return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
1357 }
1358
1359 static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1360                                            void *value, u64 map_flags)
1361 {
1362         return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1363                                                  false);
1364 }
1365
1366 /* Called from syscall or from eBPF program */
1367 static int htab_map_delete_elem(struct bpf_map *map, void *key)
1368 {
1369         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1370         struct hlist_nulls_head *head;
1371         struct bucket *b;
1372         struct htab_elem *l;
1373         unsigned long flags;
1374         u32 hash, key_size;
1375         int ret;
1376
1377         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1378                      !rcu_read_lock_bh_held());
1379
1380         key_size = map->key_size;
1381
1382         hash = htab_map_hash(key, key_size, htab->hashrnd);
1383         b = __select_bucket(htab, hash);
1384         head = &b->head;
1385
1386         ret = htab_lock_bucket(htab, b, hash, &flags);
1387         if (ret)
1388                 return ret;
1389
1390         l = lookup_elem_raw(head, hash, key, key_size);
1391
1392         if (l) {
1393                 hlist_nulls_del_rcu(&l->hash_node);
1394                 free_htab_elem(htab, l);
1395         } else {
1396                 ret = -ENOENT;
1397         }
1398
1399         htab_unlock_bucket(htab, b, hash, flags);
1400         return ret;
1401 }
1402
1403 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1404 {
1405         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1406         struct hlist_nulls_head *head;
1407         struct bucket *b;
1408         struct htab_elem *l;
1409         unsigned long flags;
1410         u32 hash, key_size;
1411         int ret;
1412
1413         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1414                      !rcu_read_lock_bh_held());
1415
1416         key_size = map->key_size;
1417
1418         hash = htab_map_hash(key, key_size, htab->hashrnd);
1419         b = __select_bucket(htab, hash);
1420         head = &b->head;
1421
1422         ret = htab_lock_bucket(htab, b, hash, &flags);
1423         if (ret)
1424                 return ret;
1425
1426         l = lookup_elem_raw(head, hash, key, key_size);
1427
1428         if (l)
1429                 hlist_nulls_del_rcu(&l->hash_node);
1430         else
1431                 ret = -ENOENT;
1432
1433         htab_unlock_bucket(htab, b, hash, flags);
1434         if (l)
1435                 htab_lru_push_free(htab, l);
1436         return ret;
1437 }
1438
1439 static void delete_all_elements(struct bpf_htab *htab)
1440 {
1441         int i;
1442
1443         /* It's called from a worker thread, so disable migration here,
1444          * since bpf_mem_cache_free() relies on that.
1445          */
1446         migrate_disable();
1447         for (i = 0; i < htab->n_buckets; i++) {
1448                 struct hlist_nulls_head *head = select_bucket(htab, i);
1449                 struct hlist_nulls_node *n;
1450                 struct htab_elem *l;
1451
1452                 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1453                         hlist_nulls_del_rcu(&l->hash_node);
1454                         htab_elem_free(htab, l);
1455                 }
1456         }
1457         migrate_enable();
1458 }
1459
1460 static void htab_free_malloced_timers(struct bpf_htab *htab)
1461 {
1462         int i;
1463
1464         rcu_read_lock();
1465         for (i = 0; i < htab->n_buckets; i++) {
1466                 struct hlist_nulls_head *head = select_bucket(htab, i);
1467                 struct hlist_nulls_node *n;
1468                 struct htab_elem *l;
1469
1470                 hlist_nulls_for_each_entry(l, n, head, hash_node) {
1471                         /* We only free timer on uref dropping to zero */
1472                         bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
1473                 }
1474                 cond_resched_rcu();
1475         }
1476         rcu_read_unlock();
1477 }
1478
1479 static void htab_map_free_timers(struct bpf_map *map)
1480 {
1481         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1482
1483         /* We only free timer on uref dropping to zero */
1484         if (!btf_record_has_field(htab->map.record, BPF_TIMER))
1485                 return;
1486         if (!htab_is_prealloc(htab))
1487                 htab_free_malloced_timers(htab);
1488         else
1489                 htab_free_prealloced_timers(htab);
1490 }
1491
1492 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1493 static void htab_map_free(struct bpf_map *map)
1494 {
1495         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1496         int i;
1497
1498         /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1499          * bpf_free_used_maps() is called after bpf prog is no longer executing.
1500          * There is no need to synchronize_rcu() here to protect map elements.
1501          */
1502
1503         /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
1504          * underneath and is reponsible for waiting for callbacks to finish
1505          * during bpf_mem_alloc_destroy().
1506          */
1507         if (!htab_is_prealloc(htab)) {
1508                 delete_all_elements(htab);
1509         } else {
1510                 htab_free_prealloced_fields(htab);
1511                 prealloc_destroy(htab);
1512         }
1513
1514         free_percpu(htab->extra_elems);
1515         bpf_map_area_free(htab->buckets);
1516         bpf_mem_alloc_destroy(&htab->pcpu_ma);
1517         bpf_mem_alloc_destroy(&htab->ma);
1518         if (htab->use_percpu_counter)
1519                 percpu_counter_destroy(&htab->pcount);
1520         for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
1521                 free_percpu(htab->map_locked[i]);
1522         lockdep_unregister_key(&htab->lockdep_key);
1523         bpf_map_area_free(htab);
1524 }
1525
1526 static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1527                                    struct seq_file *m)
1528 {
1529         void *value;
1530
1531         rcu_read_lock();
1532
1533         value = htab_map_lookup_elem(map, key);
1534         if (!value) {
1535                 rcu_read_unlock();
1536                 return;
1537         }
1538
1539         btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1540         seq_puts(m, ": ");
1541         btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1542         seq_puts(m, "\n");
1543
1544         rcu_read_unlock();
1545 }
1546
1547 static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1548                                              void *value, bool is_lru_map,
1549                                              bool is_percpu, u64 flags)
1550 {
1551         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1552         struct hlist_nulls_head *head;
1553         unsigned long bflags;
1554         struct htab_elem *l;
1555         u32 hash, key_size;
1556         struct bucket *b;
1557         int ret;
1558
1559         key_size = map->key_size;
1560
1561         hash = htab_map_hash(key, key_size, htab->hashrnd);
1562         b = __select_bucket(htab, hash);
1563         head = &b->head;
1564
1565         ret = htab_lock_bucket(htab, b, hash, &bflags);
1566         if (ret)
1567                 return ret;
1568
1569         l = lookup_elem_raw(head, hash, key, key_size);
1570         if (!l) {
1571                 ret = -ENOENT;
1572         } else {
1573                 if (is_percpu) {
1574                         u32 roundup_value_size = round_up(map->value_size, 8);
1575                         void __percpu *pptr;
1576                         int off = 0, cpu;
1577
1578                         pptr = htab_elem_get_ptr(l, key_size);
1579                         for_each_possible_cpu(cpu) {
1580                                 bpf_long_memcpy(value + off,
1581                                                 per_cpu_ptr(pptr, cpu),
1582                                                 roundup_value_size);
1583                                 off += roundup_value_size;
1584                         }
1585                 } else {
1586                         u32 roundup_key_size = round_up(map->key_size, 8);
1587
1588                         if (flags & BPF_F_LOCK)
1589                                 copy_map_value_locked(map, value, l->key +
1590                                                       roundup_key_size,
1591                                                       true);
1592                         else
1593                                 copy_map_value(map, value, l->key +
1594                                                roundup_key_size);
1595                         check_and_init_map_value(map, value);
1596                 }
1597
1598                 hlist_nulls_del_rcu(&l->hash_node);
1599                 if (!is_lru_map)
1600                         free_htab_elem(htab, l);
1601         }
1602
1603         htab_unlock_bucket(htab, b, hash, bflags);
1604
1605         if (is_lru_map && l)
1606                 htab_lru_push_free(htab, l);
1607
1608         return ret;
1609 }
1610
1611 static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1612                                            void *value, u64 flags)
1613 {
1614         return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1615                                                  flags);
1616 }
1617
1618 static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1619                                                   void *key, void *value,
1620                                                   u64 flags)
1621 {
1622         return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1623                                                  flags);
1624 }
1625
1626 static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1627                                                void *value, u64 flags)
1628 {
1629         return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1630                                                  flags);
1631 }
1632
1633 static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1634                                                       void *key, void *value,
1635                                                       u64 flags)
1636 {
1637         return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1638                                                  flags);
1639 }
1640
1641 static int
1642 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
1643                                    const union bpf_attr *attr,
1644                                    union bpf_attr __user *uattr,
1645                                    bool do_delete, bool is_lru_map,
1646                                    bool is_percpu)
1647 {
1648         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1649         u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
1650         void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1651         void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1652         void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1653         void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1654         u32 batch, max_count, size, bucket_size, map_id;
1655         struct htab_elem *node_to_free = NULL;
1656         u64 elem_map_flags, map_flags;
1657         struct hlist_nulls_head *head;
1658         struct hlist_nulls_node *n;
1659         unsigned long flags = 0;
1660         bool locked = false;
1661         struct htab_elem *l;
1662         struct bucket *b;
1663         int ret = 0;
1664
1665         elem_map_flags = attr->batch.elem_flags;
1666         if ((elem_map_flags & ~BPF_F_LOCK) ||
1667             ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1668                 return -EINVAL;
1669
1670         map_flags = attr->batch.flags;
1671         if (map_flags)
1672                 return -EINVAL;
1673
1674         max_count = attr->batch.count;
1675         if (!max_count)
1676                 return 0;
1677
1678         if (put_user(0, &uattr->batch.count))
1679                 return -EFAULT;
1680
1681         batch = 0;
1682         if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1683                 return -EFAULT;
1684
1685         if (batch >= htab->n_buckets)
1686                 return -ENOENT;
1687
1688         key_size = htab->map.key_size;
1689         roundup_key_size = round_up(htab->map.key_size, 8);
1690         value_size = htab->map.value_size;
1691         size = round_up(value_size, 8);
1692         if (is_percpu)
1693                 value_size = size * num_possible_cpus();
1694         total = 0;
1695         /* while experimenting with hash tables with sizes ranging from 10 to
1696          * 1000, it was observed that a bucket can have up to 5 entries.
1697          */
1698         bucket_size = 5;
1699
1700 alloc:
1701         /* We cannot do copy_from_user or copy_to_user inside
1702          * the rcu_read_lock. Allocate enough space here.
1703          */
1704         keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1705         values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1706         if (!keys || !values) {
1707                 ret = -ENOMEM;
1708                 goto after_loop;
1709         }
1710
1711 again:
1712         bpf_disable_instrumentation();
1713         rcu_read_lock();
1714 again_nocopy:
1715         dst_key = keys;
1716         dst_val = values;
1717         b = &htab->buckets[batch];
1718         head = &b->head;
1719         /* do not grab the lock unless need it (bucket_cnt > 0). */
1720         if (locked) {
1721                 ret = htab_lock_bucket(htab, b, batch, &flags);
1722                 if (ret) {
1723                         rcu_read_unlock();
1724                         bpf_enable_instrumentation();
1725                         goto after_loop;
1726                 }
1727         }
1728
1729         bucket_cnt = 0;
1730         hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1731                 bucket_cnt++;
1732
1733         if (bucket_cnt && !locked) {
1734                 locked = true;
1735                 goto again_nocopy;
1736         }
1737
1738         if (bucket_cnt > (max_count - total)) {
1739                 if (total == 0)
1740                         ret = -ENOSPC;
1741                 /* Note that since bucket_cnt > 0 here, it is implicit
1742                  * that the locked was grabbed, so release it.
1743                  */
1744                 htab_unlock_bucket(htab, b, batch, flags);
1745                 rcu_read_unlock();
1746                 bpf_enable_instrumentation();
1747                 goto after_loop;
1748         }
1749
1750         if (bucket_cnt > bucket_size) {
1751                 bucket_size = bucket_cnt;
1752                 /* Note that since bucket_cnt > 0 here, it is implicit
1753                  * that the locked was grabbed, so release it.
1754                  */
1755                 htab_unlock_bucket(htab, b, batch, flags);
1756                 rcu_read_unlock();
1757                 bpf_enable_instrumentation();
1758                 kvfree(keys);
1759                 kvfree(values);
1760                 goto alloc;
1761         }
1762
1763         /* Next block is only safe to run if you have grabbed the lock */
1764         if (!locked)
1765                 goto next_batch;
1766
1767         hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1768                 memcpy(dst_key, l->key, key_size);
1769
1770                 if (is_percpu) {
1771                         int off = 0, cpu;
1772                         void __percpu *pptr;
1773
1774                         pptr = htab_elem_get_ptr(l, map->key_size);
1775                         for_each_possible_cpu(cpu) {
1776                                 bpf_long_memcpy(dst_val + off,
1777                                                 per_cpu_ptr(pptr, cpu), size);
1778                                 off += size;
1779                         }
1780                 } else {
1781                         value = l->key + roundup_key_size;
1782                         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
1783                                 struct bpf_map **inner_map = value;
1784
1785                                  /* Actual value is the id of the inner map */
1786                                 map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
1787                                 value = &map_id;
1788                         }
1789
1790                         if (elem_map_flags & BPF_F_LOCK)
1791                                 copy_map_value_locked(map, dst_val, value,
1792                                                       true);
1793                         else
1794                                 copy_map_value(map, dst_val, value);
1795                         check_and_init_map_value(map, dst_val);
1796                 }
1797                 if (do_delete) {
1798                         hlist_nulls_del_rcu(&l->hash_node);
1799
1800                         /* bpf_lru_push_free() will acquire lru_lock, which
1801                          * may cause deadlock. See comments in function
1802                          * prealloc_lru_pop(). Let us do bpf_lru_push_free()
1803                          * after releasing the bucket lock.
1804                          */
1805                         if (is_lru_map) {
1806                                 l->batch_flink = node_to_free;
1807                                 node_to_free = l;
1808                         } else {
1809                                 free_htab_elem(htab, l);
1810                         }
1811                 }
1812                 dst_key += key_size;
1813                 dst_val += value_size;
1814         }
1815
1816         htab_unlock_bucket(htab, b, batch, flags);
1817         locked = false;
1818
1819         while (node_to_free) {
1820                 l = node_to_free;
1821                 node_to_free = node_to_free->batch_flink;
1822                 htab_lru_push_free(htab, l);
1823         }
1824
1825 next_batch:
1826         /* If we are not copying data, we can go to next bucket and avoid
1827          * unlocking the rcu.
1828          */
1829         if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1830                 batch++;
1831                 goto again_nocopy;
1832         }
1833
1834         rcu_read_unlock();
1835         bpf_enable_instrumentation();
1836         if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1837             key_size * bucket_cnt) ||
1838             copy_to_user(uvalues + total * value_size, values,
1839             value_size * bucket_cnt))) {
1840                 ret = -EFAULT;
1841                 goto after_loop;
1842         }
1843
1844         total += bucket_cnt;
1845         batch++;
1846         if (batch >= htab->n_buckets) {
1847                 ret = -ENOENT;
1848                 goto after_loop;
1849         }
1850         goto again;
1851
1852 after_loop:
1853         if (ret == -EFAULT)
1854                 goto out;
1855
1856         /* copy # of entries and next batch */
1857         ubatch = u64_to_user_ptr(attr->batch.out_batch);
1858         if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1859             put_user(total, &uattr->batch.count))
1860                 ret = -EFAULT;
1861
1862 out:
1863         kvfree(keys);
1864         kvfree(values);
1865         return ret;
1866 }
1867
1868 static int
1869 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1870                              union bpf_attr __user *uattr)
1871 {
1872         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1873                                                   false, true);
1874 }
1875
1876 static int
1877 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1878                                         const union bpf_attr *attr,
1879                                         union bpf_attr __user *uattr)
1880 {
1881         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1882                                                   false, true);
1883 }
1884
1885 static int
1886 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1887                       union bpf_attr __user *uattr)
1888 {
1889         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1890                                                   false, false);
1891 }
1892
1893 static int
1894 htab_map_lookup_and_delete_batch(struct bpf_map *map,
1895                                  const union bpf_attr *attr,
1896                                  union bpf_attr __user *uattr)
1897 {
1898         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1899                                                   false, false);
1900 }
1901
1902 static int
1903 htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
1904                                  const union bpf_attr *attr,
1905                                  union bpf_attr __user *uattr)
1906 {
1907         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1908                                                   true, true);
1909 }
1910
1911 static int
1912 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1913                                             const union bpf_attr *attr,
1914                                             union bpf_attr __user *uattr)
1915 {
1916         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1917                                                   true, true);
1918 }
1919
1920 static int
1921 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1922                           union bpf_attr __user *uattr)
1923 {
1924         return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1925                                                   true, false);
1926 }
1927
1928 static int
1929 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
1930                                      const union bpf_attr *attr,
1931                                      union bpf_attr __user *uattr)
1932 {
1933         return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1934                                                   true, false);
1935 }
1936
1937 struct bpf_iter_seq_hash_map_info {
1938         struct bpf_map *map;
1939         struct bpf_htab *htab;
1940         void *percpu_value_buf; // non-zero means percpu hash
1941         u32 bucket_id;
1942         u32 skip_elems;
1943 };
1944
1945 static struct htab_elem *
1946 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
1947                            struct htab_elem *prev_elem)
1948 {
1949         const struct bpf_htab *htab = info->htab;
1950         u32 skip_elems = info->skip_elems;
1951         u32 bucket_id = info->bucket_id;
1952         struct hlist_nulls_head *head;
1953         struct hlist_nulls_node *n;
1954         struct htab_elem *elem;
1955         struct bucket *b;
1956         u32 i, count;
1957
1958         if (bucket_id >= htab->n_buckets)
1959                 return NULL;
1960
1961         /* try to find next elem in the same bucket */
1962         if (prev_elem) {
1963                 /* no update/deletion on this bucket, prev_elem should be still valid
1964                  * and we won't skip elements.
1965                  */
1966                 n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
1967                 elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
1968                 if (elem)
1969                         return elem;
1970
1971                 /* not found, unlock and go to the next bucket */
1972                 b = &htab->buckets[bucket_id++];
1973                 rcu_read_unlock();
1974                 skip_elems = 0;
1975         }
1976
1977         for (i = bucket_id; i < htab->n_buckets; i++) {
1978                 b = &htab->buckets[i];
1979                 rcu_read_lock();
1980
1981                 count = 0;
1982                 head = &b->head;
1983                 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
1984                         if (count >= skip_elems) {
1985                                 info->bucket_id = i;
1986                                 info->skip_elems = count;
1987                                 return elem;
1988                         }
1989                         count++;
1990                 }
1991
1992                 rcu_read_unlock();
1993                 skip_elems = 0;
1994         }
1995
1996         info->bucket_id = i;
1997         info->skip_elems = 0;
1998         return NULL;
1999 }
2000
2001 static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
2002 {
2003         struct bpf_iter_seq_hash_map_info *info = seq->private;
2004         struct htab_elem *elem;
2005
2006         elem = bpf_hash_map_seq_find_next(info, NULL);
2007         if (!elem)
2008                 return NULL;
2009
2010         if (*pos == 0)
2011                 ++*pos;
2012         return elem;
2013 }
2014
2015 static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2016 {
2017         struct bpf_iter_seq_hash_map_info *info = seq->private;
2018
2019         ++*pos;
2020         ++info->skip_elems;
2021         return bpf_hash_map_seq_find_next(info, v);
2022 }
2023
2024 static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
2025 {
2026         struct bpf_iter_seq_hash_map_info *info = seq->private;
2027         u32 roundup_key_size, roundup_value_size;
2028         struct bpf_iter__bpf_map_elem ctx = {};
2029         struct bpf_map *map = info->map;
2030         struct bpf_iter_meta meta;
2031         int ret = 0, off = 0, cpu;
2032         struct bpf_prog *prog;
2033         void __percpu *pptr;
2034
2035         meta.seq = seq;
2036         prog = bpf_iter_get_info(&meta, elem == NULL);
2037         if (prog) {
2038                 ctx.meta = &meta;
2039                 ctx.map = info->map;
2040                 if (elem) {
2041                         roundup_key_size = round_up(map->key_size, 8);
2042                         ctx.key = elem->key;
2043                         if (!info->percpu_value_buf) {
2044                                 ctx.value = elem->key + roundup_key_size;
2045                         } else {
2046                                 roundup_value_size = round_up(map->value_size, 8);
2047                                 pptr = htab_elem_get_ptr(elem, map->key_size);
2048                                 for_each_possible_cpu(cpu) {
2049                                         bpf_long_memcpy(info->percpu_value_buf + off,
2050                                                         per_cpu_ptr(pptr, cpu),
2051                                                         roundup_value_size);
2052                                         off += roundup_value_size;
2053                                 }
2054                                 ctx.value = info->percpu_value_buf;
2055                         }
2056                 }
2057                 ret = bpf_iter_run_prog(prog, &ctx);
2058         }
2059
2060         return ret;
2061 }
2062
2063 static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
2064 {
2065         return __bpf_hash_map_seq_show(seq, v);
2066 }
2067
2068 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
2069 {
2070         if (!v)
2071                 (void)__bpf_hash_map_seq_show(seq, NULL);
2072         else
2073                 rcu_read_unlock();
2074 }
2075
2076 static int bpf_iter_init_hash_map(void *priv_data,
2077                                   struct bpf_iter_aux_info *aux)
2078 {
2079         struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2080         struct bpf_map *map = aux->map;
2081         void *value_buf;
2082         u32 buf_size;
2083
2084         if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2085             map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2086                 buf_size = round_up(map->value_size, 8) * num_possible_cpus();
2087                 value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2088                 if (!value_buf)
2089                         return -ENOMEM;
2090
2091                 seq_info->percpu_value_buf = value_buf;
2092         }
2093
2094         bpf_map_inc_with_uref(map);
2095         seq_info->map = map;
2096         seq_info->htab = container_of(map, struct bpf_htab, map);
2097         return 0;
2098 }
2099
2100 static void bpf_iter_fini_hash_map(void *priv_data)
2101 {
2102         struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2103
2104         bpf_map_put_with_uref(seq_info->map);
2105         kfree(seq_info->percpu_value_buf);
2106 }
2107
2108 static const struct seq_operations bpf_hash_map_seq_ops = {
2109         .start  = bpf_hash_map_seq_start,
2110         .next   = bpf_hash_map_seq_next,
2111         .stop   = bpf_hash_map_seq_stop,
2112         .show   = bpf_hash_map_seq_show,
2113 };
2114
2115 static const struct bpf_iter_seq_info iter_seq_info = {
2116         .seq_ops                = &bpf_hash_map_seq_ops,
2117         .init_seq_private       = bpf_iter_init_hash_map,
2118         .fini_seq_private       = bpf_iter_fini_hash_map,
2119         .seq_priv_size          = sizeof(struct bpf_iter_seq_hash_map_info),
2120 };
2121
2122 static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
2123                                   void *callback_ctx, u64 flags)
2124 {
2125         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2126         struct hlist_nulls_head *head;
2127         struct hlist_nulls_node *n;
2128         struct htab_elem *elem;
2129         u32 roundup_key_size;
2130         int i, num_elems = 0;
2131         void __percpu *pptr;
2132         struct bucket *b;
2133         void *key, *val;
2134         bool is_percpu;
2135         u64 ret = 0;
2136
2137         if (flags != 0)
2138                 return -EINVAL;
2139
2140         is_percpu = htab_is_percpu(htab);
2141
2142         roundup_key_size = round_up(map->key_size, 8);
2143         /* disable migration so percpu value prepared here will be the
2144          * same as the one seen by the bpf program with bpf_map_lookup_elem().
2145          */
2146         if (is_percpu)
2147                 migrate_disable();
2148         for (i = 0; i < htab->n_buckets; i++) {
2149                 b = &htab->buckets[i];
2150                 rcu_read_lock();
2151                 head = &b->head;
2152                 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
2153                         key = elem->key;
2154                         if (is_percpu) {
2155                                 /* current cpu value for percpu map */
2156                                 pptr = htab_elem_get_ptr(elem, map->key_size);
2157                                 val = this_cpu_ptr(pptr);
2158                         } else {
2159                                 val = elem->key + roundup_key_size;
2160                         }
2161                         num_elems++;
2162                         ret = callback_fn((u64)(long)map, (u64)(long)key,
2163                                           (u64)(long)val, (u64)(long)callback_ctx, 0);
2164                         /* return value: 0 - continue, 1 - stop and return */
2165                         if (ret) {
2166                                 rcu_read_unlock();
2167                                 goto out;
2168                         }
2169                 }
2170                 rcu_read_unlock();
2171         }
2172 out:
2173         if (is_percpu)
2174                 migrate_enable();
2175         return num_elems;
2176 }
2177
2178 BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
2179 const struct bpf_map_ops htab_map_ops = {
2180         .map_meta_equal = bpf_map_meta_equal,
2181         .map_alloc_check = htab_map_alloc_check,
2182         .map_alloc = htab_map_alloc,
2183         .map_free = htab_map_free,
2184         .map_get_next_key = htab_map_get_next_key,
2185         .map_release_uref = htab_map_free_timers,
2186         .map_lookup_elem = htab_map_lookup_elem,
2187         .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
2188         .map_update_elem = htab_map_update_elem,
2189         .map_delete_elem = htab_map_delete_elem,
2190         .map_gen_lookup = htab_map_gen_lookup,
2191         .map_seq_show_elem = htab_map_seq_show_elem,
2192         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2193         .map_for_each_callback = bpf_for_each_hash_elem,
2194         BATCH_OPS(htab),
2195         .map_btf_id = &htab_map_btf_ids[0],
2196         .iter_seq_info = &iter_seq_info,
2197 };
2198
2199 const struct bpf_map_ops htab_lru_map_ops = {
2200         .map_meta_equal = bpf_map_meta_equal,
2201         .map_alloc_check = htab_map_alloc_check,
2202         .map_alloc = htab_map_alloc,
2203         .map_free = htab_map_free,
2204         .map_get_next_key = htab_map_get_next_key,
2205         .map_release_uref = htab_map_free_timers,
2206         .map_lookup_elem = htab_lru_map_lookup_elem,
2207         .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
2208         .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
2209         .map_update_elem = htab_lru_map_update_elem,
2210         .map_delete_elem = htab_lru_map_delete_elem,
2211         .map_gen_lookup = htab_lru_map_gen_lookup,
2212         .map_seq_show_elem = htab_map_seq_show_elem,
2213         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2214         .map_for_each_callback = bpf_for_each_hash_elem,
2215         BATCH_OPS(htab_lru),
2216         .map_btf_id = &htab_map_btf_ids[0],
2217         .iter_seq_info = &iter_seq_info,
2218 };
2219
2220 /* Called from eBPF program */
2221 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2222 {
2223         struct htab_elem *l = __htab_map_lookup_elem(map, key);
2224
2225         if (l)
2226                 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2227         else
2228                 return NULL;
2229 }
2230
2231 static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2232 {
2233         struct htab_elem *l;
2234
2235         if (cpu >= nr_cpu_ids)
2236                 return NULL;
2237
2238         l = __htab_map_lookup_elem(map, key);
2239         if (l)
2240                 return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2241         else
2242                 return NULL;
2243 }
2244
2245 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2246 {
2247         struct htab_elem *l = __htab_map_lookup_elem(map, key);
2248
2249         if (l) {
2250                 bpf_lru_node_set_ref(&l->lru_node);
2251                 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2252         }
2253
2254         return NULL;
2255 }
2256
2257 static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
2258 {
2259         struct htab_elem *l;
2260
2261         if (cpu >= nr_cpu_ids)
2262                 return NULL;
2263
2264         l = __htab_map_lookup_elem(map, key);
2265         if (l) {
2266                 bpf_lru_node_set_ref(&l->lru_node);
2267                 return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
2268         }
2269
2270         return NULL;
2271 }
2272
2273 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
2274 {
2275         struct htab_elem *l;
2276         void __percpu *pptr;
2277         int ret = -ENOENT;
2278         int cpu, off = 0;
2279         u32 size;
2280
2281         /* per_cpu areas are zero-filled and bpf programs can only
2282          * access 'value_size' of them, so copying rounded areas
2283          * will not leak any kernel data
2284          */
2285         size = round_up(map->value_size, 8);
2286         rcu_read_lock();
2287         l = __htab_map_lookup_elem(map, key);
2288         if (!l)
2289                 goto out;
2290         /* We do not mark LRU map element here in order to not mess up
2291          * eviction heuristics when user space does a map walk.
2292          */
2293         pptr = htab_elem_get_ptr(l, map->key_size);
2294         for_each_possible_cpu(cpu) {
2295                 bpf_long_memcpy(value + off,
2296                                 per_cpu_ptr(pptr, cpu), size);
2297                 off += size;
2298         }
2299         ret = 0;
2300 out:
2301         rcu_read_unlock();
2302         return ret;
2303 }
2304
2305 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
2306                            u64 map_flags)
2307 {
2308         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2309         int ret;
2310
2311         rcu_read_lock();
2312         if (htab_is_lru(htab))
2313                 ret = __htab_lru_percpu_map_update_elem(map, key, value,
2314                                                         map_flags, true);
2315         else
2316                 ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
2317                                                     true);
2318         rcu_read_unlock();
2319
2320         return ret;
2321 }
2322
2323 static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
2324                                           struct seq_file *m)
2325 {
2326         struct htab_elem *l;
2327         void __percpu *pptr;
2328         int cpu;
2329
2330         rcu_read_lock();
2331
2332         l = __htab_map_lookup_elem(map, key);
2333         if (!l) {
2334                 rcu_read_unlock();
2335                 return;
2336         }
2337
2338         btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
2339         seq_puts(m, ": {\n");
2340         pptr = htab_elem_get_ptr(l, map->key_size);
2341         for_each_possible_cpu(cpu) {
2342                 seq_printf(m, "\tcpu%d: ", cpu);
2343                 btf_type_seq_show(map->btf, map->btf_value_type_id,
2344                                   per_cpu_ptr(pptr, cpu), m);
2345                 seq_puts(m, "\n");
2346         }
2347         seq_puts(m, "}\n");
2348
2349         rcu_read_unlock();
2350 }
2351
2352 const struct bpf_map_ops htab_percpu_map_ops = {
2353         .map_meta_equal = bpf_map_meta_equal,
2354         .map_alloc_check = htab_map_alloc_check,
2355         .map_alloc = htab_map_alloc,
2356         .map_free = htab_map_free,
2357         .map_get_next_key = htab_map_get_next_key,
2358         .map_lookup_elem = htab_percpu_map_lookup_elem,
2359         .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
2360         .map_update_elem = htab_percpu_map_update_elem,
2361         .map_delete_elem = htab_map_delete_elem,
2362         .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
2363         .map_seq_show_elem = htab_percpu_map_seq_show_elem,
2364         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2365         .map_for_each_callback = bpf_for_each_hash_elem,
2366         BATCH_OPS(htab_percpu),
2367         .map_btf_id = &htab_map_btf_ids[0],
2368         .iter_seq_info = &iter_seq_info,
2369 };
2370
2371 const struct bpf_map_ops htab_lru_percpu_map_ops = {
2372         .map_meta_equal = bpf_map_meta_equal,
2373         .map_alloc_check = htab_map_alloc_check,
2374         .map_alloc = htab_map_alloc,
2375         .map_free = htab_map_free,
2376         .map_get_next_key = htab_map_get_next_key,
2377         .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
2378         .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
2379         .map_update_elem = htab_lru_percpu_map_update_elem,
2380         .map_delete_elem = htab_lru_map_delete_elem,
2381         .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
2382         .map_seq_show_elem = htab_percpu_map_seq_show_elem,
2383         .map_set_for_each_callback_args = map_set_for_each_callback_args,
2384         .map_for_each_callback = bpf_for_each_hash_elem,
2385         BATCH_OPS(htab_lru_percpu),
2386         .map_btf_id = &htab_map_btf_ids[0],
2387         .iter_seq_info = &iter_seq_info,
2388 };
2389
2390 static int fd_htab_map_alloc_check(union bpf_attr *attr)
2391 {
2392         if (attr->value_size != sizeof(u32))
2393                 return -EINVAL;
2394         return htab_map_alloc_check(attr);
2395 }
2396
2397 static void fd_htab_map_free(struct bpf_map *map)
2398 {
2399         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2400         struct hlist_nulls_node *n;
2401         struct hlist_nulls_head *head;
2402         struct htab_elem *l;
2403         int i;
2404
2405         for (i = 0; i < htab->n_buckets; i++) {
2406                 head = select_bucket(htab, i);
2407
2408                 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
2409                         void *ptr = fd_htab_map_get_ptr(map, l);
2410
2411                         map->ops->map_fd_put_ptr(ptr);
2412                 }
2413         }
2414
2415         htab_map_free(map);
2416 }
2417
2418 /* only called from syscall */
2419 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
2420 {
2421         void **ptr;
2422         int ret = 0;
2423
2424         if (!map->ops->map_fd_sys_lookup_elem)
2425                 return -ENOTSUPP;
2426
2427         rcu_read_lock();
2428         ptr = htab_map_lookup_elem(map, key);
2429         if (ptr)
2430                 *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
2431         else
2432                 ret = -ENOENT;
2433         rcu_read_unlock();
2434
2435         return ret;
2436 }
2437
2438 /* only called from syscall */
2439 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
2440                                 void *key, void *value, u64 map_flags)
2441 {
2442         void *ptr;
2443         int ret;
2444         u32 ufd = *(u32 *)value;
2445
2446         ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
2447         if (IS_ERR(ptr))
2448                 return PTR_ERR(ptr);
2449
2450         ret = htab_map_update_elem(map, key, &ptr, map_flags);
2451         if (ret)
2452                 map->ops->map_fd_put_ptr(ptr);
2453
2454         return ret;
2455 }
2456
2457 static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
2458 {
2459         struct bpf_map *map, *inner_map_meta;
2460
2461         inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
2462         if (IS_ERR(inner_map_meta))
2463                 return inner_map_meta;
2464
2465         map = htab_map_alloc(attr);
2466         if (IS_ERR(map)) {
2467                 bpf_map_meta_free(inner_map_meta);
2468                 return map;
2469         }
2470
2471         map->inner_map_meta = inner_map_meta;
2472
2473         return map;
2474 }
2475
2476 static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
2477 {
2478         struct bpf_map **inner_map  = htab_map_lookup_elem(map, key);
2479
2480         if (!inner_map)
2481                 return NULL;
2482
2483         return READ_ONCE(*inner_map);
2484 }
2485
2486 static int htab_of_map_gen_lookup(struct bpf_map *map,
2487                                   struct bpf_insn *insn_buf)
2488 {
2489         struct bpf_insn *insn = insn_buf;
2490         const int ret = BPF_REG_0;
2491
2492         BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2493                      (void *(*)(struct bpf_map *map, void *key))NULL));
2494         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
2495         *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
2496         *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
2497                                 offsetof(struct htab_elem, key) +
2498                                 round_up(map->key_size, 8));
2499         *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
2500
2501         return insn - insn_buf;
2502 }
2503
2504 static void htab_of_map_free(struct bpf_map *map)
2505 {
2506         bpf_map_meta_free(map->inner_map_meta);
2507         fd_htab_map_free(map);
2508 }
2509
2510 const struct bpf_map_ops htab_of_maps_map_ops = {
2511         .map_alloc_check = fd_htab_map_alloc_check,
2512         .map_alloc = htab_of_map_alloc,
2513         .map_free = htab_of_map_free,
2514         .map_get_next_key = htab_map_get_next_key,
2515         .map_lookup_elem = htab_of_map_lookup_elem,
2516         .map_delete_elem = htab_map_delete_elem,
2517         .map_fd_get_ptr = bpf_map_fd_get_ptr,
2518         .map_fd_put_ptr = bpf_map_fd_put_ptr,
2519         .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
2520         .map_gen_lookup = htab_of_map_gen_lookup,
2521         .map_check_btf = map_check_no_btf,
2522         BATCH_OPS(htab),
2523         .map_btf_id = &htab_map_btf_ids[0],
2524 };