Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf...
author Jakub Kicinski <[email protected]>
Sat, 24 Jun 2023 21:52:28 +0000 (14:52 -0700)
committer Jakub Kicinski <[email protected]>
Sat, 24 Jun 2023 21:52:28 +0000 (14:52 -0700)
Daniel Borkmann says:

====================
pull-request: bpf-next 2023-06-23

We've added 49 non-merge commits during the last 24 day(s) which contain
a total of 70 files changed, 1935 insertions(+), 442 deletions(-).

The main changes are:

1) Extend bpf_fib_lookup helper to allow passing the route table ID,
   from Louis DeLosSantos.

2) Fix regsafe() in verifier to call check_ids() for scalar registers,
   from Eduard Zingerman.

3) Extend the set of cpumask kfuncs with bpf_cpumask_first_and()
   and a rework of bpf_cpumask_any*() kfuncs. Additionally,
   add selftests, from David Vernet.

4) Fix socket lookup BPF helpers for tc/XDP to respect VRF bindings,
   from Gilad Sever.

5) Change bpf_link_put() to use workqueue unconditionally to fix it
   under PREEMPT_RT, from Sebastian Andrzej Siewior.

6) Follow-ups to address issues in the bpf_refcount shared ownership
   implementation, from Dave Marchevsky.

7) A few general refactorings to BPF map and program creation permissions
   checks which were part of the BPF token series, from Andrii Nakryiko.

8) Various fixes for benchmark framework and add a new benchmark
   for BPF memory allocator to BPF selftests, from Hou Tao.

9) Documentation improvements around iterators and trusted pointers,
   from Anton Protopopov.

10) Small cleanup in verifier to improve allocated object check,
    from Daniel T. Lee.

11) Improve performance of bpf_xdp_pointer() by avoiding access
    to shared_info when XDP packet does not have frags,
    from Jesper Dangaard Brouer.

12) Silence a harmless syzbot-reported warning in btf_type_id_size(),
    from Yonghong Song.

13) Remove duplicate bpfilter_umh_cleanup in favor of umd_cleanup_helper,
    from Jarkko Sakkinen.

14) Fix BPF selftests build for resolve_btfids under custom HOSTCFLAGS,
    from Viktor Malik.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (49 commits)
  bpf, docs: Document existing macros instead of deprecated
  bpf, docs: BPF Iterator Document
  selftests/bpf: Fix compilation failure for prog vrf_socket_lookup
  selftests/bpf: Add vrf_socket_lookup tests
  bpf: Fix bpf socket lookup from tc/xdp to respect socket VRF bindings
  bpf: Call __bpf_sk_lookup()/__bpf_skc_lookup() directly via TC hookpoint
  bpf: Factor out socket lookup functions for the TC hookpoint.
  selftests/bpf: Set the default value of consumer_cnt as 0
  selftests/bpf: Ensure that next_cpu() returns a valid CPU number
  selftests/bpf: Output the correct error code for pthread APIs
  selftests/bpf: Use producer_cnt to allocate local counter array
  xsk: Remove unused inline function xsk_buff_discard()
  bpf: Keep BPF_PROG_LOAD permission checks clear of validations
  bpf: Centralize permissions checks for all BPF map types
  bpf: Inline map creation logic in map_create() function
  bpf: Move unprivileged checks into map_create() and bpf_prog_load()
  bpf: Remove in_atomic() from bpf_link_put().
  selftests/bpf: Verify that check_ids() is used for scalars in regsafe()
  bpf: Verify scalar ids mapping in regsafe() using check_ids()
  selftests/bpf: Check if mark_chain_precision() follows scalar ids
  ...
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
include/linux/netdevice.h
include/uapi/linux/bpf.h
kernel/bpf/btf.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
tools/include/uapi/linux/bpf.h

diff --combined include/linux/netdevice.h
index acf706d49c2b2eefec2007b0d4e511ce94ceda3d,8c95ebbcf2034f5d956bc0a74a1f130b15aa05c7..b828c7a75be20b76c87e8ab7152d4bd55ee83438
@@@ -620,7 -620,7 +620,7 @@@ struct netdev_queue 
        netdevice_tracker       dev_tracker;
  
        struct Qdisc __rcu      *qdisc;
 -      struct Qdisc            *qdisc_sleeping;
 +      struct Qdisc __rcu      *qdisc_sleeping;
  #ifdef CONFIG_SYSFS
        struct kobject          kobj;
  #endif
@@@ -768,11 -768,8 +768,11 @@@ static inline void rps_record_sock_flow
                /* We only give a hint, preemption can change CPU under us */
                val |= raw_smp_processor_id();
  
 -              if (table->ents[index] != val)
 -                      table->ents[index] = val;
 +              /* The following WRITE_ONCE() is paired with the READ_ONCE()
 +               * here, and another one in get_rps_cpu().
 +               */
 +              if (READ_ONCE(table->ents[index]) != val)
 +                      WRITE_ONCE(table->ents[index], val);
        }
  }
  
@@@ -3124,10 -3121,6 +3124,10 @@@ struct net_device *netdev_sk_get_lowest
                                            struct sock *sk);
  struct net_device *dev_get_by_index(struct net *net, int ifindex);
  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 +struct net_device *netdev_get_by_index(struct net *net, int ifindex,
 +                                     netdevice_tracker *tracker, gfp_t gfp);
 +struct net_device *netdev_get_by_name(struct net *net, const char *name,
 +                                    netdevice_tracker *tracker, gfp_t gfp);
  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
  struct net_device *dev_get_by_napi_id(unsigned int napi_id);
  int dev_restart(struct net_device *dev);
@@@ -4831,6 -4824,13 +4831,6 @@@ int skb_crc32c_csum_help(struct sk_buf
  int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features);
  
 -struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 -                                netdev_features_t features, bool tx_path);
 -struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
 -                                  netdev_features_t features, __be16 type);
 -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 -                                  netdev_features_t features);
 -
  struct netdev_bonding_info {
        ifslave slave;
        ifbond  master;
@@@ -4853,6 -4853,11 +4853,6 @@@ static inline void ethtool_notify(struc
  }
  #endif
  
 -static inline
 -struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features)
 -{
 -      return __skb_gso_segment(skb, features, true);
 -}
  __be16 skb_network_protocol(struct sk_buff *skb, int *depth);
  
  static inline bool can_checksum_protocol(netdev_features_t features,
@@@ -4979,7 -4984,6 +4979,7 @@@ netdev_features_t passthru_features_che
                                          struct net_device *dev,
                                          netdev_features_t features);
  netdev_features_t netif_skb_features(struct sk_buff *skb);
 +void skb_warn_bad_offload(const struct sk_buff *skb);
  
  static inline bool net_gso_ok(netdev_features_t features, int gso_type)
  {
@@@ -5028,6 -5032,19 +5028,6 @@@ void netif_set_tso_max_segs(struct net_
  void netif_inherit_tso_max(struct net_device *to,
                           const struct net_device *from);
  
 -static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
 -                                      int pulled_hlen, u16 mac_offset,
 -                                      int mac_len)
 -{
 -      skb->protocol = protocol;
 -      skb->encapsulation = 1;
 -      skb_push(skb, pulled_hlen);
 -      skb_reset_transport_header(skb);
 -      skb->mac_header = mac_offset;
 -      skb->network_header = skb->mac_header + mac_len;
 -      skb->mac_len = mac_len;
 -}
 -
  static inline bool netif_is_macsec(const struct net_device *dev)
  {
        return dev->priv_flags & IFF_MACSEC;
@@@ -5073,6 -5090,15 +5073,15 @@@ static inline bool netif_is_l3_slave(co
        return dev->priv_flags & IFF_L3MDEV_SLAVE;
  }
  
+ static inline int dev_sdif(const struct net_device *dev)
+ {
+ #ifdef CONFIG_NET_L3_MASTER_DEV
+       if (netif_is_l3_slave(dev))
+               return dev->ifindex;
+ #endif
+       return 0;
+ }
  static inline bool netif_is_bridge_master(const struct net_device *dev)
  {
        return dev->priv_flags & IFF_EBRIDGE;
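
A note on the dev_sdif() helper added above: it centralizes the "source device ifindex for an L3 slave" lookup used by the tc/XDP socket-lookup VRF fix (item 4 of the pull request). As a minimal, hedged sketch of the kind of program the fix affects, the tc classifier below calls bpf_sk_lookup_tcp(); with the fix, the lookup from a tc hook is scoped to the VRF the ingress device is enslaved to rather than matching sockets bound to other VRFs. The section name, addresses and ports are illustrative assumptions, not taken from the series' selftests.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical tc classifier: looks up a TCP socket for a hard-coded
 * 4-tuple; a real program would parse the tuple from the packet.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int vrf_scoped_lookup(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;

	tuple.ipv4.saddr = bpf_htonl(0x0a000001);	/* 10.0.0.1, illustrative */
	tuple.ipv4.daddr = bpf_htonl(0x0a000002);	/* 10.0.0.2, illustrative */
	tuple.ipv4.sport = bpf_htons(40000);
	tuple.ipv4.dport = bpf_htons(80);

	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (sk)
		bpf_sk_release(sk);	/* lookups take a reference; always release */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";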
diff --combined include/uapi/linux/bpf.h
index 6961a7b700281037cd6fbb8e05a63c21978ff5c5,a7b5e91dd768e7d5b716272488295f828aa9aa1c..60a9d59beeabba9bdbaa94946aa0c28e3f435463
@@@ -1035,7 -1035,6 +1035,7 @@@ enum bpf_attach_type 
        BPF_TRACE_KPROBE_MULTI,
        BPF_LSM_CGROUP,
        BPF_STRUCT_OPS,
 +      BPF_NETFILTER,
        __MAX_BPF_ATTACH_TYPE
  };
  
@@@ -3178,6 -3177,10 +3178,10 @@@ union bpf_attr 
   *            **BPF_FIB_LOOKUP_DIRECT**
   *                    Do a direct table lookup vs full lookup using FIB
   *                    rules.
+  *            **BPF_FIB_LOOKUP_TBID**
+  *                    Used with BPF_FIB_LOOKUP_DIRECT.
+  *                    Use the routing table ID present in *params*->tbid
+  *                    for the fib lookup.
   *            **BPF_FIB_LOOKUP_OUTPUT**
   *                    Perform lookup from an egress perspective (default is
   *                    ingress).
@@@ -6832,6 -6835,7 +6836,7 @@@ enum 
        BPF_FIB_LOOKUP_DIRECT  = (1U << 0),
        BPF_FIB_LOOKUP_OUTPUT  = (1U << 1),
        BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
+       BPF_FIB_LOOKUP_TBID    = (1U << 3),
  };
  
  enum {
@@@ -6892,9 -6896,19 +6897,19 @@@ struct bpf_fib_lookup 
                __u32           ipv6_dst[4];  /* in6_addr; network order */
        };
  
-       /* output */
-       __be16  h_vlan_proto;
-       __be16  h_vlan_TCI;
+       union {
+               struct {
+                       /* output */
+                       __be16  h_vlan_proto;
+                       __be16  h_vlan_TCI;
+               };
+               /* input: when accompanied with the
+                * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a
+                * specific routing table to use for the fib lookup.
+                */
+               __u32   tbid;
+       };
        __u8    smac[6];     /* ETH_ALEN */
        __u8    dmac[6];     /* ETH_ALEN */
  };
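
To show how the new BPF_FIB_LOOKUP_TBID flag and tbid field documented above are meant to be used together with BPF_FIB_LOOKUP_DIRECT, here is a minimal XDP sketch. The table number, addresses and section name are assumptions for illustration and do not come from the series' selftests.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define AF_INET 2	/* from <sys/socket.h>; redefined for a freestanding BPF object */

SEC("xdp")
int fib_lookup_tbid(struct xdp_md *ctx)
{
	struct bpf_fib_lookup params = {};
	long rc;

	params.family   = AF_INET;
	params.ifindex  = ctx->ingress_ifindex;
	params.ipv4_src = bpf_htonl(0xc0a80101);	/* 192.168.1.1, illustrative */
	params.ipv4_dst = bpf_htonl(0xc0a80201);	/* 192.168.2.1, illustrative */
	params.tbid     = 100;				/* consult routing table 100 */

	rc = bpf_fib_lookup(ctx, &params, sizeof(params),
			    BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID);
	if (rc == BPF_FIB_LKUP_RET_SUCCESS)
		/* a real forwarder would rewrite MACs from params.smac/dmac
		 * and XDP_TX or redirect; here we only log and pass
		 */
		bpf_printk("next hop dmac starts %02x:%02x",
			   params.dmac[0], params.dmac[1]);
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";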
diff --combined kernel/bpf/btf.c
index bbcae434fda540eb99aea7ce6099c62b75cfc87e,bd2cac057928bb848856d9e4f90339983cc18995..29fe2109929853dc5d8ecd02e2153c3e2d9b823b
@@@ -492,25 -492,26 +492,26 @@@ static bool btf_type_is_fwd(const struc
        return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
  }
  
- static bool btf_type_nosize(const struct btf_type *t)
+ static bool btf_type_is_datasec(const struct btf_type *t)
  {
-       return btf_type_is_void(t) || btf_type_is_fwd(t) ||
-              btf_type_is_func(t) || btf_type_is_func_proto(t);
+       return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
  }
  
- static bool btf_type_nosize_or_null(const struct btf_type *t)
+ static bool btf_type_is_decl_tag(const struct btf_type *t)
  {
-       return !t || btf_type_nosize(t);
+       return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
  }
  
- static bool btf_type_is_datasec(const struct btf_type *t)
+ static bool btf_type_nosize(const struct btf_type *t)
  {
-       return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+       return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+              btf_type_is_func(t) || btf_type_is_func_proto(t) ||
+              btf_type_is_decl_tag(t);
  }
  
- static bool btf_type_is_decl_tag(const struct btf_type *t)
+ static bool btf_type_nosize_or_null(const struct btf_type *t)
  {
-       return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
+       return !t || btf_type_nosize(t);
  }
  
  static bool btf_type_is_decl_tag_target(const struct btf_type *t)
@@@ -751,12 -752,13 +752,12 @@@ static bool btf_name_offset_valid(cons
        return offset < btf->hdr.str_len;
  }
  
 -static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
 +static bool __btf_name_char_ok(char c, bool first)
  {
        if ((first ? !isalpha(c) :
                     !isalnum(c)) &&
            c != '_' &&
 -          ((c == '.' && !dot_ok) ||
 -            c != '.'))
 +          c != '.')
                return false;
        return true;
  }
@@@ -773,20 -775,20 +774,20 @@@ static const char *btf_str_by_offset(co
        return NULL;
  }
  
 -static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
 +static bool __btf_name_valid(const struct btf *btf, u32 offset)
  {
        /* offset must be valid */
        const char *src = btf_str_by_offset(btf, offset);
        const char *src_limit;
  
 -      if (!__btf_name_char_ok(*src, true, dot_ok))
 +      if (!__btf_name_char_ok(*src, true))
                return false;
  
        /* set a limit on identifier length */
        src_limit = src + KSYM_NAME_LEN;
        src++;
        while (*src && src < src_limit) {
 -              if (!__btf_name_char_ok(*src, false, dot_ok))
 +              if (!__btf_name_char_ok(*src, false))
                        return false;
                src++;
        }
        return !*src;
  }
  
 -/* Only C-style identifier is permitted. This can be relaxed if
 - * necessary.
 - */
  static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
  {
 -      return __btf_name_valid(btf, offset, false);
 +      return __btf_name_valid(btf, offset);
  }
  
  static bool btf_name_valid_section(const struct btf *btf, u32 offset)
  {
 -      return __btf_name_valid(btf, offset, true);
 +      return __btf_name_valid(btf, offset);
  }
  
  static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
@@@ -4425,7 -4430,7 +4426,7 @@@ static s32 btf_var_check_meta(struct bt
        }
  
        if (!t->name_off ||
 -          !__btf_name_valid(env->btf, t->name_off, true)) {
 +          !__btf_name_valid(env->btf, t->name_off)) {
                btf_verifier_log_type(env, t, "Invalid name");
                return -EINVAL;
        }
diff --combined kernel/bpf/syscall.c
index 4497b193dd200932d8f2eb2071473b05398b03d1,a75c54b6f8a33ace98b4d4079441f04e6873aa93..a2aef900519c23db385c44db0ac596fc85ac64a7
@@@ -109,37 -109,6 +109,6 @@@ const struct bpf_map_ops bpf_map_offloa
        .map_mem_usage = bpf_map_offload_map_mem_usage,
  };
  
- static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
- {
-       const struct bpf_map_ops *ops;
-       u32 type = attr->map_type;
-       struct bpf_map *map;
-       int err;
-       if (type >= ARRAY_SIZE(bpf_map_types))
-               return ERR_PTR(-EINVAL);
-       type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
-       ops = bpf_map_types[type];
-       if (!ops)
-               return ERR_PTR(-EINVAL);
-       if (ops->map_alloc_check) {
-               err = ops->map_alloc_check(attr);
-               if (err)
-                       return ERR_PTR(err);
-       }
-       if (attr->map_ifindex)
-               ops = &bpf_map_offload_ops;
-       if (!ops->map_mem_usage)
-               return ERR_PTR(-EINVAL);
-       map = ops->map_alloc(attr);
-       if (IS_ERR(map))
-               return map;
-       map->ops = ops;
-       map->map_type = type;
-       return map;
- }
  static void bpf_map_write_active_inc(struct bpf_map *map)
  {
        atomic64_inc(&map->writecnt);
@@@ -1127,7 -1096,9 +1096,9 @@@ free_map_tab
  /* called via syscall */
  static int map_create(union bpf_attr *attr)
  {
+       const struct bpf_map_ops *ops;
        int numa_node = bpf_map_attr_numa_node(attr);
+       u32 map_type = attr->map_type;
        struct bpf_map *map;
        int f_flags;
        int err;
                return -EINVAL;
  
        /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
-       map = find_and_alloc_map(attr);
+       map_type = attr->map_type;
+       if (map_type >= ARRAY_SIZE(bpf_map_types))
+               return -EINVAL;
+       map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
+       ops = bpf_map_types[map_type];
+       if (!ops)
+               return -EINVAL;
+       if (ops->map_alloc_check) {
+               err = ops->map_alloc_check(attr);
+               if (err)
+                       return err;
+       }
+       if (attr->map_ifindex)
+               ops = &bpf_map_offload_ops;
+       if (!ops->map_mem_usage)
+               return -EINVAL;
+       /* Intent here is for unprivileged_bpf_disabled to block BPF map
+        * creation for unprivileged users; other actions depend
+        * on fd availability and access to bpffs, so are dependent on
+        * object creation success. Even with unprivileged BPF disabled,
+        * capability checks are still carried out.
+        */
+       if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+               return -EPERM;
+       /* check privileged map type permissions */
+       switch (map_type) {
+       case BPF_MAP_TYPE_ARRAY:
+       case BPF_MAP_TYPE_PERCPU_ARRAY:
+       case BPF_MAP_TYPE_PROG_ARRAY:
+       case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+       case BPF_MAP_TYPE_CGROUP_ARRAY:
+       case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+       case BPF_MAP_TYPE_HASH:
+       case BPF_MAP_TYPE_PERCPU_HASH:
+       case BPF_MAP_TYPE_HASH_OF_MAPS:
+       case BPF_MAP_TYPE_RINGBUF:
+       case BPF_MAP_TYPE_USER_RINGBUF:
+       case BPF_MAP_TYPE_CGROUP_STORAGE:
+       case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+               /* unprivileged */
+               break;
+       case BPF_MAP_TYPE_SK_STORAGE:
+       case BPF_MAP_TYPE_INODE_STORAGE:
+       case BPF_MAP_TYPE_TASK_STORAGE:
+       case BPF_MAP_TYPE_CGRP_STORAGE:
+       case BPF_MAP_TYPE_BLOOM_FILTER:
+       case BPF_MAP_TYPE_LPM_TRIE:
+       case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+       case BPF_MAP_TYPE_STACK_TRACE:
+       case BPF_MAP_TYPE_QUEUE:
+       case BPF_MAP_TYPE_STACK:
+       case BPF_MAP_TYPE_LRU_HASH:
+       case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+       case BPF_MAP_TYPE_STRUCT_OPS:
+       case BPF_MAP_TYPE_CPUMAP:
+               if (!bpf_capable())
+                       return -EPERM;
+               break;
+       case BPF_MAP_TYPE_SOCKMAP:
+       case BPF_MAP_TYPE_SOCKHASH:
+       case BPF_MAP_TYPE_DEVMAP:
+       case BPF_MAP_TYPE_DEVMAP_HASH:
+       case BPF_MAP_TYPE_XSKMAP:
+               if (!capable(CAP_NET_ADMIN))
+                       return -EPERM;
+               break;
+       default:
+               WARN(1, "unsupported map type %d", map_type);
+               return -EPERM;
+       }
+       map = ops->map_alloc(attr);
        if (IS_ERR(map))
                return PTR_ERR(map);
+       map->ops = ops;
+       map->map_type = map_type;
  
        err = bpf_obj_name_cpy(map->name, attr->map_name,
                               sizeof(attr->map_name));
@@@ -2434,10 -2481,6 +2481,10 @@@ bpf_prog_load_check_attach(enum bpf_pro
                default:
                        return -EINVAL;
                }
 +      case BPF_PROG_TYPE_NETFILTER:
 +              if (expected_attach_type == BPF_NETFILTER)
 +                      return 0;
 +              return -EINVAL;
        case BPF_PROG_TYPE_SYSCALL:
        case BPF_PROG_TYPE_EXT:
                if (expected_attach_type)
@@@ -2507,7 -2550,6 +2554,6 @@@ static int bpf_prog_load(union bpf_att
        struct btf *attach_btf = NULL;
        int err;
        char license[128];
-       bool is_gpl;
  
        if (CHECK_ATTR(BPF_PROG_LOAD))
                return -EINVAL;
            !bpf_capable())
                return -EPERM;
  
-       /* copy eBPF program license from user space */
-       if (strncpy_from_bpfptr(license,
-                               make_bpfptr(attr->license, uattr.is_kernel),
-                               sizeof(license) - 1) < 0)
-               return -EFAULT;
-       license[sizeof(license) - 1] = 0;
-       /* eBPF programs must be GPL compatible to use GPL-ed functions */
-       is_gpl = license_is_gpl_compatible(license);
+       /* Intent here is for unprivileged_bpf_disabled to block BPF program
+        * creation for unprivileged users; other actions depend
+        * on fd availability and access to bpffs, so are dependent on
+        * object creation success. Even with unprivileged BPF disabled,
+        * capability checks are still carried out for these
+        * and other operations.
+        */
+       if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+               return -EPERM;
  
        if (attr->insn_cnt == 0 ||
            attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
                             make_bpfptr(attr->insns, uattr.is_kernel),
                             bpf_prog_insn_size(prog)) != 0)
                goto free_prog_sec;
+       /* copy eBPF program license from user space */
+       if (strncpy_from_bpfptr(license,
+                               make_bpfptr(attr->license, uattr.is_kernel),
+                               sizeof(license) - 1) < 0)
+               goto free_prog_sec;
+       license[sizeof(license) - 1] = 0;
+       /* eBPF programs must be GPL compatible to use GPL-ed functions */
+       prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
  
        prog->orig_prog = NULL;
        prog->jited = 0;
  
        atomic64_set(&prog->aux->refcnt, 1);
-       prog->gpl_compatible = is_gpl ? 1 : 0;
  
        if (bpf_prog_is_dev_bound(prog->aux)) {
                err = bpf_prog_dev_bound_init(prog, attr);
@@@ -2797,28 -2847,31 +2851,31 @@@ static void bpf_link_put_deferred(struc
        bpf_link_free(link);
  }
  
- /* bpf_link_put can be called from atomic context, but ensures that resources
-  * are freed from process context
+ /* bpf_link_put might be called from atomic context. It needs to be called
+  * from sleepable context in order to acquire sleeping locks during the process.
   */
  void bpf_link_put(struct bpf_link *link)
  {
        if (!atomic64_dec_and_test(&link->refcnt))
                return;
  
-       if (in_atomic()) {
-               INIT_WORK(&link->work, bpf_link_put_deferred);
-               schedule_work(&link->work);
-       } else {
-               bpf_link_free(link);
-       }
+       INIT_WORK(&link->work, bpf_link_put_deferred);
+       schedule_work(&link->work);
  }
  EXPORT_SYMBOL(bpf_link_put);
  
+ static void bpf_link_put_direct(struct bpf_link *link)
+ {
+       if (!atomic64_dec_and_test(&link->refcnt))
+               return;
+       bpf_link_free(link);
+ }
  static int bpf_link_release(struct inode *inode, struct file *filp)
  {
        struct bpf_link *link = filp->private_data;
  
-       bpf_link_put(link);
+       bpf_link_put_direct(link);
        return 0;
  }
  
@@@ -3463,11 -3516,6 +3520,11 @@@ static int bpf_prog_attach_check_attach
                return prog->enforce_expected_attach_type &&
                        prog->expected_attach_type != attach_type ?
                        -EINVAL : 0;
 +      case BPF_PROG_TYPE_KPROBE:
 +              if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
 +                  attach_type != BPF_TRACE_KPROBE_MULTI)
 +                      return -EINVAL;
 +              return 0;
        default:
                return 0;
        }
@@@ -4622,12 -4670,7 +4679,12 @@@ static int link_create(union bpf_attr *
  
        switch (prog->type) {
        case BPF_PROG_TYPE_EXT:
 +              break;
        case BPF_PROG_TYPE_NETFILTER:
 +              if (attr->link_create.attach_type != BPF_NETFILTER) {
 +                      ret = -EINVAL;
 +                      goto out;
 +              }
                break;
        case BPF_PROG_TYPE_PERF_EVENT:
        case BPF_PROG_TYPE_TRACEPOINT:
@@@ -4801,7 -4844,7 +4858,7 @@@ out_put_progs
        if (ret)
                bpf_prog_put(new_prog);
  out_put_link:
-       bpf_link_put(link);
+       bpf_link_put_direct(link);
        return ret;
  }
  
@@@ -4824,7 -4867,7 +4881,7 @@@ static int link_detach(union bpf_attr *
        else
                ret = -EOPNOTSUPP;
  
-       bpf_link_put(link);
+       bpf_link_put_direct(link);
        return ret;
  }
  
@@@ -4894,7 -4937,7 +4951,7 @@@ static int bpf_link_get_fd_by_id(const 
  
        fd = bpf_link_new_fd(link);
        if (fd < 0)
-               bpf_link_put(link);
+               bpf_link_put_direct(link);
  
        return fd;
  }
@@@ -4971,7 -5014,7 +5028,7 @@@ static int bpf_iter_create(union bpf_at
                return PTR_ERR(link);
  
        err = bpf_iter_new_fd(link);
-       bpf_link_put(link);
+       bpf_link_put_direct(link);
  
        return err;
  }
@@@ -5041,23 -5084,8 +5098,8 @@@ out_prog_put
  static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
  {
        union bpf_attr attr;
-       bool capable;
        int err;
  
-       capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
-       /* Intent here is for unprivileged_bpf_disabled to block key object
-        * creation commands for unprivileged users; other actions depend
-        * of fd availability and access to bpffs, so are dependent on
-        * object creation success.  Capabilities are later verified for
-        * operations such as load and map create, so even with unprivileged
-        * BPF disabled, capability checks are still carried out for these
-        * and other operations.
-        */
-       if (!capable &&
-           (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
-               return -EPERM;
        err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
        if (err)
                return err;
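
The map_create()/bpf_prog_load() permission refactor above can be observed from user space. Below is a minimal sketch, assuming libbpf >= 0.7 and kernel.unprivileged_bpf_disabled=0; run unprivileged, the ARRAY map (in the "unprivileged" group of the switch above) should be created, while LRU_HASH (in the bpf_capable() group) should fail with EPERM. The file name and map names are illustrative.

/* build: cc -o map_perm_demo map_perm_demo.c -lbpf */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <bpf/bpf.h>

static void try_create(enum bpf_map_type type, const char *name)
{
	int fd = bpf_map_create(type, name, sizeof(int), sizeof(long), 16, NULL);

	printf("%-10s: %s\n", name, fd >= 0 ? "created" : strerror(errno));
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	try_create(BPF_MAP_TYPE_ARRAY, "demo_array");	/* "unprivileged" map type */
	try_create(BPF_MAP_TYPE_LRU_HASH, "demo_lru");	/* requires bpf_capable() */
	return 0;
}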
diff --combined kernel/bpf/verifier.c
index b54193de762ba6167619326ff2f2b52f699790f4,fa43dc8e85b99cfc13afc55291233f3f48074022..11e54dd8b6ddcc2afc9d54824e0832c364e557e0
@@@ -197,6 -197,7 +197,7 @@@ static int ref_set_non_owning(struct bp
                              struct bpf_reg_state *reg);
  static void specialize_kfunc(struct bpf_verifier_env *env,
                             u32 func_id, u16 offset, unsigned long *addr);
+ static bool is_trusted_reg(const struct bpf_reg_state *reg);
  
  static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
  {
@@@ -298,16 -299,19 +299,19 @@@ struct bpf_kfunc_call_arg_meta 
                bool found;
        } arg_constant;
  
-       /* arg_btf and arg_btf_id are used by kfunc-specific handling,
+       /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
         * generally to pass info about user-defined local kptr types to later
         * verification logic
         *   bpf_obj_drop
         *     Record the local kptr type to be drop'd
         *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
-        *     Record the local kptr type to be refcount_incr'd
+        *     Record the local kptr type to be refcount_incr'd and use
+        *     arg_owning_ref to determine whether refcount_acquire should be
+        *     fallible
         */
        struct btf *arg_btf;
        u32 arg_btf_id;
+       bool arg_owning_ref;
  
        struct {
                struct btf_field *field;
@@@ -439,8 -443,11 +443,11 @@@ static bool type_may_be_null(u32 type
        return type & PTR_MAYBE_NULL;
  }
  
- static bool reg_type_not_null(enum bpf_reg_type type)
+ static bool reg_not_null(const struct bpf_reg_state *reg)
  {
+       enum bpf_reg_type type;
+       type = reg->type;
        if (type_may_be_null(type))
                return false;
  
                type == PTR_TO_MAP_VALUE ||
                type == PTR_TO_MAP_KEY ||
                type == PTR_TO_SOCK_COMMON ||
+               (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
                type == PTR_TO_MEM;
  }
  
@@@ -3771,6 -3779,96 +3779,96 @@@ static void mark_all_scalars_imprecise(
        }
  }
  
+ static bool idset_contains(struct bpf_idset *s, u32 id)
+ {
+       u32 i;
+       for (i = 0; i < s->count; ++i)
+               if (s->ids[i] == id)
+                       return true;
+       return false;
+ }
+ static int idset_push(struct bpf_idset *s, u32 id)
+ {
+       if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
+               return -EFAULT;
+       s->ids[s->count++] = id;
+       return 0;
+ }
+ static void idset_reset(struct bpf_idset *s)
+ {
+       s->count = 0;
+ }
+ /* Collect a set of IDs for all registers currently marked as precise in env->bt.
+  * Mark all registers with these IDs as precise.
+  */
+ static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+ {
+       struct bpf_idset *precise_ids = &env->idset_scratch;
+       struct backtrack_state *bt = &env->bt;
+       struct bpf_func_state *func;
+       struct bpf_reg_state *reg;
+       DECLARE_BITMAP(mask, 64);
+       int i, fr;
+       idset_reset(precise_ids);
+       for (fr = bt->frame; fr >= 0; fr--) {
+               func = st->frame[fr];
+               bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+               for_each_set_bit(i, mask, 32) {
+                       reg = &func->regs[i];
+                       if (!reg->id || reg->type != SCALAR_VALUE)
+                               continue;
+                       if (idset_push(precise_ids, reg->id))
+                               return -EFAULT;
+               }
+               bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+               for_each_set_bit(i, mask, 64) {
+                       if (i >= func->allocated_stack / BPF_REG_SIZE)
+                               break;
+                       if (!is_spilled_scalar_reg(&func->stack[i]))
+                               continue;
+                       reg = &func->stack[i].spilled_ptr;
+                       if (!reg->id)
+                               continue;
+                       if (idset_push(precise_ids, reg->id))
+                               return -EFAULT;
+               }
+       }
+       for (fr = 0; fr <= st->curframe; ++fr) {
+               func = st->frame[fr];
+               for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
+                       reg = &func->regs[i];
+                       if (!reg->id)
+                               continue;
+                       if (!idset_contains(precise_ids, reg->id))
+                               continue;
+                       bt_set_frame_reg(bt, fr, i);
+               }
+               for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
+                       if (!is_spilled_scalar_reg(&func->stack[i]))
+                               continue;
+                       reg = &func->stack[i].spilled_ptr;
+                       if (!reg->id)
+                               continue;
+                       if (!idset_contains(precise_ids, reg->id))
+                               continue;
+                       bt_set_frame_slot(bt, fr, i);
+               }
+       }
+       return 0;
+ }
  /*
   * __mark_chain_precision() backtracks BPF program instruction sequence and
   * chain of verifier states making sure that register *regno* (if regno >= 0)
@@@ -3902,6 -4000,31 +4000,31 @@@ static int __mark_chain_precision(struc
                                bt->frame, last_idx, first_idx, subseq_idx);
                }
  
+               /* If some register with scalar ID is marked as precise,
+                * make sure that all registers sharing this ID are also precise.
+                * This is needed to estimate effect of find_equal_scalars().
+                * Do this at the last instruction of each state,
+                * bpf_reg_state::id fields are valid for these instructions.
+                *
+                * Allows to track precision in situation like below:
+                *
+                *     r2 = unknown value
+                *     ...
+                *   --- state #0 ---
+                *     ...
+                *     r1 = r2                 // r1 and r2 now share the same ID
+                *     ...
+                *   --- state #1 {r1.id = A, r2.id = A} ---
+                *     ...
+                *     if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
+                *     ...
+                *   --- state #2 {r1.id = A, r2.id = A} ---
+                *     r3 = r10
+                *     r3 += r1                // need to mark both r1 and r2
+                */
+               if (mark_precise_scalar_ids(env, st))
+                       return -EFAULT;
                if (last_idx < 0) {
                        /* we are at the entry into subprog, which
                         * is expected for global funcs, but only if
@@@ -4222,9 -4345,6 +4345,9 @@@ static int check_stack_write_fixed_off(
                                return err;
                }
                save_register_state(state, spi, reg, size);
 +              /* Break the relation on a narrowing spill. */
 +              if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
 +                      state->stack[spi].spilled_ptr.id = 0;
        } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
                   insn->imm != 0 && env->bpf_capable) {
                struct bpf_reg_state fake_reg = {};
@@@ -5894,7 -6014,7 +6017,7 @@@ static int check_ptr_to_btf_access(stru
                 * program allocated objects (which always have ref_obj_id > 0),
                 * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
                 */
-               if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+               if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
                        verbose(env, "only read is supported\n");
                        return -EACCES;
                }
@@@ -7514,7 -7634,7 +7637,7 @@@ static int check_reg_type(struct bpf_ve
        if (base_type(arg_type) == ARG_PTR_TO_MEM)
                type &= ~DYNPTR_TYPE_FLAG_MASK;
  
-       if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
+       if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type))
                type &= ~MEM_ALLOC;
  
        for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
@@@ -9681,11 -9801,6 +9804,6 @@@ static bool is_kfunc_acquire(struct bpf
        return meta->kfunc_flags & KF_ACQUIRE;
  }
  
- static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
- {
-       return meta->kfunc_flags & KF_RET_NULL;
- }
  static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
  {
        return meta->kfunc_flags & KF_RELEASE;
@@@ -10001,6 -10116,16 +10119,16 @@@ BTF_ID(func, bpf_dynptr_slice
  BTF_ID(func, bpf_dynptr_slice_rdwr)
  BTF_ID(func, bpf_dynptr_clone)
  
+ static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+ {
+       if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+           meta->arg_owning_ref) {
+               return false;
+       }
+       return meta->kfunc_flags & KF_RET_NULL;
+ }
  static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
  {
        return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
@@@ -10478,6 -10603,8 +10606,8 @@@ __process_kf_arg_ptr_to_graph_node(stru
                        node_off, btf_name_by_offset(reg->btf, t->name_off));
                return -EINVAL;
        }
+       meta->arg_btf = reg->btf;
+       meta->arg_btf_id = reg->btf_id;
  
        if (node_off != field->graph_root.node_offset) {
                verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
@@@ -10881,10 -11008,12 +11011,12 @@@ static int check_kfunc_args(struct bpf_
                        meta->subprogno = reg->subprogno;
                        break;
                case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
-                       if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) {
+                       if (!type_is_ptr_alloc_obj(reg->type)) {
                                verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
                                return -EINVAL;
                        }
+                       if (!type_is_non_owning_ref(reg->type))
+                               meta->arg_owning_ref = true;
  
                        rec = reg_btf_record(reg);
                        if (!rec) {
@@@ -11047,6 -11176,7 +11179,7 @@@ static int check_kfunc_call(struct bpf_
            meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
                release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
                insn_aux->insert_off = regs[BPF_REG_2].off;
+               insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
                err = ref_convert_owning_non_owning(env, release_ref_obj_id);
                if (err) {
                        verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
@@@ -12804,12 -12934,14 +12937,14 @@@ static int check_alu_op(struct bpf_veri
                if (BPF_SRC(insn->code) == BPF_X) {
                        struct bpf_reg_state *src_reg = regs + insn->src_reg;
                        struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+                       bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id &&
+                                      !tnum_is_const(src_reg->var_off);
  
                        if (BPF_CLASS(insn->code) == BPF_ALU64) {
                                /* case: R1 = R2
                                 * copy register state to dest reg
                                 */
-                               if (src_reg->type == SCALAR_VALUE && !src_reg->id)
+                               if (need_id)
                                        /* Assign src and dst registers the same ID
                                         * that will be used by find_equal_scalars()
                                         * to propagate min/max range.
                                } else if (src_reg->type == SCALAR_VALUE) {
                                        bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
  
-                                       if (is_src_reg_u32 && !src_reg->id)
+                                       if (is_src_reg_u32 && need_id)
                                                src_reg->id = ++env->id_gen;
                                        copy_register_state(dst_reg, src_reg);
                                        /* Make sure ID is cleared if src_reg is not in u32 range otherwise
@@@ -13160,7 -13292,7 +13295,7 @@@ static int is_branch_taken(struct bpf_r
                           bool is_jmp32)
  {
        if (__is_pointer_value(false, reg)) {
-               if (!reg_type_not_null(reg->type))
+               if (!reg_not_null(reg))
                        return -1;
  
                /* If pointer is valid tests against zero will fail so we can
@@@ -14984,8 -15116,9 +15119,9 @@@ static bool range_within(struct bpf_reg
   * So we look through our idmap to see if this old id has been seen before.  If
   * so, we require the new id to match; otherwise, we add the id pair to the map.
   */
- static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
+ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
  {
+       struct bpf_id_pair *map = idmap->map;
        unsigned int i;
  
        /* either both IDs should be set or both should be zero */
                return true;
  
        for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
-               if (!idmap[i].old) {
+               if (!map[i].old) {
                        /* Reached an empty slot; haven't seen this id before */
-                       idmap[i].old = old_id;
-                       idmap[i].cur = cur_id;
+                       map[i].old = old_id;
+                       map[i].cur = cur_id;
                        return true;
                }
-               if (idmap[i].old == old_id)
-                       return idmap[i].cur == cur_id;
+               if (map[i].old == old_id)
+                       return map[i].cur == cur_id;
+               if (map[i].cur == cur_id)
+                       return false;
        }
        /* We ran out of idmap slots, which should be impossible */
        WARN_ON_ONCE(1);
        return false;
  }
  
+ /* Similar to check_ids(), but allocate a unique temporary ID
+  * for 'old_id' or 'cur_id' of zero.
+  * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+  */
+ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+ {
+       old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+       cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
+       return check_ids(old_id, cur_id, idmap);
+ }
  static void clean_func_state(struct bpf_verifier_env *env,
                             struct bpf_func_state *st)
  {
@@@ -15108,7 -15255,7 +15258,7 @@@ next
  
  static bool regs_exact(const struct bpf_reg_state *rold,
                       const struct bpf_reg_state *rcur,
-                      struct bpf_id_pair *idmap)
+                      struct bpf_idmap *idmap)
  {
        return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
               check_ids(rold->id, rcur->id, idmap) &&
  
  /* Returns true if (rold safe implies rcur safe) */
  static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
-                   struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
+                   struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
  {
        if (!(rold->live & REG_LIVE_READ))
                /* explored state didn't use this */
  
        switch (base_type(rold->type)) {
        case SCALAR_VALUE:
-               if (regs_exact(rold, rcur, idmap))
-                       return true;
-               if (env->explore_alu_limits)
-                       return false;
+               if (env->explore_alu_limits) {
+                       /* explore_alu_limits disables tnum_in() and range_within()
+                        * logic and requires everything to be strict
+                        */
+                       return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+                              check_scalar_ids(rold->id, rcur->id, idmap);
+               }
                if (!rold->precise)
                        return true;
-               /* new val must satisfy old val knowledge */
+               /* Why check_ids() for scalar registers?
+                *
+                * Consider the following BPF code:
+                *   1: r6 = ... unbound scalar, ID=a ...
+                *   2: r7 = ... unbound scalar, ID=b ...
+                *   3: if (r6 > r7) goto +1
+                *   4: r6 = r7
+                *   5: if (r6 > X) goto ...
+                *   6: ... memory operation using r7 ...
+                *
+                * First verification path is [1-6]:
+                * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
+                * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
+                *   r7 <= X, because r6 and r7 share same id.
+                * Next verification path is [1-4, 6].
+                *
+                * Instruction (6) would be reached in two states:
+                *   I.  r6{.id=b}, r7{.id=b} via path 1-6;
+                *   II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
+                *
+                * Use check_ids() to distinguish these states.
+                * ---
+                * Also verify that new value satisfies old value range knowledge.
+                */
                return range_within(rold, rcur) &&
-                      tnum_in(rold->var_off, rcur->var_off);
+                      tnum_in(rold->var_off, rcur->var_off) &&
+                      check_scalar_ids(rold->id, rcur->id, idmap);
        case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
        case PTR_TO_MEM:
  }
  
  static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
-                     struct bpf_func_state *cur, struct bpf_id_pair *idmap)
+                     struct bpf_func_state *cur, struct bpf_idmap *idmap)
  {
        int i, spi;
  
  }
  
  static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
-                   struct bpf_id_pair *idmap)
+                   struct bpf_idmap *idmap)
  {
        int i;
  
@@@ -15359,13 -15533,13 +15536,13 @@@ static bool func_states_equal(struct bp
  
        for (i = 0; i < MAX_BPF_REG; i++)
                if (!regsafe(env, &old->regs[i], &cur->regs[i],
-                            env->idmap_scratch))
+                            &env->idmap_scratch))
                        return false;
  
-       if (!stacksafe(env, old, cur, env->idmap_scratch))
+       if (!stacksafe(env, old, cur, &env->idmap_scratch))
                return false;
  
-       if (!refsafe(old, cur, env->idmap_scratch))
+       if (!refsafe(old, cur, &env->idmap_scratch))
                return false;
  
        return true;
@@@ -15380,7 -15554,8 +15557,8 @@@ static bool states_equal(struct bpf_ver
        if (old->curframe != cur->curframe)
                return false;
  
-       memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+       env->idmap_scratch.tmp_id_gen = env->id_gen;
+       memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
  
        /* Verification state from speculative execution simulation
         * must never prune a non-speculative execution one.
                return false;
  
        if (old->active_lock.id &&
-           !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
+           !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
                return false;
  
        if (old->active_rcu_lock != cur->active_rcu_lock)
@@@ -17616,10 -17791,9 +17794,10 @@@ static int jit_subprogs(struct bpf_veri
        }
  
        /* finally lock prog and jit images for all functions and
 -       * populate kallsysm
 +       * populate kallsysm. Begin at the first subprogram, since
 +       * bpf_prog_load will add the kallsyms for the main program.
         */
 -      for (i = 0; i < env->subprog_cnt; i++) {
 +      for (i = 1; i < env->subprog_cnt; i++) {
                bpf_prog_lock_ro(func[i]);
                bpf_prog_kallsyms_add(func[i]);
        }
        prog->jited = 1;
        prog->bpf_func = func[0]->bpf_func;
        prog->jited_len = func[0]->jited_len;
 +      prog->aux->extable = func[0]->aux->extable;
 +      prog->aux->num_exentries = func[0]->aux->num_exentries;
        prog->aux->func = func;
        prog->aux->func_cnt = env->subprog_cnt;
        bpf_prog_jit_attempt_done(prog);
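
As a small C-level illustration of the scalar-ID tracking that the regsafe()/check_scalar_ids() and precision changes above reason about, consider the sketch below; the names and section type are assumptions. Depending on register allocation clang may keep x and y in one register, but when it emits a register-to-register move the verifier assigns both the same scalar ID, and the bounds check on y then also bounds x via find_equal_scalars(), which is what lets the array store verify.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

__u32 vals[16];		/* global .bss array, accessed with a variable index */

SEC("socket")
int scalar_id_demo(struct __sk_buff *skb)
{
	__u64 x = bpf_get_prandom_u32();	/* unbounded scalar */
	__u64 y = x;				/* may share x's scalar ID */

	if (y < 16)		/* range on y propagates to x (shared ID) */
		vals[x] = 1;
	return 0;
}

char _license[] SEC("license") = "GPL";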
diff --combined tools/include/uapi/linux/bpf.h
index 6961a7b700281037cd6fbb8e05a63c21978ff5c5,a7b5e91dd768e7d5b716272488295f828aa9aa1c..60a9d59beeabba9bdbaa94946aa0c28e3f435463
@@@ -1035,7 -1035,6 +1035,7 @@@ enum bpf_attach_type 
        BPF_TRACE_KPROBE_MULTI,
        BPF_LSM_CGROUP,
        BPF_STRUCT_OPS,
 +      BPF_NETFILTER,
        __MAX_BPF_ATTACH_TYPE
  };
  
@@@ -3178,6 -3177,10 +3178,10 @@@ union bpf_attr 
   *            **BPF_FIB_LOOKUP_DIRECT**
   *                    Do a direct table lookup vs full lookup using FIB
   *                    rules.
+  *            **BPF_FIB_LOOKUP_TBID**
+  *                    Used with BPF_FIB_LOOKUP_DIRECT.
+  *                    Use the routing table ID present in *params*->tbid
+  *                    for the fib lookup.
   *            **BPF_FIB_LOOKUP_OUTPUT**
   *                    Perform lookup from an egress perspective (default is
   *                    ingress).
@@@ -6832,6 -6835,7 +6836,7 @@@ enum 
        BPF_FIB_LOOKUP_DIRECT  = (1U << 0),
        BPF_FIB_LOOKUP_OUTPUT  = (1U << 1),
        BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
+       BPF_FIB_LOOKUP_TBID    = (1U << 3),
  };
  
  enum {
@@@ -6892,9 -6896,19 +6897,19 @@@ struct bpf_fib_lookup 
                __u32           ipv6_dst[4];  /* in6_addr; network order */
        };
  
-       /* output */
-       __be16  h_vlan_proto;
-       __be16  h_vlan_TCI;
+       union {
+               struct {
+                       /* output */
+                       __be16  h_vlan_proto;
+                       __be16  h_vlan_TCI;
+               };
+               /* input: when accompanied with the
+                * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a
+                * specific routing table to use for the fib lookup.
+                */
+               __u32   tbid;
+       };
        __u8    smac[6];     /* ETH_ALEN */
        __u8    dmac[6];     /* ETH_ALEN */
  };