1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 /* Copyright (C) 2017-2018 Netronome Systems, Inc. */
19 #include <sys/mount.h>
20 #include <sys/resource.h>
24 #include <linux/filter.h>
25 #include <linux/limits.h>
26 #include <linux/magic.h>
27 #include <linux/unistd.h>
30 #include <bpf/hashmap.h>
31 #include <bpf/libbpf.h> /* libbpf_num_possible_cpus */
37 #define BPF_FS_MAGIC 0xcafe4a11
/* Report an error described by the printf-style format @fmt.
 * Two renderings are visible below: a JSON object holding a single "error"
 * member written through json_wtr, and an "Error: "-prefixed line on stderr.
 * NOTE(review): the test that selects between the two is not visible here -
 * presumably the JSON output mode; confirm.
 */
40 void p_err(const char *fmt, ...)
/* JSON rendering: an object whose "error" member carries the message. */
46 jsonw_start_object(json_wtr);
47 jsonw_name(json_wtr, "error");
48 jsonw_vprintf_enquote(json_wtr, fmt, ap);
49 jsonw_end_object(json_wtr);
/* Plain-text rendering on stderr, terminated by a newline. */
51 fprintf(stderr, "Error: ");
52 vfprintf(stderr, fmt, ap);
53 fprintf(stderr, "\n");
/* Print an informational message built from the printf-style format @fmt to
 * stderr, followed by a newline.
 * NOTE(review): any condition under which the message is suppressed is not
 * visible here.
 */
58 void p_info(const char *fmt, ...)
66 vfprintf(stderr, fmt, ap);
67 fprintf(stderr, "\n");
/* Tell whether @path lives on a BPF file system, by comparing the file system
 * type reported by statfs() with BPF_FS_MAGIC.
 */
71 static bool is_bpffs(const char *path)
75 if (statfs(path, &st_fs) < 0)
/* f_type is compared as unsigned long - presumably so that a signed f_type
 * cannot be sign-extended past the magic value; confirm.
 */
78 return (unsigned long)st_fs.f_type == BPF_FS_MAGIC;
81 /* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to
82 * memcg-based memory accounting for BPF maps and programs. This was done in
83 * commit 97306be45fbe ("Merge branch 'switch to memcg-based memory
84 * accounting'"), in Linux 5.11.
86 * Libbpf also offers to probe for memcg-based accounting vs rlimit, but does
87 * so by checking for the availability of a given BPF helper and this has
88 * failed on some kernels with backports in the past, see commit 6b4384ff1088
89 * ("Revert "bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK"").
90 * Instead, we can probe by lowering the process-based rlimit to 0, trying to
91 * load a BPF object, and resetting the rlimit. If the load succeeds then
92 * memcg-based accounting is supported.
94 * This would be too dangerous to do in the library, because multithreaded
95 * applications might attempt to load items while the rlimit is at 0. Given
96 * that bpftool is single-threaded, this is fine to do here.
// Probe whether BPF memory is still charged against RLIMIT_MEMLOCK.
// Visible steps: describe a minimal socket filter program, save the current
// limit, lower the soft limit to zero, attempt the load through the raw bpf()
// syscall, then restore the saved limit.
// NOTE(review): how the load result is turned into the return value is not
// visible here; confirm against the complete function.
98 static bool known_to_need_rlimit(void)
100 struct rlimit rlim_init, rlim_cur_zero = {};
101 struct bpf_insn insns[] = {
102 BPF_MOV64_IMM(BPF_REG_0, 0),
105 size_t insn_cnt = ARRAY_SIZE(insns);
// Fill in the load attributes by hand; the program is passed to the kernel
// directly rather than through libbpf.
109 memset(&attr, 0, sizeof(attr));
110 attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
111 attr.insns = ptr_to_u64(insns);
112 attr.insn_cnt = insn_cnt;
113 attr.license = ptr_to_u64("GPL");
// Remember the current limits so that they can be restored after the probe.
115 if (getrlimit(RLIMIT_MEMLOCK, &rlim_init))
118 /* Drop the soft limit to zero. We maintain the hard limit to its
119 * current value, because lowering it would be a permanent operation
120 * for unprivileged users.
122 rlim_cur_zero.rlim_max = rlim_init.rlim_max;
123 if (setrlimit(RLIMIT_MEMLOCK, &rlim_cur_zero))
126 /* Do not use bpf_prog_load() from libbpf here, because it calls
127 * bump_rlimit_memlock(), interfering with the current probe.
129 prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
132 /* reset soft rlimit to its initial value */
133 setrlimit(RLIMIT_MEMLOCK, &rlim_init);
/* Raise both the soft and the hard RLIMIT_MEMLOCK to RLIM_INFINITY, but only
 * when known_to_need_rlimit() reports that the limit still matters.
 * The result of setrlimit() is not checked.
 */
142 void set_max_rlimit(void)
144 struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
146 if (known_to_need_rlimit())
147 setrlimit(RLIMIT_MEMLOCK, &rinf);
/* Mount a file system of type @type on @target.
 * @buff/@bufflen receive a human-readable description of the step that
 * failed, formatted like the equivalent mount(8) command line.
 * NOTE(review): the return values are not visible here.
 */
151 mnt_fs(const char *target, const char *type, char *buff, size_t bufflen)
153 bool bind_done = false;
/* First make @target a private mount, recursively. EINVAL is tolerated as
 * long as bind_done is still false; any other failure is reported.
 */
155 while (mount("", target, "none", MS_PRIVATE | MS_REC, NULL)) {
156 if (errno != EINVAL || bind_done) {
157 snprintf(buff, bufflen,
158 "mount --make-private %s failed: %s",
159 target, strerror(errno));
/* Bind-mount @target onto itself before the make-private step is retried
 * (presumably because EINVAL means @target is not a mount point yet).
 */
163 if (mount(target, target, "none", MS_BIND, NULL)) {
164 snprintf(buff, bufflen,
165 "mount --bind %s %s failed: %s",
166 target, target, strerror(errno));
/* The actual mount: @type is used both as source and as file system type. */
173 if (mount(type, target, type, 0, "mode=0700")) {
174 snprintf(buff, bufflen, "mount -t %s %s %s failed: %s",
175 type, type, target, strerror(errno));
/* Mount tracefs on @target through mnt_fs() and report a failure with
 * p_err(), using the description mnt_fs() left in err_str.
 */
182 int mount_tracefs(const char *target)
184 char err_str[ERR_MAX_LEN];
187 err = mnt_fs(target, "tracefs", err_str, ERR_MAX_LEN);
/* Force termination of the message before it is printed. */
189 err_str[ERR_MAX_LEN - 1] = '\0';
190 p_err("can't mount tracefs: %s", err_str);
/* Open the BPF object pinned at @path with bpf_obj_get().
 * The path is duplicated first; the copy is what dirname() is later applied
 * to (dirname() may modify its argument). When the failure is EACCES and the
 * containing directory is not on bpffs, the error message says so.
 * NOTE(review): @quiet presumably suppresses the error message; the code
 * using it is not visible here.
 */
196 int open_obj_pinned(const char *path, bool quiet)
201 pname = strdup(path);
204 p_err("mem alloc failed");
208 fd = bpf_obj_get(pname);
211 p_err("bpf obj get (%s): %s", pname,
212 errno == EACCES && !is_bpffs(dirname(pname)) ?
213 "directory not in bpf file system (bpffs)" :
/* Open the object pinned at @path (errors are reported, not silenced) and
 * check that its type, as determined by get_fd_type(), equals @exp_type.
 * A mismatch is reported with the name of the type actually found.
 */
224 int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type)
226 enum bpf_obj_type type;
229 fd = open_obj_pinned(path, false);
233 type = get_fd_type(fd);
238 if (type != exp_type) {
239 p_err("incorrect object type: %s", get_fd_type_name(type));
/* Make sure @dir_name can be used to pin BPF objects.
 * Visible steps: check whether @dir_name is already on bpffs; record whether
 * it exists; inspect its parent directory (on bpffs? existing?); create the
 * directory with mode S_IRWXU; mount a BPF file system on it via mnt_fs().
 * NOTE(review): the conditions linking these steps and the return values are
 * not visible here; confirm against the complete function.
 */
247 int create_and_mount_bpffs_dir(const char *dir_name)
249 char err_str[ERR_MAX_LEN];
253 if (is_bpffs(dir_name))
256 dir_exists = access(dir_name, F_OK) == 0;
/* dirname() is applied to a private copy of the path, since it may modify
 * its argument.
 */
262 temp_name = strdup(dir_name);
264 p_err("mem alloc failed");
268 parent_name = dirname(temp_name);
270 if (is_bpffs(parent_name)) {
271 /* nothing to do if already mounted */
276 if (access(parent_name, F_OK) == -1) {
277 p_err("can't create dir '%s' to pin BPF object: parent dir '%s' doesn't exist",
278 dir_name, parent_name);
287 p_err("no BPF file system found, not mounting it due to --nomount option");
/* Owner-only permissions for the newly created directory. */
292 err = mkdir(dir_name, S_IRWXU);
294 p_err("failed to create dir '%s': %s", dir_name, strerror(errno));
299 err = mnt_fs(dir_name, "bpf", err_str, ERR_MAX_LEN);
/* Force termination of the message before it is printed. */
301 err_str[ERR_MAX_LEN - 1] = '\0';
302 p_err("can't mount BPF file system on given dir '%s': %s",
/* Prepare the directory containing @file_name for pinning a BPF object.
 * Visible steps: refuse a @file_name that already exists, derive the parent
 * directory, require that it exists, and mount a BPF file system on it via
 * mnt_fs().
 * NOTE(review): the bpffs check guarding the "already mounted" case and the
 * return values are not visible here.
 */
312 int mount_bpffs_for_file(const char *file_name)
314 char err_str[ERR_MAX_LEN];
/* Pinning never overwrites: an existing path is an error. */
319 if (access(file_name, F_OK) != -1) {
320 p_err("can't pin BPF object: path '%s' already exists", file_name);
/* dirname() is applied to a private copy, since it may modify its argument. */
324 temp_name = strdup(file_name);
326 p_err("mem alloc failed");
330 dir = dirname(temp_name);
333 /* nothing to do if already mounted */
336 if (access(dir, F_OK) == -1) {
337 p_err("can't pin BPF object: dir '%s' doesn't exist", dir);
343 p_err("no BPF file system found, not mounting it due to --nomount option");
348 err = mnt_fs(dir, "bpf", err_str, ERR_MAX_LEN);
/* Force termination of the message before it is printed. */
350 err_str[ERR_MAX_LEN - 1] = '\0';
351 p_err("can't mount BPF file system to pin the object '%s': %s",
/* Pin the BPF object referred to by @fd at path @name: first prepare the
 * target location with mount_bpffs_for_file(), then call bpf_obj_pin().
 * A pinning failure is reported together with strerror(errno).
 */
360 int do_pin_fd(int fd, const char *name)
364 err = mount_bpffs_for_file(name);
368 err = bpf_obj_pin(fd, name);
370 p_err("can't pin the object (%s): %s", name, strerror(errno));
/* Generic "pin" command helper: @get_fd parses an object handle from the
 * argument list (it receives pointers to argc/argv, so it can advance them)
 * and yields a file descriptor, which is then pinned at the path found in
 * *argv.
 */
375 int do_pin_any(int argc, char **argv, int (*get_fd)(int *, char ***))
383 fd = get_fd(&argc, &argv);
387 err = do_pin_fd(fd, *argv);
/* Map a BPF object type to a short printable name ("prog", "map", "link").
 * Anything outside the table, or without an entry in it, is reported as
 * "unknown".
 */
393 const char *get_fd_type_name(enum bpf_obj_type type)
395 static const char * const names[] = {
396 [BPF_OBJ_UNKNOWN] = "unknown",
397 [BPF_OBJ_PROG] = "prog",
398 [BPF_OBJ_MAP] = "map",
399 [BPF_OBJ_LINK] = "link",
/* Guard the lookup: reject out-of-range values and holes in the table. */
402 if (type < 0 || type >= ARRAY_SIZE(names) || !names[type])
403 return names[BPF_OBJ_UNKNOWN];
/* Write the name of a program into @name_buff (at most @buff_len bytes).
 * The starting point is prog_info->name; when the visible BTF lookup
 * succeeds, the name recorded in BTF for the program's function is used
 * instead.
 * @prog_info: info previously obtained for the program
 * @prog_fd:   descriptor of the same program, used to query its func_info
 */
408 void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd,
409 char *name_buff, size_t buff_len)
411 const char *prog_name = prog_info->name;
412 const struct btf_type *func_type;
413 struct bpf_func_info finfo = {};
414 struct bpf_prog_info info = {};
415 __u32 info_len = sizeof(info);
416 struct btf *prog_btf = NULL;
/* Presumably the shortcut to the plain copy: taken when the caller's buffer
 * cannot hold more than BPF_OBJ_NAME_LEN bytes anyway, or when the reported
 * name is shorter than the maximum (so it cannot have been cut off).
 */
418 if (buff_len <= BPF_OBJ_NAME_LEN ||
419 strlen(prog_info->name) < BPF_OBJ_NAME_LEN - 1)
422 if (!prog_info->btf_id || prog_info->nr_func_info == 0)
/* Ask for a single func_info record, clamped to the size of finfo. */
425 info.nr_func_info = 1;
426 info.func_info_rec_size = prog_info->func_info_rec_size;
427 if (info.func_info_rec_size > sizeof(finfo))
428 info.func_info_rec_size = sizeof(finfo);
429 info.func_info = ptr_to_u64(&finfo);
431 if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len))
434 prog_btf = btf__load_from_kernel_by_id(info.btf_id);
/* Resolve the record's type ID; only a FUNC type provides a usable name. */
438 func_type = btf__type_by_id(prog_btf, finfo.type_id);
439 if (!func_type || !btf_is_func(func_type))
442 prog_name = btf__name_by_offset(prog_btf, func_type->name_off);
445 snprintf(name_buff, buff_len, "%s", prog_name);
451 int get_fd_type(int fd)
457 snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
459 n = readlink(path, buf, sizeof(buf));
461 p_err("can't read link type: %s", strerror(errno));
464 if (n == sizeof(path)) {
465 p_err("can't read link type: path too long!");
469 if (strstr(buf, "bpf-map"))
471 else if (strstr(buf, "bpf-prog"))
473 else if (strstr(buf, "bpf-link"))
476 return BPF_OBJ_UNKNOWN;
/* Look up @key in /proc/self/fdinfo/<fd>.
 * Lines are read with getline(); on a line containing @key, the text that
 * follows the first tab is moved to the start of the line buffer and its
 * last character is replaced by a terminator.
 * NOTE(review): the return statements are not visible here; presumably the
 * getline() buffer is returned and must be freed by the caller - confirm.
 */
479 char *get_fdinfo(int fd, const char *key)
487 snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
489 fdi = fopen(path, "r");
493 while ((n = getline(&line, &line_n, fdi)) > 0) {
/* @key is matched anywhere in the line, not only at its start. */
497 if (!strstr(line, key))
/* A line without a tab, or with nothing after it, carries no value. */
502 value = strchr(line, '\t');
503 if (!value || !value[1]) {
/* Source and destination overlap within the same buffer, hence memmove(). */
510 memmove(line, value, len);
511 line[len - 1] = '\0';
/* Emit the @len bytes at @data as a JSON array of decimal numbers, one
 * element per byte, through json_wtr.
 */
521 void print_data_json(uint8_t *data, size_t len)
525 jsonw_start_array(json_wtr);
526 for (i = 0; i < len; i++)
527 jsonw_printf(json_wtr, "%d", data[i]);
528 jsonw_end_array(json_wtr);
/* Emit the @len bytes at @data as a JSON array of strings, one per byte,
 * each formatted as "0x" followed by two hexadecimal digits. The quotes are
 * part of the format, so the elements are JSON strings rather than numbers.
 */
531 void print_hex_data_json(uint8_t *data, size_t len)
535 jsonw_start_array(json_wtr);
536 for (i = 0; i < len; i++)
537 jsonw_printf(json_wtr, "\"0x%02hhx\"", data[i]);
538 jsonw_end_array(json_wtr);
541 /* extra params for nftw cb */
542 static struct hashmap *build_fn_table;
543 static enum bpf_obj_type build_fn_type;
/* nftw() callback: for a regular file, try to open it as a pinned BPF
 * object; if its type equals build_fn_type, record a copy of its path in
 * build_fn_table under the object's ID.
 * The two file-scope variables above carry what nftw() cannot pass along.
 * NOTE(review): the return values (which decide whether the walk goes on)
 * are not visible here.
 */
545 static int do_build_table_cb(const char *fpath, const struct stat *sb,
546 int typeflag, struct FTW *ftwbuf)
548 struct bpf_prog_info pinned_info;
549 __u32 len = sizeof(pinned_info);
550 enum bpf_obj_type objtype;
/* Only regular files are of interest. */
554 if (typeflag != FTW_F)
/* Quiet mode: the walk is expected to meet files that are not BPF objects. */
557 fd = open_obj_pinned(fpath, true);
561 objtype = get_fd_type(fd);
562 if (objtype != build_fn_type)
/* NOTE(review): a bpf_prog_info is filled in whatever build_fn_type is, and
 * only its id member is read afterwards - presumably relying on the ID
 * being laid out identically for other object types; confirm.
 */
565 memset(&pinned_info, 0, sizeof(pinned_info));
566 if (bpf_prog_get_info_by_fd(fd, &pinned_info, &len))
/* The table stores its own copy of the path. */
569 path = strdup(fpath);
575 err = hashmap__append(build_fn_table, pinned_info.id, path);
577 p_err("failed to append entry to hashmap for ID %u, path '%s': %s",
578 pinned_info.id, path, strerror(errno));
/* Fill @tab with the pinned objects of type @type: every mount listed in
 * /proc/mounts whose file system type starts with "bpf" is walked with
 * nftw(), using do_build_table_cb() as callback.
 * @tab and @type reach the callback through build_fn_table/build_fn_type.
 */
589 int build_pinned_obj_table(struct hashmap *tab,
590 enum bpf_obj_type type)
592 struct mntent *mntent = NULL;
593 FILE *mntfile = NULL;
/* FTW_PHYS: symbolic links are not followed during the walk. */
594 int flags = FTW_PHYS;
598 mntfile = setmntent("/proc/mounts", "r");
602 build_fn_table = tab;
603 build_fn_type = type;
605 while ((mntent = getmntent(mntfile))) {
606 char *path = mntent->mnt_dir;
/* Only the first three characters of the type are compared. */
608 if (strncmp(mntent->mnt_type, "bpf", 3) != 0)
610 err = nftw(path, do_build_table_cb, nopenfd, flags);
/* Tear down a table of pinned objects by iterating over every entry of @map.
 * NOTE(review): the per-entry action and the final release of @map are not
 * visible here - presumably the stored path strings and the map itself are
 * freed; confirm.
 */
618 void delete_pinned_obj_table(struct hashmap *map)
620 struct hashmap_entry *entry;
626 hashmap__for_each_entry(map, entry, bkt)
/* Return the system page size as obtained from getpagesize().
 * NOTE(review): the declaration of result is not visible here; it may be
 * cached across calls - confirm.
 */
632 unsigned int get_page_size(void)
637 result = getpagesize();
/* Return the number of possible CPUs as reported by libbpf.
 * A failure is reported through p_err(); libbpf signals it with a negative
 * errno value, hence the negation passed to strerror().
 * NOTE(review): what happens after the error message is not visible here.
 */
641 unsigned int get_possible_cpus(void)
643 int cpus = libbpf_num_possible_cpus();
646 p_err("Can't get # of possible cpus: %s", strerror(-cpus));
/* Translate @ifindex into an interface name stored in @buf, but only when
 * the network namespace identified by @ns_dev/@ns_ino is the one of the
 * current process (device and inode of /proc/self/ns/net).
 * The result is that of if_indextoname(), i.e. @buf on success.
 * NOTE(review): the namespace identifiers are declared __u32 here while the
 * callers in this file hold them as __u64, so they are narrowed on the way
 * in before being compared with st_dev/st_ino - confirm this is intended.
 * NOTE(review): the error message mentions /proc/self although the path
 * passed to stat() is /proc/self/ns/net.
 */
653 ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf)
658 err = stat("/proc/self/ns/net", &st);
660 p_err("Can't stat /proc/self: %s", strerror(errno));
/* A different namespace: the index would not designate the same device. */
664 if (st.st_dev != ns_dev || st.st_ino != ns_ino)
667 return if_indextoname(ifindex, buf);
/* Read a short numeric value from the file at @path and return it as int.
 * The text is parsed by strtol() with base 0, so a "0x" prefix selects
 * hexadecimal.
 * NOTE(review): the values returned on the error paths are not visible here.
 */
670 static int read_sysfs_hex_int(char *path)
672 char vendor_id_buf[8];
676 fd = open(path, O_RDONLY);
678 p_err("Can't open %s: %s", path, strerror(errno));
682 len = read(fd, vendor_id_buf, sizeof(vendor_id_buf));
685 p_err("Can't read %s: %s", path, strerror(errno));
/* A read that fills the whole buffer leaves no room for the terminator
 * written below, so it is rejected.
 */
688 if (len >= (int)sizeof(vendor_id_buf)) {
689 p_err("Value in %s too long", path);
693 vendor_id_buf[len] = 0;
695 return strtol(vendor_id_buf, NULL, 0);
/* Read the numeric attribute @entry_name of network device @devname, i.e.
 * the file /sys/class/net/<devname>/device/<entry_name>, through
 * read_sysfs_hex_int().
 */
698 static int read_sysfs_netdev_hex_int(char *devname, const char *entry_name)
702 snprintf(full_path, sizeof(full_path), "/sys/class/net/%s/device/%s",
703 devname, entry_name);
705 return read_sysfs_hex_int(full_path);
/* Determine the architecture of the device a program is offloaded to.
 * The device name is resolved from @ifindex within the namespace given by
 * @ns_dev/@ns_ino, then its PCI vendor ID is read from sysfs.
 * NOTE(review): the vendor switch, the use of @opt and the return values are
 * not visible here; confirm against the complete function.
 */
709 ifindex_to_arch(__u32 ifindex, __u64 ns_dev, __u64 ns_ino, const char **opt)
711 __maybe_unused int device_id;
712 char devname[IF_NAMESIZE];
715 if (!ifindex_to_name_ns(ifindex, ns_dev, ns_ino, devname)) {
716 p_err("Can't get net device name for ifindex %d: %s", ifindex,
721 vendor_id = read_sysfs_netdev_hex_int(devname, "vendor");
723 p_err("Can't get device vendor id for %s", devname);
728 #ifdef HAVE_LIBBFD_SUPPORT
/* The device ID is only consulted when libbfd support is compiled in, which
 * is why device_id is marked __maybe_unused above.
 */
730 device_id = read_sysfs_netdev_hex_int(devname, "device");
731 if (device_id != 0x4000 &&
732 device_id != 0x6000 &&
734 p_info("Unknown NFP device ID, assuming it is NFP-6xxx arch");
737 #endif /* HAVE_LIBBFD_SUPPORT */
738 /* No NFP support in LLVM, we have no valid triple to return. */
740 p_err("Can't get arch name for device vendor id 0x%04x",
/* Plain-text description of the device a program or map is offloaded to.
 * After the " offloaded_to " label, the interface name is looked up; the
 * raw ifindex/ns_dev/ns_ino triple is printed as well.
 * NOTE(review): what is printed when the name lookup succeeds, and whether
 * the raw triple is only a fallback, is not visible here.
 */
746 void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
748 char name[IF_NAMESIZE];
753 printf(" offloaded_to ");
754 if (ifindex_to_name_ns(ifindex, ns_dev, ns_inode, name))
757 printf("ifindex %u ns_dev %llu ns_ino %llu",
758 ifindex, ns_dev, ns_inode);
/* JSON description of the device a program or map is offloaded to: a "dev"
 * object with the members "ifindex", "ns_dev" and "ns_inode", plus "ifname"
 * when the interface name can be resolved in the current namespace.
 */
761 void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
763 char name[IF_NAMESIZE];
768 jsonw_name(json_wtr, "dev");
769 jsonw_start_object(json_wtr);
770 jsonw_uint_field(json_wtr, "ifindex", ifindex);
771 jsonw_uint_field(json_wtr, "ns_dev", ns_dev);
772 jsonw_uint_field(json_wtr, "ns_inode", ns_inode);
/* The name is optional: it is only added when the lookup succeeds. */
773 if (ifindex_to_name_ns(ifindex, ns_dev, ns_inode, name))
774 jsonw_string_field(json_wtr, "ifname", name);
775 jsonw_end_object(json_wtr);
/* Parse the current command-line argument as an unsigned number into *@val.
 * @what names the value in error messages ("<what> already specified",
 * "can't parse <arg> as <what>"). strtoul() is used with base 0, so decimal,
 * octal and hexadecimal notations are accepted.
 * NOTE(review): the checks that trigger the two messages, and the way
 * argc/argv are advanced, are not visible here.
 */
778 int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what)
785 p_err("%s already specified", what);
789 *val = strtoul(**argv, &endptr, 0);
791 p_err("can't parse %s as %s", **argv, what);
/* libbpf print callback that forwards every message to stderr: the level
 * argument is ignored, so nothing is filtered out.
 * Returns what vfprintf() returns.
 */
800 print_all_levels(__maybe_unused enum libbpf_print_level level,
801 const char *format, va_list args)
803 return vfprintf(stderr, format, args);
/* Collect file descriptors of all loaded programs matching @nametag.
 * @nametag: BPF_TAG_SIZE raw tag bytes when @tag is true, otherwise a
 *           program name string
 * @fds:     array of descriptors, grown with realloc() for each match
 * NOTE(review): the return values are not visible here - presumably the
 * number of descriptors stored, or a negative value on error; confirm.
 */
806 static int prog_fd_by_nametag(void *nametag, int **fds, bool tag)
808 char prog_name[MAX_PROG_FULL_NAME];
815 struct bpf_prog_info info = {};
816 __u32 len = sizeof(info);
/* Step to the next program ID. ENOENT is not reported as an error -
 * presumably it marks the end of the list.
 */
818 err = bpf_prog_get_next_id(id, &id);
820 if (errno != ENOENT) {
821 p_err("%s", strerror(errno));
827 fd = bpf_prog_get_fd_by_id(id);
829 p_err("can't get prog by id (%u): %s",
830 id, strerror(errno));
834 err = bpf_prog_get_info_by_fd(fd, &info, &len);
836 p_err("can't get prog info (%u): %s",
837 id, strerror(errno));
/* Tag mode compares raw bytes; name mode compares against the full name
 * computed by get_prog_full_name().
 */
841 if (tag && memcmp(nametag, info.tag, BPF_TAG_SIZE)) {
847 get_prog_full_name(&info, fd, prog_name,
849 if (strncmp(nametag, prog_name, sizeof(prog_name))) {
/* Grow through a temporary so that *fds stays valid if realloc() fails. */
856 tmp = realloc(*fds, (nb_fds + 1) * sizeof(int));
858 p_err("failed to realloc");
863 (*fds)[nb_fds++] = fd;
/* Close every descriptor collected so far (presumably the error path). */
869 while (--nb_fds >= 0)
870 close((*fds)[nb_fds]);
/* Parse a program handle from the command line and store the matching file
 * descriptor(s) in *@fds. Accepted keywords, matched by prefix:
 *   id     - numeric program ID
 *   tag    - program tag, parsed with BPF_TAG_FMT
 *   name   - program name, at most MAX_PROG_FULL_NAME - 1 characters
 *   pinned - path of a pinned program
 * Any other keyword is reported as an error.
 * NOTE(review): the handling of argc/argv and the return values of the "id"
 * and "pinned" branches are not visible here.
 */
874 int prog_parse_fds(int *argc, char ***argv, int **fds)
876 if (is_prefix(**argv, "id")) {
/* Base 0: decimal, octal and hexadecimal IDs are accepted. */
882 id = strtoul(**argv, &endptr, 0);
884 p_err("can't parse %s as ID", **argv);
889 (*fds)[0] = bpf_prog_get_fd_by_id(id);
891 p_err("get by id (%u): %s", id, strerror(errno));
895 } else if (is_prefix(**argv, "tag")) {
896 unsigned char tag[BPF_TAG_SIZE];
/* One conversion target per tag byte. */
900 if (sscanf(**argv, BPF_TAG_FMT, tag, tag + 1, tag + 2,
901 tag + 3, tag + 4, tag + 5, tag + 6, tag + 7)
903 p_err("can't parse tag");
908 return prog_fd_by_nametag(tag, fds, true);
909 } else if (is_prefix(**argv, "name")) {
/* Longer names cannot match anything, so they are rejected early. */
915 if (strlen(name) > MAX_PROG_FULL_NAME - 1) {
916 p_err("can't parse name");
921 return prog_fd_by_nametag(name, fds, false);
922 } else if (is_prefix(**argv, "pinned")) {
/* The pinned object must be a program. */
930 (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_PROG);
936 p_err("expected 'id', 'tag', 'name' or 'pinned', got: '%s'?", **argv);
/* Parse a program handle that must designate exactly one program.
 * A one-element array is allocated for prog_parse_fds(); more than one match
 * is reported as "several programs match this handle".
 * NOTE(review): the return value and the release of the array are not
 * visible here.
 */
940 int prog_parse_fd(int *argc, char ***argv)
945 fds = malloc(sizeof(int));
947 p_err("mem alloc failed");
950 nb_fds = prog_parse_fds(argc, argv, &fds);
953 p_err("several programs match this handle");
/* Collect file descriptors of all maps whose name equals @name.
 * @fds: array of descriptors, grown with realloc() for each match
 * NOTE(review): the return values are not visible here - presumably the
 * number of descriptors stored, or a negative value on error; confirm.
 */
967 static int map_fd_by_name(char *name, int **fds)
975 struct bpf_map_info info = {};
976 __u32 len = sizeof(info);
/* Step to the next map ID. ENOENT is not reported as an error - presumably
 * it marks the end of the list.
 */
978 err = bpf_map_get_next_id(id, &id);
980 if (errno != ENOENT) {
981 p_err("%s", strerror(errno));
987 fd = bpf_map_get_fd_by_id(id);
989 p_err("can't get map by id (%u): %s",
990 id, strerror(errno));
994 err = bpf_map_get_info_by_fd(fd, &info, &len);
996 p_err("can't get map info (%u): %s",
997 id, strerror(errno));
/* The comparison is bounded by BPF_OBJ_NAME_LEN. */
1001 if (strncmp(name, info.name, BPF_OBJ_NAME_LEN)) {
/* Grow through a temporary so that *fds stays valid if realloc() fails. */
1007 tmp = realloc(*fds, (nb_fds + 1) * sizeof(int));
1009 p_err("failed to realloc");
1014 (*fds)[nb_fds++] = fd;
/* Close every descriptor collected so far (presumably the error path). */
1020 while (--nb_fds >= 0)
1021 close((*fds)[nb_fds]);
/* Parse a map handle from the command line and store the matching file
 * descriptor(s) in *@fds. Accepted keywords, matched by prefix:
 *   id     - numeric map ID
 *   name   - map name, at most BPF_OBJ_NAME_LEN - 1 characters
 *   pinned - path of a pinned map
 * Any other keyword is reported as an error.
 * NOTE(review): the handling of argc/argv and the return values of the "id"
 * and "pinned" branches are not visible here.
 */
1025 int map_parse_fds(int *argc, char ***argv, int **fds)
1027 if (is_prefix(**argv, "id")) {
/* Base 0: decimal, octal and hexadecimal IDs are accepted. */
1033 id = strtoul(**argv, &endptr, 0);
1035 p_err("can't parse %s as ID", **argv);
1040 (*fds)[0] = bpf_map_get_fd_by_id(id);
1041 if ((*fds)[0] < 0) {
1042 p_err("get map by id (%u): %s", id, strerror(errno));
1046 } else if (is_prefix(**argv, "name")) {
/* Longer names cannot match anything, so they are rejected early. */
1052 if (strlen(name) > BPF_OBJ_NAME_LEN - 1) {
1053 p_err("can't parse name");
1058 return map_fd_by_name(name, fds);
1059 } else if (is_prefix(**argv, "pinned")) {
/* The pinned object must be a map. */
1067 (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_MAP);
1073 p_err("expected 'id', 'name' or 'pinned', got: '%s'?", **argv);
/* Parse a map handle that must designate exactly one map.
 * A one-element array is allocated for map_parse_fds(); more than one match
 * is reported as "several maps match this handle".
 * NOTE(review): the return value and the release of the array are not
 * visible here.
 */
1077 int map_parse_fd(int *argc, char ***argv)
1082 fds = malloc(sizeof(int));
1084 p_err("mem alloc failed");
1087 nb_fds = map_parse_fds(argc, argv, &fds);
1090 p_err("several maps match this handle");
/* Parse a map handle with map_parse_fd() and fetch the map's information
 * into @info through bpf_map_get_info_by_fd().
 * NOTE(review): the remaining parameter (used as info_len below), the
 * return value and the fate of the descriptor on failure are not visible
 * here.
 */
1104 int map_parse_fd_and_info(int *argc, char ***argv, struct bpf_map_info *info,
1110 fd = map_parse_fd(argc, argv);
1114 err = bpf_map_get_info_by_fd(fd, info, info_len);
1116 p_err("can't get map info: %s", strerror(errno));
/* Hash callback taking a key and an opaque context.
 * NOTE(review): the body is not visible here; judging by the name, @key
 * presumably holds an object ID directly - confirm.
 */
1124 size_t hash_fn_for_key_as_id(long key, void *ctx)
/* Key-equality callback taking two keys and an opaque context.
 * NOTE(review): the body is not visible here; judging by the name, the keys
 * presumably hold object IDs that are compared directly - confirm.
 */
1129 bool equal_fn_for_key_as_id(long k1, long k2, void *ctx)
/* Return the keyword associated with attach type @t.
 * The attach types listed below have a dedicated string here; every other
 * value is delegated to libbpf_bpf_attach_type_str().
 */
1134 const char *bpf_attach_type_input_str(enum bpf_attach_type t)
1137 case BPF_CGROUP_INET_INGRESS: return "ingress";
1138 case BPF_CGROUP_INET_EGRESS: return "egress";
1139 case BPF_CGROUP_INET_SOCK_CREATE: return "sock_create";
1140 case BPF_CGROUP_INET_SOCK_RELEASE: return "sock_release";
1141 case BPF_CGROUP_SOCK_OPS: return "sock_ops";
1142 case BPF_CGROUP_DEVICE: return "device";
1143 case BPF_CGROUP_INET4_BIND: return "bind4";
1144 case BPF_CGROUP_INET6_BIND: return "bind6";
1145 case BPF_CGROUP_INET4_CONNECT: return "connect4";
1146 case BPF_CGROUP_INET6_CONNECT: return "connect6";
1147 case BPF_CGROUP_INET4_POST_BIND: return "post_bind4";
1148 case BPF_CGROUP_INET6_POST_BIND: return "post_bind6";
1149 case BPF_CGROUP_INET4_GETPEERNAME: return "getpeername4";
1150 case BPF_CGROUP_INET6_GETPEERNAME: return "getpeername6";
1151 case BPF_CGROUP_INET4_GETSOCKNAME: return "getsockname4";
1152 case BPF_CGROUP_INET6_GETSOCKNAME: return "getsockname6";
1153 case BPF_CGROUP_UDP4_SENDMSG: return "sendmsg4";
1154 case BPF_CGROUP_UDP6_SENDMSG: return "sendmsg6";
1155 case BPF_CGROUP_SYSCTL: return "sysctl";
1156 case BPF_CGROUP_UDP4_RECVMSG: return "recvmsg4";
1157 case BPF_CGROUP_UDP6_RECVMSG: return "recvmsg6";
1158 case BPF_CGROUP_GETSOCKOPT: return "getsockopt";
1159 case BPF_CGROUP_SETSOCKOPT: return "setsockopt";
1160 case BPF_TRACE_RAW_TP: return "raw_tp";
1161 case BPF_TRACE_FENTRY: return "fentry";
1162 case BPF_TRACE_FEXIT: return "fexit";
1163 case BPF_MODIFY_RETURN: return "mod_ret";
1164 case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select";
1165 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate";
/* Everything else uses the name libbpf knows the attach type by. */
1166 default: return libbpf_bpf_attach_type_str(t);
1170 int pathname_concat(char *buf, int buf_sz, const char *path,
1175 len = snprintf(buf, buf_sz, "%s/%s", path, name);
1179 return -ENAMETOOLONG;