name: hash
doc:
Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash().
+ -
+ name: vlan-tag
+ doc:
+ Device is capable of exposing receive packet VLAN tag via bpf_xdp_metadata_rx_vlan_tag().
-
type: flags
name: xsk-flags
name: tx-checksum
doc:
L3 checksum HW offload is supported by the driver.
+ -
+ name: queue-type
+ type: enum
+ entries: [ rx, tx ]
attribute-sets:
-
name: recycle-released-refcnt
type: uint
+ -
+ name: napi
+ attributes:
+ -
+ name: ifindex
+ doc: ifindex of the netdevice to which NAPI instance belongs.
+ type: u32
+ checks:
+ min: 1
+ -
+ name: id
+ doc: ID of the NAPI instance.
+ type: u32
+ -
+ name: irq
+ doc: The associated interrupt vector number for the NAPI instance.
+ type: u32
+ -
+ name: pid
+ doc: PID of the napi thread, if NAPI is configured to operate in
+ threaded mode. If NAPI is not in threaded mode (i.e. uses normal
+ softirq context), the attribute will be absent.
+ type: u32
+ -
+ name: queue
+ attributes:
+ -
+ name: id
+ doc: Queue index; most queue types are indexed like a C array, with
+ indexes starting at 0 and ending at queue count - 1. Queue indexes
+ are scoped to an interface and queue type.
+ type: u32
+ -
+ name: ifindex
+ doc: ifindex of the netdevice to which the queue belongs.
+ type: u32
+ checks:
+ min: 1
+ -
+ name: type
+ doc: Queue type as rx, tx. Each queue type defines a separate ID space.
+ type: u32
+ enum: queue-type
+ -
+ name: napi-id
+ doc: ID of the NAPI instance which services this queue.
+ type: u32
+
operations:
list:
-
dump:
reply: *pp-stats-reply
config-cond: page-pool-stats
+ -
+ name: queue-get
+ doc: Get queue information from the kernel.
+ Only configured queues will be reported (as opposed to all available
+ hardware queues).
+ attribute-set: queue
+ do:
+ request:
+ attributes:
+ - ifindex
+ - type
+ - id
+ reply: &queue-get-op
+ attributes:
+ - id
+ - type
+ - napi-id
+ - ifindex
+ dump:
+ request:
+ attributes:
+ - ifindex
+ reply: *queue-get-op
+ -
+ name: napi-get
+ doc: Get information about NAPI instances configured on the system.
+ attribute-set: napi
+ do:
+ request:
+ attributes:
+ - id
+ reply: &napi-get-op
+ attributes:
+ - id
+ - ifindex
+ - irq
+ - pid
+ dump:
+ request:
+ attributes:
+ - ifindex
+ reply: *napi-get-op
mcast-groups:
list:
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
#include <asm/unwind.h>
+ #include <asm/cfi.h>
static bool all_callee_regs_used[4] = {true, true, true, true};
do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
#ifdef CONFIG_X86_KERNEL_IBT
- #define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+ #define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+ #define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4)
#else
#define EMIT_ENDBR()
+ #define EMIT_ENDBR_POISON()
#endif
static bool is_imm8(int value)
*pprog = prog;
}
+ /*
+ * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
+ * in arch/x86/kernel/alternative.c
+ */
+
+ /*
+  * Emit the FineIBT preamble at *pprog and advance *pprog past it.
+  *
+  * @pprog: in/out cursor into the code buffer being written.
+  * @hash:  32-bit value encoded as the immediate of the subl below.
+  *
+  * Emitted sequence: ENDBR, "subl $hash, %r10d", "jz +7", then the 7 bytes
+  * the jz skips: ud2 (2 bytes), nop (1 byte) and, when
+  * CONFIG_X86_KERNEL_IBT is enabled, a 4-byte poisoned ENDBR. If the
+  * subtraction does not yield zero, execution falls into the ud2.
+  */
+ static void emit_fineibt(u8 **pprog, u32 hash)
+ {
+ u8 *prog = *pprog;
+
+ EMIT_ENDBR();
+ EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */
+ EMIT2(0x74, 0x07); /* jz.d8 +7 */
+ EMIT2(0x0f, 0x0b); /* ud2 */
+ EMIT1(0x90); /* nop */
+ EMIT_ENDBR_POISON();
+
+ *pprog = prog;
+ }
+
+ /*
+  * Emit the kCFI preamble at *pprog and advance *pprog past it.
+  *
+  * @pprog: in/out cursor into the code buffer being written.
+  * @hash:  32-bit type hash encoded as the immediate of the movl.
+  *
+  * Layout: a 5-byte "movl $hash, %eax", then (only with
+  * CONFIG_CALL_PADDING) eleven single-byte NOPs, then ENDBR.
+  */
+ static void emit_kcfi(u8 **pprog, u32 hash)
+ {
+ 	u8 *prog = *pprog;
+ #ifdef CONFIG_CALL_PADDING
+ 	int pad;
+ #endif
+
+ 	EMIT1_off32(0xb8, hash);	/* movl $hash, %eax */
+ #ifdef CONFIG_CALL_PADDING
+ 	/*
+ 	 * 5 bytes of movl + 11 NOPs = 16 bytes ahead of the ENDBR;
+ 	 * NOTE(review): presumably this matches the call-padding size.
+ 	 */
+ 	for (pad = 0; pad < 11; pad++) {
+ 		EMIT1(0x90);
+ 	}
+ #endif
+ 	EMIT_ENDBR();
+
+ 	*pprog = prog;
+ }
+
+ /*
+  * Emit the CFI preamble selected by the global cfi_mode.
+  *
+  * @pprog: in/out cursor into the code buffer being written.
+  * @hash:  type hash forwarded to the FineIBT / kCFI emitters.
+  *
+  * Any mode other than CFI_FINEIBT or CFI_KCFI gets a bare ENDBR (which
+  * expands to nothing when CONFIG_X86_KERNEL_IBT is disabled).
+  */
+ static void emit_cfi(u8 **pprog, u32 hash)
+ {
+ 	const int mode = cfi_mode;	/* read the mode once, as the switch did */
+ 	u8 *prog = *pprog;
+
+ 	if (mode == CFI_FINEIBT) {
+ 		emit_fineibt(&prog, hash);
+ 	} else if (mode == CFI_KCFI) {
+ 		emit_kcfi(&prog, hash);
+ 	} else {
+ 		EMIT_ENDBR();
+ 	}
+
+ 	*pprog = prog;
+ }
+
/*
* Emit x86-64 prologue code for BPF program.
* bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
{
u8 *prog = *pprog;
+ emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
- EMIT_ENDBR();
memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
if (!ebpf_from_cbpf) {
static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_link *l, int stack_size,
- int run_ctx_off, bool save_ret)
+ int run_ctx_off, bool save_ret,
+ void *image, void *rw_image)
{
u8 *prog = *pprog;
u8 *jmp_insn;
else
EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
- if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_enter(p), image + (prog - (u8 *)rw_image)))
return -EINVAL;
/* remember prog start time returned by __bpf_prog_enter */
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
(long) p->insnsi >> 32,
(u32) (long) p->insnsi);
/* call JITed bpf program or interpreter */
- if (emit_rsb_call(&prog, p->bpf_func, prog))
+ if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image)))
return -EINVAL;
/*
EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off);
else
EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
- if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_exit(p), image + (prog - (u8 *)rw_image)))
return -EINVAL;
*pprog = prog;
static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_links *tl, int stack_size,
- int run_ctx_off, bool save_ret)
+ int run_ctx_off, bool save_ret,
+ void *image, void *rw_image)
{
int i;
u8 *prog = *pprog;
for (i = 0; i < tl->nr_links; i++) {
if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
- run_ctx_off, save_ret))
+ run_ctx_off, save_ret, image, rw_image))
return -EINVAL;
}
*pprog = prog;
static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_links *tl, int stack_size,
- int run_ctx_off, u8 **branches)
+ int run_ctx_off, u8 **branches,
+ void *image, void *rw_image)
{
u8 *prog = *pprog;
int i;
emit_mov_imm32(&prog, false, BPF_REG_0, 0);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
for (i = 0; i < tl->nr_links; i++) {
- if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true))
+ if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true,
+ image, rw_image))
return -EINVAL;
/* mod_ret prog stored return value into [rbp - 8]. Emit:
* add rsp, 8 // skip eth_type_trans's frame
* ret // return to its caller
*/
- int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
- const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_links *tlinks,
- void *func_addr)
+ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
+ void *rw_image_end, void *image,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
{
int i, ret, nr_regs = m->nr_args, stack_size = 0;
int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
u8 *prog;
bool save_ret;
+ /*
+ * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is
+ * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG
+ * because @func_addr.
+ */
+ WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
+ (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
+
/* extra registers for struct arguments */
- for (i = 0; i < m->nr_args; i++)
+ for (i = 0; i < m->nr_args; i++) {
if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+ }
/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
* are passed through regs, the remains are through stack.
orig_call += X86_PATCH_SIZE;
}
- prog = image;
+ prog = rw_image;
- EMIT_ENDBR();
- /*
- * This is the direct-call trampoline, as such it needs accounting
- * for the __fentry__ call.
- */
- x86_call_depth_emit_accounting(&prog, NULL);
+ if (flags & BPF_TRAMP_F_INDIRECT) {
+ /*
+ * Indirect call for bpf_struct_ops
+ */
+ emit_cfi(&prog, cfi_get_func_hash(func_addr));
+ } else {
+ /*
+ * Direct-call fentry stub, as such it needs accounting for the
+ * __fentry__ call.
+ */
+ x86_call_depth_emit_accounting(&prog, NULL);
+ }
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
- if (!is_imm8(stack_size))
+ if (!is_imm8(stack_size)) {
/* sub rsp, stack_size */
EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
- else
+ } else {
/* sub rsp, stack_size */
EMIT4(0x48, 0x83, 0xEC, stack_size);
+ }
if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
EMIT1(0x50); /* push rax */
/* mov QWORD PTR [rbp - rbx_off], rbx */
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_enter,
+ image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
}
- if (fentry->nr_links)
+ if (fentry->nr_links) {
if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
- flags & BPF_TRAMP_F_RET_FENTRY_RET))
+ flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
return -EINVAL;
+ }
if (fmod_ret->nr_links) {
branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
return -ENOMEM;
if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
- run_ctx_off, branches)) {
+ run_ctx_off, branches, image, rw_image)) {
ret = -EINVAL;
goto cleanup;
}
restore_regs(m, &prog, regs_off);
save_args(m, &prog, arg_stack_off, true);
- if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before calling the original function, restore the
* tail_call_cnt from stack to rax.
*/
RESTORE_TAIL_CALL_CNT(stack_size);
+ }
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
EMIT2(0xff, 0xd3); /* call *rbx */
} else {
/* call original function */
- if (emit_rsb_call(&prog, orig_call, prog)) {
+ if (emit_rsb_call(&prog, orig_call, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
}
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- im->ip_after_call = prog;
+ im->ip_after_call = image + (prog - (u8 *)rw_image);
memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
}
/* Update the branches saved in invoke_bpf_mod_ret with the
* aligned address of do_fexit.
*/
- for (i = 0; i < fmod_ret->nr_links; i++)
- emit_cond_near_jump(&branches[i], prog, branches[i],
- X86_JNE);
+ for (i = 0; i < fmod_ret->nr_links; i++) {
+ emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
+ image + (branches[i] - (u8 *)rw_image), X86_JNE);
+ }
}
- if (fexit->nr_links)
- if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) {
+ if (fexit->nr_links) {
+ if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
+ false, image, rw_image)) {
ret = -EINVAL;
goto cleanup;
}
+ }
if (flags & BPF_TRAMP_F_RESTORE_REGS)
restore_regs(m, &prog, regs_off);
* restored to R0.
*/
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- im->ip_epilogue = prog;
+ im->ip_epilogue = image + (prog - (u8 *)rw_image);
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_exit, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
- } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before running the original function, restore the
* tail_call_cnt from stack to rax.
*/
RESTORE_TAIL_CALL_CNT(stack_size);
+ }
/* restore return value of orig_call or fentry prog back into RAX */
if (save_ret)
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
EMIT1(0xC9); /* leave */
- if (flags & BPF_TRAMP_F_SKIP_FRAME)
+ if (flags & BPF_TRAMP_F_SKIP_FRAME) {
/* skip our return address and return to parent */
EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
- emit_return(&prog, prog);
+ }
+ emit_return(&prog, image + (prog - (u8 *)rw_image));
/* Make sure the trampoline generation logic doesn't overflow */
- if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) {
+ if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
ret = -EFAULT;
goto cleanup;
}
- ret = prog - (u8 *)image;
+ ret = prog - (u8 *)rw_image + BPF_INSN_SAFETY;
cleanup:
kfree(branches);
return ret;
}
+ /*
+  * Allocate @size bytes for a trampoline image via bpf_prog_pack_alloc(),
+  * passing jit_fill_hole as the fill callback. Returns whatever
+  * bpf_prog_pack_alloc() returns.
+  */
+ void *arch_alloc_bpf_trampoline(unsigned int size)
+ {
+ return bpf_prog_pack_alloc(size, jit_fill_hole);
+ }
+
+ /*
+  * Release a trampoline image of @size bytes by handing it to
+  * bpf_prog_pack_free().
+  */
+ void arch_free_bpf_trampoline(void *image, unsigned int size)
+ {
+ bpf_prog_pack_free(image, size);
+ }
+
+ /*
+  * Intentionally empty on x86-64.
+  * NOTE(review): presumably no permission change is needed because the
+  * image comes from the prog pack and is only written through
+  * bpf_arch_text_copy() - confirm.
+  */
+ void arch_protect_bpf_trampoline(void *image, unsigned int size)
+ {
+ }
+
+ /*
+  * Intentionally empty on x86-64; counterpart of
+  * arch_protect_bpf_trampoline(), which is also a no-op here.
+  */
+ void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+ {
+ }
+
+ /*
+  * Build a BPF trampoline for the final location [@image, @image_end).
+  *
+  * The code is first generated into a temporary kvmalloc() buffer of the
+  * same size, with addresses computed relative to @image, and is then
+  * copied to @image with bpf_arch_text_copy().
+  *
+  * Returns the (non-negative) value produced by
+  * __arch_prepare_bpf_trampoline() on success, -ENOMEM if the temporary
+  * buffer cannot be allocated, or the negative error from generation or
+  * from the text copy.
+  */
+ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+ 				const struct btf_func_model *m, u32 flags,
+ 				struct bpf_tramp_links *tlinks,
+ 				void *func_addr)
+ {
+ 	u32 len = image_end - image;
+ 	void *scratch, *copied;
+ 	int err;
+
+ 	/* The scratch buffer need not live in module memory range, so
+ 	 * plain kvmalloc() is sufficient.
+ 	 */
+ 	scratch = kvmalloc(len, GFP_KERNEL);
+ 	if (!scratch)
+ 		return -ENOMEM;
+
+ 	err = __arch_prepare_bpf_trampoline(im, scratch, scratch + len, image, m,
+ 					    flags, tlinks, func_addr);
+ 	if (err >= 0) {
+ 		/* Generation succeeded: publish the bytes at the real address */
+ 		copied = bpf_arch_text_copy(image, scratch, len);
+ 		if (IS_ERR(copied))
+ 			err = PTR_ERR(copied);
+ 	}
+
+ 	kvfree(scratch);
+ 	return err;
+ }
+
+ /*
+  * Compute the size of the trampoline that would be generated for the
+  * given model, flags and links by doing a throw-away generation pass into
+  * a one-page temporary buffer.
+  *
+  * Returns the value of __arch_prepare_bpf_trampoline() (size on success,
+  * negative errno on failure), or -ENOMEM if the buffer cannot be
+  * allocated.
+  */
+ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ 			     struct bpf_tramp_links *tlinks, void *func_addr)
+ {
+ 	struct bpf_tramp_image im;
+ 	void *scratch;
+ 	int size;
+
+ 	/* Temporary buffer for __arch_prepare_bpf_trampoline(). No
+ 	 * set_memory_*() is called on it, so this does NOT fragment the
+ 	 * direct map.
+ 	 *
+ 	 * kvmalloc cannot be used here: the buffer doubles as the target
+ 	 * image and therefore has to be in module memory range.
+ 	 */
+ 	scratch = bpf_jit_alloc_exec(PAGE_SIZE);
+ 	if (!scratch)
+ 		return -ENOMEM;
+
+ 	size = __arch_prepare_bpf_trampoline(&im, scratch, scratch + PAGE_SIZE,
+ 					     scratch, m, flags, tlinks, func_addr);
+ 	bpf_jit_free_exec(scratch);
+
+ 	return size;
+ }
+
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
u8 *jg_reloc, *prog = *pprog;
jit_data->header = header;
jit_data->rw_header = rw_header;
}
- prog->bpf_func = (void *)image;
+ /*
+ * ctx.prog_offset is used when CFI preambles put code *before*
+ * the function. See emit_cfi(). For FineIBT specifically this code
+ * can also be executed and bpf_prog_kallsyms_add() will
+ * generate an additional symbol to cover this, hence also
+ * decrement proglen.
+ */
+ prog->bpf_func = (void *)image + cfi_get_offset();
prog->jited = 1;
- prog->jited_len = proglen;
+ prog->jited_len = proglen - cfi_get_offset();
} else {
prog = orig_prog;
}
kvfree(jit_data->addrs);
kfree(jit_data);
}
+ prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
#endif
WARN(1, "verification of programs using bpf_throw should have failed\n");
}
+
+/*
+ * Re-patch one tail-call poke descriptor when its target program changes.
+ *
+ * @poke: descriptor holding the patch sites (tailcall_target,
+ *        tailcall_bypass), the bypass destination (bypass_addr) and the
+ *        offset (adj_off) added to a program's bpf_func.
+ * @new:  program now installed, or NULL when the target is being removed.
+ * @old:  program previously installed, or NULL if there was none.
+ *
+ * With @new set, the jump at tailcall_target is switched from the old
+ * address to the new one; if there was no @old, the jump at
+ * tailcall_bypass (to bypass_addr) is additionally replaced with NULL.
+ * With @new NULL, the bypass jump is patched first, then
+ * synchronize_rcu() is called (only when that poke returned 0), and
+ * finally the target jump is patched to NULL.
+ * Any negative return from __bpf_arch_text_poke() is a BUG.
+ */
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{
+ u8 *old_addr, *new_addr, *old_bypass_addr;
+ int ret;
+
+ old_bypass_addr = old ? NULL : poke->bypass_addr;
+ old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
+ new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
+
+ /*
+ * On program loading or teardown, the program's kallsym entry
+ * might not be in place, so we use __bpf_arch_text_poke to skip
+ * the kallsyms check.
+ */
+ if (new) {
+ ret = __bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, new_addr);
+ BUG_ON(ret < 0);
+ if (!old) {
+ ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ poke->bypass_addr,
+ NULL);
+ BUG_ON(ret < 0);
+ }
+ } else {
+ ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ old_bypass_addr,
+ poke->bypass_addr);
+ BUG_ON(ret < 0);
+ /* let other CPUs finish the execution of the program
+ * so that it will not be possible to expose them
+ * to an invalid nop, stack unwind, nop state
+ */
+ /* NOTE(review): a positive ret presumably means nothing was
+ * changed, so no grace period is needed - confirm against
+ * __bpf_arch_text_poke().
+ */
+ if (!ret)
+ synchronize_rcu();
+ ret = __bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, NULL);
+ BUG_ON(ret < 0);
+ }
+}
/* RSS config */
u16 rss_table_size; /* HW RSS table size */
u16 rss_size; /* Allocated RSS queues */
+ u8 rss_hfunc; /* User configured hash type */
u8 *rss_hkey_user; /* User configured hash keys */
u8 *rss_lut_user; /* User configured lookup table entries */
u8 rss_lut_type; /* used to configure Get/Set RSS LUT AQ call */
struct ice_vsi_stats **vsi_stats;
struct ice_sw *first_sw; /* first switch created by firmware */
u16 eswitch_mode; /* current mode of eswitch */
+ struct dentry *ice_debugfs_pf;
+ struct dentry *ice_debugfs_pf_fwlog;
+ /* keep track of all the dentries for FW log modules */
+ struct dentry **ice_debugfs_pf_fwlog_modules;
struct ice_vfs vfs;
DECLARE_BITMAP(features, ICE_F_MAX);
DECLARE_BITMAP(state, ICE_STATE_NBITS);
#define ICE_MAX_VF_AGG_NODES 32
struct ice_agg_node vf_agg_node[ICE_MAX_VF_AGG_NODES];
struct ice_dplls dplls;
+ struct device *hwmon_dev;
};
extern struct workqueue_struct *ice_lag_wq;
return false;
}
+void ice_debugfs_fwlog_init(struct ice_pf *pf);
+void ice_debugfs_init(void);
+void ice_debugfs_exit(void);
+void ice_pf_fwlog_update_module(struct ice_pf *pf, int log_level, int module);
+
bool netif_is_ice(const struct net_device *dev);
int ice_vsi_setup_tx_rings(struct ice_vsi *vsi);
int ice_vsi_setup_rx_rings(struct ice_vsi *vsi);
int ice_get_rss_lut(struct ice_vsi *vsi, u8 *lut, u16 lut_size);
int ice_set_rss_key(struct ice_vsi *vsi, u8 *seed);
int ice_get_rss_key(struct ice_vsi *vsi, u8 *seed);
+int ice_set_rss_hfunc(struct ice_vsi *vsi, u8 hfunc);
void ice_fill_rss_lut(u8 *lut, u16 rss_table_size, u16 rss_size);
int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset);
void ice_print_link_msg(struct ice_vsi *vsi, bool isup);
set_bit(ICE_FLAG_UNPLUG_AUX_DEV, pf->flags);
clear_bit(ICE_FLAG_RDMA_ENA, pf->flags);
}
+
+ extern const struct xdp_metadata_ops ice_xdp_md_ops;
#endif /* _ICE_H_ */
}
q_vector = vsi->q_vectors[v_idx];
- ice_for_each_tx_ring(tx_ring, q_vector->tx)
+ ice_for_each_tx_ring(tx_ring, q_vector->tx) {
+ if (vsi->netdev)
+ netif_queue_set_napi(vsi->netdev, tx_ring->q_index,
+ NETDEV_QUEUE_TYPE_TX, NULL);
tx_ring->q_vector = NULL;
- ice_for_each_rx_ring(rx_ring, q_vector->rx)
+ }
+ ice_for_each_rx_ring(rx_ring, q_vector->rx) {
+ if (vsi->netdev)
+ netif_queue_set_napi(vsi->netdev, rx_ring->q_index,
+ NETDEV_QUEUE_TYPE_RX, NULL);
rx_ring->q_vector = NULL;
+ }
/* only VSI with an associated netdev is set up with NAPI */
if (vsi->netdev)
return 0;
}
+ /*
+  * Ask the XSK pool attached to @ring to fill the private area of its
+  * buffers with a pointer to the ring's packet context.
+  *
+  * The descriptor copies sizeof(void *) bytes from &ctx_ptr (i.e. the
+  * address of ring->pkt_ctx) to the offset of ice_xdp_buff::pkt_ctx,
+  * expressed relative to the end of the embedded struct xdp_buff.
+  */
+ static void ice_xsk_pool_fill_cb(struct ice_rx_ring *ring)
+ {
+ 	void *ctx_ptr = &ring->pkt_ctx;
+ 	struct xsk_cb_desc desc = {
+ 		.src = &ctx_ptr,
+ 		.off = offsetof(struct ice_xdp_buff, pkt_ctx) -
+ 		       sizeof(struct xdp_buff),
+ 		.bytes = sizeof(ctx_ptr),
+ 	};
+
+ 	XSK_CHECK_PRIV_TYPE(struct ice_xdp_buff);
+ 	xsk_pool_fill_cb(ring->xsk_pool, &desc);
+ }
+
/**
* ice_vsi_cfg_rxq - Configure an Rx queue
* @ring: the ring being configured
if (err)
return err;
xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq);
+ ice_xsk_pool_fill_cb(ring);
dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
ring->q_index);
xdp_init_buff(&ring->xdp, ice_rx_pg_size(ring) / 2, &ring->xdp_rxq);
ring->xdp.data = NULL;
+ ring->xdp_ext.pkt_ctx = &ring->pkt_ctx;
err = ice_setup_rx_ctx(ring);
if (err) {
dev_err(dev, "ice_setup_rx_ctx failed for RxQ %d, err %d\n",
#include "ice_dcb_lib.h"
#include "ice_dcb_nl.h"
#include "ice_devlink.h"
+#include "ice_hwmon.h"
/* Including ice_trace.h with CREATE_TRACE_POINTS defined will generate the
* ice tracepoint functions. This must be done exactly once across the
* ice driver.
return status;
}
+/**
+ * ice_get_fwlog_data - copy the FW log data from ARQ event
+ * @pf: PF that the FW log event is associated with
+ * @event: event structure containing FW log data
+ *
+ * Copies the event payload into the ring slot at the current tail, then
+ * advances the tail. If that makes the ring full, the head is advanced as
+ * well so the oldest entry is dropped.
+ *
+ * The length reported in the descriptor comes from firmware; it is clamped
+ * to PAGE_SIZE, the size this function assumes for each slot's data buffer,
+ * so an oversized length cannot overrun the slot.
+ */
+static void
+ice_get_fwlog_data(struct ice_pf *pf, struct ice_rq_event_info *event)
+{
+	struct ice_fwlog_data *fwlog;
+	struct ice_hw *hw = &pf->hw;
+	size_t len;
+
+	fwlog = &hw->fwlog_ring.rings[hw->fwlog_ring.tail];
+
+	/* never trust the firmware-supplied length beyond the slot size */
+	len = le16_to_cpu(event->desc.datalen);
+	if (len > PAGE_SIZE)
+		len = PAGE_SIZE;
+
+	memset(fwlog->data, 0, PAGE_SIZE);
+	fwlog->data_size = len;
+
+	memcpy(fwlog->data, event->msg_buf, len);
+	ice_fwlog_ring_increment(&hw->fwlog_ring.tail, hw->fwlog_ring.size);
+
+	if (ice_fwlog_ring_full(&hw->fwlog_ring)) {
+		/* the rings are full so bump the head to create room */
+		ice_fwlog_ring_increment(&hw->fwlog_ring.head,
+					 hw->fwlog_ring.size);
+	}
+}
+
/**
* ice_aq_prep_for_event - Prepare to wait for an AdminQ event from firmware
* @pf: pointer to the PF private structure
ice_vc_process_vf_msg(pf, &event, &data);
break;
- case ice_aqc_opc_fw_logging:
- ice_output_fw_log(hw, &event.desc, event.msg_buf);
+ case ice_aqc_opc_fw_logs_event:
+ ice_get_fwlog_data(pf, &event);
break;
case ice_aqc_opc_lldp_set_mib_change:
ice_dcb_process_lldp_set_mib_change(pf, &event);
if (oicr & PFINT_OICR_TSYN_TX_M) {
ena_mask &= ~PFINT_OICR_TSYN_TX_M;
- if (!hw->reset_ongoing && ice_ptp_pf_handles_tx_interrupt(pf))
+ if (ice_ptp_pf_handles_tx_interrupt(pf))
set_bit(ICE_MISC_THREAD_TX_TSTAMP, pf->misc_thread);
}
if (!vsi->netdev)
return;
- ice_for_each_q_vector(vsi, v_idx)
+ ice_for_each_q_vector(vsi, v_idx) {
netif_napi_add(vsi->netdev, &vsi->q_vectors[v_idx]->napi,
ice_napi_poll);
+ ice_q_vector_set_napi_queues(vsi->q_vectors[v_idx], false);
+ }
}
/**
netdev->netdev_ops = &ice_netdev_ops;
netdev->udp_tunnel_nic_info = &pf->hw.udp_tunnel_nic;
+ netdev->xdp_metadata_ops = &ice_xdp_md_ops;
ice_set_ethtool_ops(netdev);
if (vsi->type != ICE_VSI_PF)
dev_info(ice_pf_to_dev(pf), "Wake reason: %s", wake_str);
}
+/**
+ * ice_pf_fwlog_update_module - update the log level of a single FW log module
+ * @pf: pointer to the PF struct
+ * @log_level: log level to store for @module
+ * @module: index of the module entry to update
+ *
+ * Only the cached configuration in the HW struct is written here.
+ * NOTE(review): @module is used as an array index without a range check;
+ * callers are presumably expected to pass a valid module ID.
+ */
+void ice_pf_fwlog_update_module(struct ice_pf *pf, int log_level, int module)
+{
+	pf->hw.fwlog_cfg.module_entries[module].log_level = log_level;
+}
+
/**
* ice_register_netdev - register netdev
* @vsi: pointer to the VSI struct
if (ice_init_lag(pf))
dev_warn(dev, "Failed to init link aggregation support\n");
+
+ ice_hwmon_init(pf);
}
static void ice_deinit_features(struct ice_pf *pf)
msleep(100);
}
+ ice_debugfs_exit();
+
if (test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) {
set_bit(ICE_VF_RESETS_DISABLED, pf->state);
ice_free_vfs(pf);
}
+ ice_hwmon_exit(pf);
+
ice_service_task_stop(pf);
ice_aq_cancel_waiting_tasks(pf);
set_bit(ICE_DOWN, pf->state);
goto err_dest_wq;
}
+ ice_debugfs_init();
+
status = pci_register_driver(&ice_driver);
if (status) {
pr_err("failed to register PCI driver, err %d\n", status);
err_dest_lag_wq:
destroy_workqueue(ice_lag_wq);
+ ice_debugfs_exit();
err_dest_wq:
destroy_workqueue(ice_wq);
return status;
return features;
}
+ /**
+  * ice_set_rx_rings_vlan_proto - update rings with new stripped VLAN proto
+  * @vsi: PF's VSI
+  * @vlan_ethertype: VLAN ethertype (802.1Q or 802.1ad) in network byte order
+  *
+  * Record the currently stripped VLAN protocol in the packet context of
+  * every allocated Rx ring, so the packet processing code can read it
+  * directly from the ring.
+  */
+ static void
+ ice_set_rx_rings_vlan_proto(struct ice_vsi *vsi, __be16 vlan_ethertype)
+ {
+ 	u16 q;
+
+ 	ice_for_each_alloc_rxq(vsi, q) {
+ 		vsi->rx_rings[q]->pkt_ctx.vlan_proto = vlan_ethertype;
+ 	}
+ }
+
/**
* ice_set_vlan_offload_features - set VLAN offload features for the PF VSI
* @vsi: PF's VSI
if (strip_err || insert_err)
return -EIO;
+ ice_set_rx_rings_vlan_proto(vsi, enable_stripping ?
+ htons(vlan_ethertype) : 0);
+
return 0;
}
return status;
}
+/**
+ * ice_set_rss_hfunc - Set RSS HASH function
+ * @vsi: Pointer to VSI structure
+ * @hfunc: hash function (ICE_AQ_VSI_Q_OPT_RSS_*)
+ *
+ * Only Toeplitz and symmetric Toeplitz are accepted. If @hfunc already
+ * matches the VSI's current setting nothing is done. Otherwise the queue
+ * option section of the VSI context is updated through ice_update_vsi();
+ * the cached VSI info is refreshed only when that update succeeds, after
+ * which the symmetry setting of the existing RSS configurations is
+ * adjusted to match.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for an unsupported hash function,
+ * -ENOMEM on allocation failure, or the error returned by the VSI update
+ * or by ice_set_rss_cfg_symm().
+ */
+int ice_set_rss_hfunc(struct ice_vsi *vsi, u8 hfunc)
+{
+	struct ice_hw *hw = &vsi->back->hw;
+	struct ice_vsi_ctx *ctx;
+	bool symm;
+	int err;
+
+	if (hfunc == vsi->rss_hfunc)
+		return 0;
+
+	if (hfunc != ICE_AQ_VSI_Q_OPT_RSS_HASH_TPLZ &&
+	    hfunc != ICE_AQ_VSI_Q_OPT_RSS_HASH_SYM_TPLZ)
+		return -EOPNOTSUPP;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	/* Only the hash-scheme bits change; every other queue option field
+	 * is carried over from the cached VSI info so the update does not
+	 * disturb it.
+	 */
+	ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_Q_OPT_VALID);
+	ctx->info.q_opt_rss = vsi->info.q_opt_rss;
+	ctx->info.q_opt_rss &= ~ICE_AQ_VSI_Q_OPT_RSS_HASH_M;
+	ctx->info.q_opt_rss |=
+		FIELD_PREP(ICE_AQ_VSI_Q_OPT_RSS_HASH_M, hfunc);
+	ctx->info.q_opt_tc = vsi->info.q_opt_tc;
+	/* preserve the existing flags (was wrongly loaded from q_opt_rss) */
+	ctx->info.q_opt_flags = vsi->info.q_opt_flags;
+
+	err = ice_update_vsi(hw, vsi->idx, ctx, NULL);
+	if (err) {
+		dev_err(ice_pf_to_dev(vsi->back), "Failed to configure RSS hash for VSI %d, error %d\n",
+			vsi->vsi_num, err);
+	} else {
+		vsi->info.q_opt_rss = ctx->info.q_opt_rss;
+		vsi->rss_hfunc = hfunc;
+		netdev_info(vsi->netdev, "Hash function set to: %sToeplitz\n",
+			    hfunc == ICE_AQ_VSI_Q_OPT_RSS_HASH_SYM_TPLZ ?
+			    "Symmetric " : "");
+	}
+	kfree(ctx);
+	if (err)
+		return err;
+
+	/* Fix the symmetry setting for all existing RSS configurations */
+	symm = !!(hfunc == ICE_AQ_VSI_Q_OPT_RSS_HASH_SYM_TPLZ);
+	return ice_set_rss_cfg_symm(hw, vsi, symm);
+}
+
/**
* ice_bridge_getlink - Get the hardware bridge mode
* @skb: skb buff
for (tun = 0; tun < ICE_FD_HW_SEG_MAX; tun++) {
enum ice_flow_priority prio;
- u64 prof_id;
/* add this VSI to FDir profile for this flow */
prio = ICE_FLOW_PRIO_NORMAL;
prof = hw->fdir_prof[flow];
- prof_id = flow + tun * ICE_FLTR_PTYPE_MAX;
- status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id,
+ status = ice_flow_add_entry(hw, ICE_BLK_FD,
+ prof->prof_id[tun],
prof->vsi_h[0], vsi->idx,
prio, prof->fdir_seg[tun],
&entry_h);
#define E810_OUT_PROP_DELAY_NS 1
-#define UNKNOWN_INCVAL_E822 0x100000000ULL
+#define UNKNOWN_INCVAL_E82X 0x100000000ULL
static const struct ptp_pin_desc ice_pin_desc_e810t[] = {
/* name idx func chan */
/* Read the Tx ready status first */
err = ice_get_phy_tx_tstamp_ready(&pf->hw, i, &tstamp_ready);
- if (err || tstamp_ready)
+ if (err)
+ break;
+ else if (tstamp_ready)
return ICE_TX_TSTAMP_WORK_PENDING;
}
}
/**
- * ice_ptp_init_tx_e822 - Initialize tracking for Tx timestamps
+ * ice_ptp_init_tx_e82x - Initialize tracking for Tx timestamps
* @pf: Board private structure
* @tx: the Tx tracking structure to initialize
* @port: the port this structure tracks
* registers into chunks based on the port number.
*/
static int
-ice_ptp_init_tx_e822(struct ice_pf *pf, struct ice_ptp_tx *tx, u8 port)
+ice_ptp_init_tx_e82x(struct ice_pf *pf, struct ice_ptp_tx *tx, u8 port)
{
tx->block = port / ICE_PORTS_PER_QUAD;
- tx->offset = (port % ICE_PORTS_PER_QUAD) * INDEX_PER_PORT_E822;
- tx->len = INDEX_PER_PORT_E822;
+ tx->offset = (port % ICE_PORTS_PER_QUAD) * INDEX_PER_PORT_E82X;
+ tx->len = INDEX_PER_PORT_E82X;
tx->verify_cached = 0;
return ice_ptp_alloc_tx_tracker(tx);
if (ice_is_e810(hw))
incval = ICE_PTP_NOMINAL_INCVAL_E810;
- else if (ice_e822_time_ref(hw) < NUM_ICE_TIME_REF_FREQ)
- incval = ice_e822_nominal_incval(ice_e822_time_ref(hw));
+ else if (ice_e82x_time_ref(hw) < NUM_ICE_TIME_REF_FREQ)
+ incval = ice_e82x_nominal_incval(ice_e82x_time_ref(hw));
else
- incval = UNKNOWN_INCVAL_E822;
+ incval = UNKNOWN_INCVAL_E82X;
dev_dbg(ice_pf_to_dev(pf), "PTP: using base increment value of 0x%016llx\n",
incval);
/* need to read FIFO state */
if (offs == 0 || offs == 1)
- err = ice_read_quad_reg_e822(hw, quad, Q_REG_FIFO01_STATUS,
+ err = ice_read_quad_reg_e82x(hw, quad, Q_REG_FIFO01_STATUS,
&val);
else
- err = ice_read_quad_reg_e822(hw, quad, Q_REG_FIFO23_STATUS,
+ err = ice_read_quad_reg_e82x(hw, quad, Q_REG_FIFO23_STATUS,
&val);
if (err) {
dev_dbg(ice_pf_to_dev(pf),
"Port %d Tx FIFO still not empty; resetting quad %d\n",
port->port_num, quad);
- ice_ptp_reset_ts_memory_quad_e822(hw, quad);
+ ice_ptp_reset_ts_memory_quad_e82x(hw, quad);
port->tx_fifo_busy_cnt = FIFO_OK;
return 0;
}
tx_err = ice_ptp_check_tx_fifo(port);
if (!tx_err)
- tx_err = ice_phy_cfg_tx_offset_e822(hw, port->port_num);
- rx_err = ice_phy_cfg_rx_offset_e822(hw, port->port_num);
+ tx_err = ice_phy_cfg_tx_offset_e82x(hw, port->port_num);
+ rx_err = ice_phy_cfg_rx_offset_e82x(hw, port->port_num);
if (tx_err || rx_err) {
/* Tx and/or Rx offset not yet configured, try again later */
kthread_queue_delayed_work(pf->ptp.kworker,
kthread_cancel_delayed_work_sync(&ptp_port->ov_work);
- err = ice_stop_phy_timer_e822(hw, port, true);
+ err = ice_stop_phy_timer_e82x(hw, port, true);
if (err)
dev_err(ice_pf_to_dev(pf), "PTP failed to set PHY port %d down, err %d\n",
port, err);
ptp_port->tx_fifo_busy_cnt = 0;
/* Start the PHY timer in Vernier mode */
- err = ice_start_phy_timer_e822(hw, port);
+ err = ice_start_phy_timer_e82x(hw, port);
if (err)
goto out_unlock;
case ICE_PHY_E810:
/* Do not reconfigure E810 PHY */
return;
- case ICE_PHY_E822:
+ case ICE_PHY_E82X:
ice_ptp_port_phy_restart(ptp_port);
return;
default:
ice_ptp_reset_ts_memory(hw);
for (quad = 0; quad < ICE_MAX_QUAD; quad++) {
- err = ice_read_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG,
+ err = ice_read_quad_reg_e82x(hw, quad, Q_REG_TX_MEM_GBL_CFG,
&val);
if (err)
break;
val &= ~Q_REG_TX_MEM_GBL_CFG_INTR_ENA_M;
}
- err = ice_write_quad_reg_e822(hw, quad, Q_REG_TX_MEM_GBL_CFG,
+ err = ice_write_quad_reg_e82x(hw, quad, Q_REG_TX_MEM_GBL_CFG,
val);
if (err)
break;
if (ice_is_e810(hw))
start_time -= E810_OUT_PROP_DELAY_NS;
else
- start_time -= ice_e822_pps_delay(ice_e822_time_ref(hw));
+ start_time -= ice_e82x_pps_delay(ice_e82x_time_ref(hw));
/* 2. Write TARGET time */
wr32(hw, GLTSYN_TGT_L(chan, tmr_idx), lower_32_bits(start_time));
ice_ptp_enable_all_clkout(pf);
/* Recalibrate and re-enable timestamp blocks for E822/E823 */
- if (hw->phy_model == ICE_PHY_E822)
+ if (hw->phy_model == ICE_PHY_E82X)
ice_ptp_restart_all_phy(pf);
exit:
if (err) {
}
/**
- * ice_ptp_rx_hwtstamp - Check for an Rx timestamp
- * @rx_ring: Ring to get the VSI info
+ * ice_ptp_get_rx_hwts - Get packet Rx timestamp in ns
* @rx_desc: Receive descriptor
- * @skb: Particular skb to send timestamp with
+ * @pkt_ctx: Packet context to get the cached time
*
* The driver receives a notification in the receive descriptor with timestamp.
- * The timestamp is in ns, so we must convert the result first.
*/
- void
- ice_ptp_rx_hwtstamp(struct ice_rx_ring *rx_ring,
- union ice_32b_rx_flex_desc *rx_desc, struct sk_buff *skb)
+ u64 ice_ptp_get_rx_hwts(const union ice_32b_rx_flex_desc *rx_desc,
+ const struct ice_pkt_ctx *pkt_ctx)
{
- struct skb_shared_hwtstamps *hwtstamps;
u64 ts_ns, cached_time;
u32 ts_high;
if (!(rx_desc->wb.time_stamp_low & ICE_PTP_TS_VALID))
- return;
+ return 0;
- cached_time = READ_ONCE(rx_ring->cached_phctime);
+ cached_time = READ_ONCE(pkt_ctx->cached_phctime);
/* Do not report a timestamp if we don't have a cached PHC time */
if (!cached_time)
- return;
+ return 0;
/* Use ice_ptp_extend_32b_ts directly, using the ring-specific cached
* PHC value, rather than accessing the PF. This also allows us to
ts_high = le32_to_cpu(rx_desc->wb.flex_ts.ts_high);
ts_ns = ice_ptp_extend_32b_ts(cached_time, ts_high);
- hwtstamps = skb_hwtstamps(skb);
- memset(hwtstamps, 0, sizeof(*hwtstamps));
- hwtstamps->hwtstamp = ns_to_ktime(ts_ns);
+ return ts_ns;
}
/**
}
}
+/**
+ * ice_ptp_maybe_trigger_tx_interrupt - Trigger Tx timestamp interrupt
+ * @pf: Board private structure
+ *
+ * The PHY raises a Tx timestamp interrupt so the driver can collect
+ * timestamp data, and it does not raise another one until everything
+ * currently pending has been read. In rare cases the driver may fail to
+ * drain all outstanding data, which would leave the interrupt stuck.
+ *
+ * To recover from that, this periodic check looks for a quad that still
+ * reports ready timestamps and, if it finds one, raises the interrupt from
+ * software so the data gets processed. Nothing is done on E810 devices or
+ * when this PF does not own the source timer.
+ */
+static void ice_ptp_maybe_trigger_tx_interrupt(struct ice_pf *pf)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_hw *hw = &pf->hw;
+	unsigned int quad;
+
+	if (ice_is_e810(hw) || !ice_pf_src_tmr_owned(pf))
+		return;
+
+	for (quad = 0; quad < ICE_MAX_QUAD; quad++) {
+		u64 tstamp_ready;
+
+		/* a failed read tells us nothing about this quad; move on */
+		if (ice_get_phy_tx_tstamp_ready(&pf->hw, quad, &tstamp_ready))
+			continue;
+		if (!tstamp_ready)
+			continue;
+
+		/* The first quad with pending data is enough: trigger a
+		 * software interrupt to ensure it gets processed, then stop.
+		 */
+		dev_dbg(dev, "PTP periodic task detected waiting timestamps. Triggering Tx timestamp interrupt now.\n");
+
+		wr32(hw, PFINT_OICR, PFINT_OICR_TSYN_TX_M);
+		ice_flush(hw);
+		return;
+	}
+}
+
static void ice_ptp_periodic_work(struct kthread_work *work)
{
struct ice_ptp *ptp = container_of(work, struct ice_ptp, work.work);
err = ice_ptp_update_cached_phctime(pf);
+ ice_ptp_maybe_trigger_tx_interrupt(pf);
+
/* Run twice a second or reschedule if phc update failed */
kthread_queue_delayed_work(ptp->kworker, &ptp->work,
msecs_to_jiffies(err ? 10 : 500));
int err, itr = 1;
u64 time_diff;
- if (test_bit(ICE_PFR_REQ, pf->state))
+ if (test_bit(ICE_PFR_REQ, pf->state) ||
+ !ice_pf_src_tmr_owned(pf))
goto pfr;
- if (!ice_pf_src_tmr_owned(pf))
- goto reset_ts;
-
err = ice_ptp_init_phc(hw);
if (err)
goto err;
goto err;
}
-reset_ts:
- /* Restart the PHY timestamping block */
- ice_ptp_reset_phy_timestamping(pf);
-
pfr:
/* Init Tx structures */
if (ice_is_e810(&pf->hw)) {
} else {
kthread_init_delayed_work(&ptp->port.ov_work,
ice_ptp_wait_for_offsets);
- err = ice_ptp_init_tx_e822(pf, &ptp->port.tx,
+ err = ice_ptp_init_tx_e82x(pf, &ptp->port.tx,
ptp->port.port_num);
}
if (err)
set_bit(ICE_FLAG_PTP, pf->flags);
+ /* Restart the PHY timestamping block */
+ if (!test_bit(ICE_PFR_REQ, pf->state) &&
+ ice_pf_src_tmr_owned(pf))
+ ice_ptp_restart_all_phy(pf);
+
/* Start periodic work going */
kthread_queue_delayed_work(ptp->kworker, &ptp->work, 0);
switch (hw->phy_model) {
case ICE_PHY_E810:
return ice_ptp_init_tx_e810(pf, &ptp_port->tx);
- case ICE_PHY_E822:
+ case ICE_PHY_E82X:
kthread_init_delayed_work(&ptp_port->ov_work,
ice_ptp_wait_for_offsets);
- return ice_ptp_init_tx_e822(pf, &ptp_port->tx,
+ return ice_ptp_init_tx_e82x(pf, &ptp_port->tx,
ptp_port->port_num);
default:
return -ENODEV;
static void ice_ptp_init_tx_interrupt_mode(struct ice_pf *pf)
{
switch (pf->hw.phy_model) {
- case ICE_PHY_E822:
+ case ICE_PHY_E82X:
/* E822 based PHY has the clock owner process the interrupt
* for all ports.
*/
/* Quad and port information for initializing timestamp blocks */
#define INDEX_PER_QUAD 64
-#define INDEX_PER_PORT_E822 16
+#define INDEX_PER_PORT_E82X 16
#define INDEX_PER_PORT_E810 64
/**
s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb);
enum ice_tx_tstamp_work ice_ptp_process_ts(struct ice_pf *pf);
- void
- ice_ptp_rx_hwtstamp(struct ice_rx_ring *rx_ring,
- union ice_32b_rx_flex_desc *rx_desc, struct sk_buff *skb);
+ u64 ice_ptp_get_rx_hwts(const union ice_32b_rx_flex_desc *rx_desc,
+ const struct ice_pkt_ctx *pkt_ctx);
void ice_ptp_reset(struct ice_pf *pf);
void ice_ptp_prepare_for_reset(struct ice_pf *pf);
void ice_ptp_init(struct ice_pf *pf);
{
return true;
}
- static inline void
- ice_ptp_rx_hwtstamp(struct ice_rx_ring *rx_ring,
- union ice_32b_rx_flex_desc *rx_desc, struct sk_buff *skb) { }
+
+ static inline u64
+ ice_ptp_get_rx_hwts(const union ice_32b_rx_flex_desc *rx_desc,
+ const struct ice_pkt_ctx *pkt_ctx)
+ {
+ return 0;
+ }
+
static inline void ice_ptp_reset(struct ice_pf *pf) { }
static inline void ice_ptp_prepare_for_reset(struct ice_pf *pf) { }
static inline void ice_ptp_init(struct ice_pf *pf) { }
skb_add_rx_frag(nskb, i, page, page_offset, size,
truesize);
- if (skb_copy_bits(skb, off, page_address(page),
+ if (skb_copy_bits(skb, off,
+ page_address(page) + page_offset,
size)) {
consume_skb(nskb);
goto drop;
return 0;
}
+ static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
+ u16 *vlan_tci)
+ { /* Report the VLAN tag of the skb referenced by the veth XDP context:
+ * @vlan_tci is handed to __vlan_hwaccel_get_tag() and, when that call
+ * returns 0, *@vlan_proto is set to skb->vlan_proto. Returns -ENODATA
+ * if the context carries no skb, otherwise the helper's return value.
+ */
+ const struct veth_xdp_buff *_ctx = (void *)ctx; /* ctx is treated as a veth_xdp_buff */
+ const struct sk_buff *skb = _ctx->skb;
+ int err;
+
+ if (!skb)
+ return -ENODATA;
+
+ err = __vlan_hwaccel_get_tag(skb, vlan_tci);
+ if (err)
+ return err;
+
+ *vlan_proto = skb->vlan_proto;
+ return err; /* err is 0 on this path */
+ }
+
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
.xmo_rx_timestamp = veth_xdp_rx_timestamp,
.xmo_rx_hash = veth_xdp_rx_hash,
+ .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
#include <linux/rcupdate_trace.h>
#include <linux/static_call.h>
#include <linux/memcontrol.h>
+ #include <linux/cfi.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
struct bpf_func_state;
struct ftrace_ops;
struct cgroup;
+ struct bpf_token;
+ struct user_namespace;
+ struct super_block;
+ struct inode;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
/* funcs called by prog_array and perf_event_array map */
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
int fd);
- void (*map_fd_put_ptr)(void *ptr);
+ /* If need_defer is true, the implementation should guarantee that
+ * the to-be-put element is still alive before the bpf program, which
+ * may manipulate it, exits.
+ */
+ void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer);
int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
u32 (*map_fd_sys_lookup_elem)(void *ptr);
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
*/
atomic64_t refcnt ____cacheline_aligned;
atomic64_t usercnt;
- struct work_struct work;
+ /* rcu is used before freeing and work is only used during freeing */
+ union {
+ struct work_struct work;
+ struct rcu_head rcu;
+ };
struct mutex freeze_mutex;
atomic64_t writecnt;
/* 'Ownership' of program-containing map is claimed by the first program
} owner;
bool bypass_spec_v1;
bool frozen; /* write-once; write-protected by freeze_mutex */
+ bool free_after_mult_rcu_gp;
+ bool free_after_rcu_gp;
+ atomic64_t sleepable_refcnt;
s64 __percpu *elem_count;
};
*/
#define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7)
+ /*
+ * Indicate the trampoline should be suitable to receive indirect calls;
+ * without this indirectly calling the generated code can result in #UD/#CP,
+ * depending on the CFI options.
+ *
+ * Used by bpf_struct_ops.
+ *
+ * Incompatible with FENTRY usage, overloads @func_addr argument.
+ */
+ #define BPF_TRAMP_F_INDIRECT BIT(8)
+
/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
* bytes on x86.
*/
* fexit = a set of program to run after original function
*/
struct bpf_tramp_image;
- int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_links *tlinks,
- void *orig_call);
+ void *func_addr);
+ void *arch_alloc_bpf_trampoline(unsigned int size);
+ void arch_free_bpf_trampoline(void *image, unsigned int size);
+ void arch_protect_bpf_trampoline(void *image, unsigned int size);
+ void arch_unprotect_bpf_trampoline(void *image, unsigned int size);
+ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr);
+
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
struct bpf_tramp_run_ctx *run_ctx);
void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
struct bpf_tramp_image {
void *image;
+ int size;
struct bpf_ksym ksym;
struct percpu_ref pcref;
void *ip_after_call;
#endif
};
- static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
+ #ifndef __bpfcall
+ #define __bpfcall __nocfi
+ #endif
+
+ static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func(
const void *ctx,
const struct bpf_insn *insnsi,
bpf_func_t bpf_func)
#define DEFINE_BPF_DISPATCHER(name) \
__BPF_DISPATCHER_SC(name); \
- noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \
+ noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \
const void *ctx, \
const struct bpf_insn *insnsi, \
bpf_func_t bpf_func) \
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
struct bpf_prog *to);
/* Called only from JIT-enabled code, so there's no need for stubs. */
- void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
+ void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
struct bpf_kfunc_desc_tab *kfunc_tab;
struct bpf_kfunc_btf_tab *kfunc_btf_tab;
u32 size_poke_tab;
+ #ifdef CONFIG_FINEIBT
+ struct bpf_ksym ksym_prefix;
+ #endif
struct bpf_ksym ksym;
const struct bpf_prog_ops *ops;
struct bpf_map **used_maps;
int cgroup_atype; /* enum cgroup_bpf_attach_type */
struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
char name[BPF_OBJ_NAME_LEN];
- unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
+ u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64);
#ifdef CONFIG_SECURITY
void *security;
#endif
+ struct bpf_token *token;
struct bpf_prog_offload *offload;
struct btf *btf;
struct bpf_func_info *func_info;
u32 id;
};
+ struct bpf_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+
+ /* BPF token-related delegation options */
+ u64 delegate_cmds;
+ u64 delegate_maps;
+ u64 delegate_progs;
+ u64 delegate_attachs;
+ };
+
+ struct bpf_token {
+ struct work_struct work;
+ atomic64_t refcnt;
+ struct user_namespace *userns;
+ u64 allowed_cmds;
+ u64 allowed_maps;
+ u64 allowed_progs;
+ u64 allowed_attachs;
+ #ifdef CONFIG_SECURITY
+ void *security;
+ #endif
+ };
+
struct bpf_struct_ops_value;
struct btf_member;
struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
u32 type_id;
u32 value_id;
+ void *cfi_stubs;
};
#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
+ void *stub_func,
void *image, void *image_end);
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
migrate_enable();
}
+ extern const struct super_operations bpf_super_ops;
extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops;
extern const struct file_operations bpf_iter_fops;
extern int sysctl_unprivileged_bpf_disabled;
- static inline bool bpf_allow_ptr_leaks(void)
+ bool bpf_token_capable(const struct bpf_token *token, int cap);
+
+ static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
{
- return perfmon_capable();
+ return bpf_token_capable(token, CAP_PERFMON);
}
- static inline bool bpf_allow_uninit_stack(void)
+ static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
{
- return perfmon_capable();
+ return bpf_token_capable(token, CAP_PERFMON);
}
- static inline bool bpf_bypass_spec_v1(void)
+ static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
{
- return cpu_mitigations_off() || perfmon_capable();
+ return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
}
- static inline bool bpf_bypass_spec_v4(void)
+ static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
{
- return cpu_mitigations_off() || perfmon_capable();
+ return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
}
int bpf_map_new_fd(struct bpf_map *map, int flags);
struct bpf_link *bpf_link_get_from_fd(u32 ufd);
struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
+ void bpf_token_inc(struct bpf_token *token);
+ void bpf_token_put(struct bpf_token *token);
+ int bpf_token_create(union bpf_attr *attr);
+ struct bpf_token *bpf_token_get_from_fd(u32 ufd);
+
+ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+ bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
+ bool bpf_token_allow_prog_type(const struct bpf_token *token,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type);
+
int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
+ struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
+ umode_t mode);
#define BPF_ITER_FUNC_PREFIX "bpf_iter_"
#define DEFINE_BPF_ITER_FUNC(target, args...) \
int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
struct bpf_reg_state *regs);
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *reg, bool is_ex_cb);
+ struct bpf_reg_state *reg, u32 *nargs);
int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
struct btf *btf, const struct btf_type *t);
const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
struct bpf_prog *bpf_prog_by_id(u32 id);
struct bpf_link *bpf_link_by_id(u32 id);
- const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+ const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog);
void bpf_task_storage_free(struct task_struct *task);
void bpf_cgrp_storage_free(struct cgroup *cgroup);
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
return -EOPNOTSUPP;
}
+ static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
+ { /* Stub variant: @token is not consulted. True when capable(@cap), or
+ * when @cap is something other than CAP_SYS_ADMIN and the caller is
+ * capable(CAP_SYS_ADMIN).
+ */
+ return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
+ }
+
+ static inline void bpf_token_inc(struct bpf_token *token)
+ {
+ }
+
+ static inline void bpf_token_put(struct bpf_token *token)
+ {
+ }
+
+ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+ {
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
static inline void __dev_flush(void)
{
}
}
static inline const struct bpf_func_proto *
- bpf_base_func_proto(enum bpf_func_id func_id)
+ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
return NULL;
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old);
+
void *bpf_arch_text_copy(void *dst, void *src, size_t len);
int bpf_arch_text_invalidate(void *dst, size_t len);
refcount_t users;
#ifdef CONFIG_SKB_EXTENSIONS
- /* only useable after checking ->active_extensions != 0 */
+ /* only usable after checking ->active_extensions != 0 */
struct skb_ext *extensions;
#endif
};
unsigned int order)
{
/* This piece of code contains several assumptions.
- * 1. This is for device Rx, therefor a cold page is preferred.
+ * 1. This is for device Rx, therefore a cold page is preferred.
* 2. The expectation is the user wants a compound page.
* 3. If requesting a order 0 page it will not be compound
* due to the check to see if order has a value in prep_new_page
{
const void *a = skb_metadata_end(skb_a);
const void *b = skb_metadata_end(skb_b);
- /* Using more efficient variant than plain call to memcmp(). */
- #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
u64 diffs = 0;
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+ BITS_PER_LONG != 64)
+ goto slow;
+
+ /* Using more efficient variant than plain call to memcmp(). */
switch (meta_len) {
#define __it(x, op) (x -= sizeof(u##op))
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
fallthrough;
case 4: diffs |= __it_diff(a, b, 32);
break;
+ default:
+ slow:
+ return memcmp(a - meta_len, b - meta_len, meta_len);
}
return diffs;
- #else
- return memcmp(a - meta_len, b - meta_len, meta_len);
- #endif
}
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
* Returns zero on success. On error, -1 is returned and *errno*
* is set appropriately.
*
+ * BPF_TOKEN_CREATE
+ * Description
+ * Create BPF token with embedded information about what
+ * BPF-related functionality it allows:
+ * - a set of allowed bpf() syscall commands;
+ * - a set of allowed BPF map types to be created with
+ * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ * - a set of allowed BPF program types and BPF program attach
+ * types to be loaded with BPF_PROG_LOAD command, if
+ * BPF_PROG_LOAD itself is allowed.
+ *
+ * BPF token is created (derived) from an instance of BPF FS,
+ * assuming it has necessary delegation mount options specified.
+ * This BPF token can be passed as an extra parameter to various
+ * bpf() syscall commands to grant BPF subsystem functionality to
+ * unprivileged processes.
+ *
+ * When created, BPF token is "associated" with the owning
+ * user namespace of BPF FS instance (super block) that it was
+ * derived from, and subsequent BPF operations performed with
+ * BPF token would be performing capabilities checks (i.e.,
+ * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ * that user namespace. Without BPF token, such capabilities
+ * have to be granted in init user namespace, making bpf()
+ * syscall incompatible with user namespace, for the most part.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
* NOTES
* eBPF objects (maps and programs) can be shared between processes.
*
BPF_ITER_CREATE,
BPF_LINK_DETACH,
BPF_PROG_BIND_MAP,
+ BPF_TOKEN_CREATE,
+ __MAX_BPF_CMD,
};
enum bpf_map_type {
BPF_MAP_TYPE_BLOOM_FILTER,
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
+ __MAX_BPF_MAP_TYPE
};
/* Note that tracing related programs such as
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_NETFILTER,
+ __MAX_BPF_PROG_TYPE
};
enum bpf_attach_type {
BPF_LINK_TYPE_TCX = 11,
BPF_LINK_TYPE_UPROBE_MULTI = 12,
BPF_LINK_TYPE_NETKIT = 13,
- MAX_BPF_LINK_TYPE,
+ __MAX_BPF_LINK_TYPE,
};
+ #define MAX_BPF_LINK_TYPE __MAX_BPF_LINK_TYPE
+
enum bpf_perf_event_type {
BPF_PERF_EVENT_UNSPEC = 0,
BPF_PERF_EVENT_UPROBE = 1,
* to using 5 hash functions).
*/
__u64 map_extra;
+ __u32 map_token_fd;
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
* truncated), or smaller (if log buffer wasn't filled completely).
*/
__u32 log_true_size;
+ __u32 prog_token_fd;
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
* truncated), or smaller (if log buffer wasn't filled completely).
*/
__u32 btf_log_true_size;
+ __u32 btf_token_fd;
};
struct {
__u32 flags; /* extra flags */
} prog_bind_map;
+ struct { /* struct used by BPF_TOKEN_CREATE command */
+ __u32 flags;
+ __u32 bpffs_fd;
+ } token_create;
+
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
BPF_TCP_LISTEN,
BPF_TCP_CLOSING, /* Now a valid state */
BPF_TCP_NEW_SYN_RECV,
+ BPF_TCP_BOUND_INACTIVE,
BPF_TCP_MAX_STATES /* Leave at the end! */
};
* timestamp via bpf_xdp_metadata_rx_timestamp().
* @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet
* hash via bpf_xdp_metadata_rx_hash().
+ * @NETDEV_XDP_RX_METADATA_VLAN_TAG: Device is capable of exposing receive
+ * packet VLAN tag via bpf_xdp_metadata_rx_vlan_tag().
*/
enum netdev_xdp_rx_metadata {
NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
NETDEV_XDP_RX_METADATA_HASH = 2,
+ NETDEV_XDP_RX_METADATA_VLAN_TAG = 4,
};
/**
NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
};
+enum netdev_queue_type {
+ NETDEV_QUEUE_TYPE_RX,
+ NETDEV_QUEUE_TYPE_TX,
+};
+
enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
NETDEV_A_PAGE_POOL_STATS_MAX = (__NETDEV_A_PAGE_POOL_STATS_MAX - 1)
};
+enum {
+ NETDEV_A_NAPI_IFINDEX = 1,
+ NETDEV_A_NAPI_ID,
+ NETDEV_A_NAPI_IRQ,
+ NETDEV_A_NAPI_PID,
+
+ __NETDEV_A_NAPI_MAX,
+ NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
+};
+
+enum {
+ NETDEV_A_QUEUE_ID = 1,
+ NETDEV_A_QUEUE_IFINDEX,
+ NETDEV_A_QUEUE_TYPE,
+ NETDEV_A_QUEUE_NAPI_ID,
+
+ __NETDEV_A_QUEUE_MAX,
+ NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
+};
+
enum {
NETDEV_CMD_DEV_GET = 1,
NETDEV_CMD_DEV_ADD_NTF,
NETDEV_CMD_PAGE_POOL_DEL_NTF,
NETDEV_CMD_PAGE_POOL_CHANGE_NTF,
NETDEV_CMD_PAGE_POOL_STATS_GET,
+ NETDEV_CMD_QUEUE_GET,
+ NETDEV_CMD_NAPI_GET,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool bypass_spec_v1 = bpf_bypass_spec_v1();
+ bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
u64 array_size, mask64;
struct bpf_array *array;
}
if (old_ptr)
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, true);
return 0;
}
- static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+ static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
}
if (old_ptr) {
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, need_defer);
return 0;
} else {
return -ENOENT;
}
}
+ static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+ {
+ return __fd_array_map_delete_elem(map, key, true);
+ }
+
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
return prog;
}
- static void prog_fd_array_put_ptr(void *ptr)
+ static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_prog is freed after one RCU or tasks trace grace period */
bpf_prog_put(ptr);
}
}
/* decrement refcnt of all bpf_progs that are stored in this map */
- static void bpf_fd_array_map_clear(struct bpf_map *map)
+ static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, need_defer);
}
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
mutex_unlock(&aux->poke_mutex);
}
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{ /* Weak default: ignores all arguments and only emits a one-time warning.
+ * NOTE(review): presumably an architecture is expected to supply the
+ * real implementation, so reaching this indicates a problem - confirm.
+ */
+ WARN_ON_ONCE(1);
+}
+
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
{
- u8 *old_addr, *new_addr, *old_bypass_addr;
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
list_for_each_entry(elem, &aux->poke_progs, list) {
struct bpf_jit_poke_descriptor *poke;
- int i, ret;
+ int i;
for (i = 0; i < elem->aux->size_poke_tab; i++) {
poke = &elem->aux->poke_tab[i];
* activated, so tail call updates can arrive from here
* while JIT is still finishing its final fixup for
* non-activated poke entries.
- * 3) On program teardown, the program's kallsym entry gets
- * removed out of RCU callback, but we can only untrack
- * from sleepable context, therefore bpf_arch_text_poke()
- * might not see that this is in BPF text section and
- * bails out with -EINVAL. As these are unreachable since
- * RCU grace period already passed, we simply skip them.
- * 4) Also programs reaching refcount of zero while patching
+ * 3) Also programs reaching refcount of zero while patching
* is in progress is okay since we're protected under
* poke_mutex and untrack the programs before the JIT
- * buffer is freed. When we're still in the middle of
- * patching and suddenly kallsyms entry of the program
- * gets evicted, we just skip the rest which is fine due
- * to point 3).
- * 5) Any other error happening below from bpf_arch_text_poke()
- * is a unexpected bug.
+ * buffer is freed.
*/
if (!READ_ONCE(poke->tailcall_target_stable))
continue;
poke->tail_call.key != key)
continue;
- old_bypass_addr = old ? NULL : poke->bypass_addr;
- old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
- new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
-
- if (new) {
- ret = bpf_arch_text_poke(poke->tailcall_target,
- BPF_MOD_JUMP,
- old_addr, new_addr);
- BUG_ON(ret < 0 && ret != -EINVAL);
- if (!old) {
- ret = bpf_arch_text_poke(poke->tailcall_bypass,
- BPF_MOD_JUMP,
- poke->bypass_addr,
- NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
- }
- } else {
- ret = bpf_arch_text_poke(poke->tailcall_bypass,
- BPF_MOD_JUMP,
- old_bypass_addr,
- poke->bypass_addr);
- BUG_ON(ret < 0 && ret != -EINVAL);
- /* let other CPUs finish the execution of program
- * so that it will not possible to expose them
- * to invalid nop, stack unwind, nop state
- */
- if (!ret)
- synchronize_rcu();
- ret = bpf_arch_text_poke(poke->tailcall_target,
- BPF_MOD_JUMP,
- old_addr, NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
- }
+ bpf_arch_poke_desc_update(poke, new, old);
}
}
}
{
struct bpf_map *map = container_of(work, struct bpf_array_aux,
work)->map;
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, true);
bpf_map_put(map);
}
{
struct bpf_event_entry *ee;
- ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ ee = kzalloc(sizeof(*ee), GFP_KERNEL);
if (ee) {
ee->event = perf_file->private_data;
ee->perf_file = perf_file;
return ee;
}
- static void perf_event_fd_array_put_ptr(void *ptr)
+ static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_perf_event is freed after one RCU grace period */
bpf_event_entry_free_rcu(ptr);
}
for (i = 0; i < array->map.max_entries; i++) {
ee = READ_ONCE(array->ptrs[i]);
if (ee && ee->map_file == map_file)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, true);
}
rcu_read_unlock();
}
static void perf_event_fd_array_map_free(struct bpf_map *map)
{
if (map->map_flags & BPF_F_PRESERVE_ELEMS)
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
return cgroup_get_from_fd(fd);
}
- static void cgroup_fd_array_put_ptr(void *ptr)
+ static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
/* cgroup_put free cgrp after a rcu grace period */
cgroup_put(ptr);
static void cgroup_fd_array_free(struct bpf_map *map)
{
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
* is protected by fdget/fdput.
*/
bpf_map_meta_free(map->inner_map_meta);
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
#endif
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+ #ifdef CONFIG_FINEIBT
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
+ #endif
mutex_init(&fp->aux->used_maps_mutex);
mutex_init(&fp->aux->dst_mutex);
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
s32 end_new, s32 curr, const bool probe_pass)
{
- const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s64 off_min, off_max, off;
s32 delta = end_new - end_old;
- s32 off;
- if (insn->code == (BPF_JMP32 | BPF_JA))
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
off = insn->imm;
- else
+ off_min = S32_MIN;
+ off_max = S32_MAX;
+ } else {
off = insn->off;
+ off_min = S16_MIN;
+ off_max = S16_MAX;
+ }
if (curr < pos && curr + off + 1 >= end_old)
off += delta;
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !bpf_capable())
+ !bpf_token_capable(fp->aux->token, CAP_BPF))
return;
bpf_prog_ksym_set_addr(fp);
fp->aux->ksym.prog = true;
bpf_ksym_add(&fp->aux->ksym);
+
+ #ifdef CONFIG_FINEIBT
+ /*
+ * With FineIBT, code in the __cfi_foo() symbols can get executed
+ * and hence the unwinder needs help.
+ */
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+
+ snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+ "__cfi_%s", fp->aux->ksym.name);
+
+ fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
+ fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
+
+ bpf_ksym_add(&fp->aux->ksym_prefix);
+ #endif
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
return;
bpf_ksym_del(&fp->aux->ksym);
+ #ifdef CONFIG_FINEIBT
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+ bpf_ksym_del(&fp->aux->ksym_prefix);
+ #endif
}
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
return ptr;
}
- void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+ void bpf_prog_pack_free(void *ptr, u32 size)
{
struct bpf_prog_pack *pack = NULL, *tmp;
unsigned int nbits;
unsigned long pos;
mutex_lock(&pack_mutex);
- if (hdr->size > BPF_PROG_PACK_SIZE) {
- bpf_jit_free_exec(hdr);
+ if (size > BPF_PROG_PACK_SIZE) {
+ bpf_jit_free_exec(ptr);
goto out;
}
list_for_each_entry(tmp, &pack_list, list) {
- if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
+ if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
pack = tmp;
break;
}
if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
goto out;
- nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
- pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+ nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
- WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+ WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
"bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
bitmap_clear(pack->bitmap, pos, nbits);
*rw_header = kvmalloc(size, GFP_KERNEL);
if (!*rw_header) {
- bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, size);
bpf_jit_uncharge_modmem(size);
return NULL;
}
kvfree(rw_header);
if (IS_ERR(ptr)) {
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, ro_header->size);
return PTR_ERR(ptr);
}
return 0;
{
u32 size = ro_header->size;
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, size);
kvfree(rw_header);
bpf_jit_uncharge_modmem(size);
}
struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
+ bool sleepable;
u32 i;
+ sleepable = aux->sleepable;
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
+ if (sleepable)
+ atomic64_dec(&map->sleepable_refcnt);
bpf_map_put(map);
}
}
if (aux->dst_prog)
bpf_prog_put(aux->dst_prog);
+ bpf_token_put(aux->token);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
#include "dev.h"
static const struct bpf_func_proto *
- bpf_sk_base_func_proto(enum bpf_func_id func_id);
+ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
*/
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
+ int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
u32 filter_size = bpf_prog_size(fp->prog->len);
- int optmem_max = READ_ONCE(sysctl_optmem_max);
/* same check as in sock_kmalloc() */
if (filter_size <= optmem_max &&
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct bpf_prog *prog = __get_filter(fprog, sk);
- int err;
+ int err, optmem_max;
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max))
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ if (bpf_prog_size(prog->len) > optmem_max)
err = -ENOMEM;
else
err = reuseport_attach_prog(sk, prog);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
struct bpf_prog *prog;
- int err;
+ int err, optmem_max;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
}
} else {
/* BPF_PROG_TYPE_SOCKET_FILTER */
- if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) {
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ if (bpf_prog_size(prog->len) > optmem_max) {
err = -ENOMEM;
goto err_prog_put;
}
return 0;
}
+static void sk_msg_reset_curr(struct sk_msg *msg)
+{ /* Recompute msg->sg.curr: starting at msg->sg.start, add up element
+ * lengths until the running total reaches msg->sg.size or the index
+ * arrives at msg->sg.end, then store that index in msg->sg.curr and
+ * clear msg->sg.copybreak.
+ */
+ u32 i = msg->sg.start;
+ u32 len = 0;
+
+ do {
+ len += sk_msg_elem(msg, i)->length;
+ sk_msg_iter_var_next(i); /* advanced before the size check, so i ends one step past the element that completed the total */
+ if (len >= msg->sg.size)
+ break;
+ } while (i != msg->sg.end);
+
+ msg->sg.curr = i;
+ msg->sg.copybreak = 0;
+}
+
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.func = bpf_msg_cork_bytes,
.gpl_only = false,
msg->sg.end - shift + NR_MSG_FRAG_IDS :
msg->sg.end - shift;
out:
+ sk_msg_reset_curr(msg);
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
msg->data_end = msg->data + bytes;
return 0;
msg->sg.data[new] = rsge;
}
+ sk_msg_reset_curr(msg);
sk_msg_compute_data_pointers(msg);
return 0;
}
sk_mem_uncharge(msg->sk, len - pop);
msg->sg.size -= (len - pop);
+ sk_msg_reset_curr(msg);
sk_msg_compute_data_pointers(msg);
return 0;
}
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
return NULL;
}
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
#endif
#endif
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
#endif
#endif
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
return &bpf_tcp_sock_proto;
#endif /* CONFIG_INET */
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
return &bpf_get_cgroup_classid_curr_proto;
#endif
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
return &bpf_skc_lookup_tcp_proto;
#endif
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
case BPF_FUNC_skb_load_bytes:
return &bpf_flow_dissector_load_bytes_proto;
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
return false;
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
- if (!bpf_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
return false;
break;
}
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
case bpf_ctx_range(struct __sk_buff, tstamp):
- if (!bpf_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
return false;
break;
default:
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
default:
- return bpf_sk_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
};
static const struct bpf_func_proto *
- bpf_sk_base_func_proto(enum bpf_func_id func_id)
+ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
const struct bpf_func_proto *func;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
- if (!perfmon_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
return NULL;
return func;
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
static const struct nf_defrag_hook *
get_proto_defrag_hook(struct bpf_nf_link *link,
- const struct nf_defrag_hook __rcu *global_hook,
+ const struct nf_defrag_hook __rcu **ptr_global_hook,
const char *mod)
{
const struct nf_defrag_hook *hook;
/* RCU protects us from races against module unloading */
rcu_read_lock();
- hook = rcu_dereference(global_hook);
+ hook = rcu_dereference(*ptr_global_hook);
if (!hook) {
rcu_read_unlock();
err = request_module(mod);
return ERR_PTR(err < 0 ? err : -EINVAL);
rcu_read_lock();
- hook = rcu_dereference(global_hook);
+ hook = rcu_dereference(*ptr_global_hook);
}
if (hook && try_module_get(hook->owner)) {
switch (link->hook_ops.pf) {
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
case NFPROTO_IPV4:
- hook = get_proto_defrag_hook(link, nf_defrag_v4_hook, "nf_defrag_ipv4");
+ hook = get_proto_defrag_hook(link, &nf_defrag_v4_hook, "nf_defrag_ipv4");
if (IS_ERR(hook))
return PTR_ERR(hook);
#endif
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
case NFPROTO_IPV6:
- hook = get_proto_defrag_hook(link, nf_defrag_v6_hook, "nf_defrag_ipv6");
+ hook = get_proto_defrag_hook(link, &nf_defrag_v6_hook, "nf_defrag_ipv6");
if (IS_ERR(hook))
return PTR_ERR(hook);
static const struct bpf_func_proto *
bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog); /* no lookup of its own: every func_id is resolved by the base helper, with prog passed along */
}
const struct bpf_verifier_ops netfilter_verifier_ops = {
* timestamp via bpf_xdp_metadata_rx_timestamp().
* @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet
* hash via bpf_xdp_metadata_rx_hash().
+ * @NETDEV_XDP_RX_METADATA_VLAN_TAG: Device is capable of exposing receive
+ * packet VLAN tag via bpf_xdp_metadata_rx_vlan_tag().
*/
enum netdev_xdp_rx_metadata {
NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
NETDEV_XDP_RX_METADATA_HASH = 2,
+ NETDEV_XDP_RX_METADATA_VLAN_TAG = 4, /* continues the 1, 2, 4 sequence so each value is a distinct single bit */
};
/**
NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
};
+enum netdev_queue_type {
+ NETDEV_QUEUE_TYPE_RX, /* "rx" entry of the spec's queue-type enum */
+ NETDEV_QUEUE_TYPE_TX, /* "tx" entry; per the spec, each queue type has its own ID space */
+};
+
enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
NETDEV_A_PAGE_POOL_STATS_MAX = (__NETDEV_A_PAGE_POOL_STATS_MAX - 1)
};
+enum {
+ NETDEV_A_NAPI_IFINDEX = 1, /* u32: ifindex of the netdevice the NAPI instance belongs to (spec: min 1) */
+ NETDEV_A_NAPI_ID, /* u32: ID of the NAPI instance */
+ NETDEV_A_NAPI_IRQ, /* u32: interrupt vector number associated with the NAPI */
+ NETDEV_A_NAPI_PID, /* u32: PID of the NAPI thread; absent when NAPI is not in threaded mode */
+
+ __NETDEV_A_NAPI_MAX, /* sentinel: one past the last attribute above */
+ NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) /* value of the last real attribute */
+};
+
+enum {
+ NETDEV_A_QUEUE_ID = 1, /* u32: queue index, scoped to an interface and queue type */
+ NETDEV_A_QUEUE_IFINDEX, /* u32: ifindex of the netdevice the queue belongs to (spec: min 1) */
+ NETDEV_A_QUEUE_TYPE, /* u32 carrying a queue-type value (rx or tx) */
+ NETDEV_A_QUEUE_NAPI_ID, /* u32: ID of the NAPI instance which services this queue */
+
+ __NETDEV_A_QUEUE_MAX, /* sentinel: one past the last attribute above */
+ NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) /* value of the last real attribute */
+};
+
enum {
NETDEV_CMD_DEV_GET = 1,
NETDEV_CMD_DEV_ADD_NTF,
NETDEV_CMD_PAGE_POOL_DEL_NTF,
NETDEV_CMD_PAGE_POOL_CHANGE_NTF,
NETDEV_CMD_PAGE_POOL_STATS_GET,
+ NETDEV_CMD_QUEUE_GET,
+ NETDEV_CMD_NAPI_GET,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)