Daniel Borkmann says:
====================
pull-request: bpf-next 2024-05-28
We've added 23 non-merge commits during the last 11 day(s) which contain
a total of 45 files changed, 696 insertions(+), 277 deletions(-).
The main changes are:
1) Rename skb's mono_delivery_time to tstamp_type for extensibility
and add SKB_CLOCK_TAI type support to bpf_skb_set_tstamp(),
from Abhishek Chauhan.
2) Add netfilter CT zone ID and direction to bpf_ct_opts so that arbitrary
CT zones can be used from XDP/tc BPF netfilter CT helper functions,
from Brad Cowie.
3) Several tweaks to the instruction-set.rst IETF doc to address
the Last Call review comments, from Dave Thaler.
4) Small batch of riscv64 BPF JIT optimizations in order to emit more
compressed instructions to the JITed image for better icache efficiency,
from Xiao Wang.
5) Sort bpftool C dump output from BTF, aiming to simplify vmlinux.h
diffing and forcing more natural type definitions ordering,
from Mykyta Yatsenko.
6) Use DEV_STATS_INC() macro in BPF redirect helpers to silence
a syzbot/KCSAN race report for the tx_errors counter,
from Jiang Yunshui.
7) Un-constify bpf_func_info in bpftool to fix compilation with LLVM 17+
which started treating const structs as constants and thus breaking
full BTF program name resolution, from Ivan Babrou.
8) Fix up BPF program numbers in test_sockmap selftest in order to reduce
some of the test-internal array sizes, from Geliang Tang.
9) Small cleanup in Makefile.btf script to use test-ge check for v1.25-only
pahole, from Alan Maguire.
10) Fix bpftool's make dependencies for vmlinux.h in order to avoid needless
rebuilds in some corner cases, from Artem Savkov.
* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (23 commits)
bpf, net: Use DEV_STAT_INC()
bpf, docs: Fix instruction.rst indentation
bpf, docs: Clarify call local offset
bpf, docs: Add table captions
bpf, docs: clarify sign extension of 64-bit use of 32-bit imm
bpf, docs: Use RFC 2119 language for ISA requirements
bpf, docs: Move sentence about returning R0 to abi.rst
bpf: constify member bpf_sysctl_kern:: Table
riscv, bpf: Try RVC for reg move within BPF_CMPXCHG JIT
riscv, bpf: Use STACK_ALIGN macro for size rounding up
riscv, bpf: Optimize zextw insn with Zba extension
selftests/bpf: Handle forwarding of UDP CLOCK_TAI packets
net: Add additional bit to support clockid_t timestamp type
net: Rename mono_delivery_time to tstamp_type for scalabilty
selftests/bpf: Update tests for new ct zone opts for nf_conntrack kfuncs
net: netfilter: Make ct zone opts configurable for bpf ct helpers
selftests/bpf: Fix prog numbers in test_sockmap
bpf: Remove unused variable "prev_state"
bpftool: Un-const bpf_func_info to fix it for llvm 17 and newer
bpf: Fix order of args in call to bpf_map_kvcalloc
...
====================
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEBUG_WX
+ select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_KERNEL_FPU_SUPPORT if 64BIT && FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MMIOWB
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU
select ARCH_SUPPORTS_PER_VMA_LOCK if MMU
select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK
+ select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USES_CFI_TRAPS if CFI_CLANG
- select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP && MMU
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if MMU
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
select BUILDTIME_TABLE_SORT if MMU
- select CLINT_TIMER if !MMU
+ select CLINT_TIMER if RISCV_M_MODE
select CLONE_BACKWARDS
select COMMON_CLK
select CPU_PM if CPU_IDLE || HIBERNATION || SUSPEND
select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION
select HAVE_EBPF_JIT if MMU
- select HAVE_FAST_GUP if MMU
+ select HAVE_GUP_FAST if MMU
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_GCC_PLUGINS
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RETHOOK if !XIP_KERNEL
select HAVE_RSEQ
+ select HAVE_RUST if 64BIT
select HAVE_SAMPLE_FTRACE_DIRECT
select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
select HAVE_STACKPROTECTOR
# set if we run in machine mode, cleared if we run in supervisor mode
config RISCV_M_MODE
- bool
- default !MMU
+ bool "Build a kernel that runs in machine mode"
+ depends on !MMU
+ default y
+ help
+ Select this option if you want to run the kernel in M-mode,
+ without the assistance of any other firmware.
# set if we are running in S-mode and can use SBI calls
config RISCV_SBI
config PAGE_OFFSET
hex
- default 0xC0000000 if 32BIT && MMU
- default 0x80000000 if !MMU
+ default 0x80000000 if !MMU && RISCV_M_MODE
+ default 0x80200000 if !MMU
+ default 0xc0000000 if 32BIT
default 0xff60000000000000 if 64BIT
config KASAN_SHADOW_OFFSET
def_bool $(as-instr, .option arch$(comma) +v$(comma) +zvkb)
depends on AS_HAS_OPTION_ARCH
+ config RISCV_ISA_ZBA
+ bool "Zba extension support for bit manipulation instructions"
+ default y
+ help
+ Add support for enabling optimisations in the kernel when the Zba
+ extension is detected at boot.
+
+ The Zba extension provides instructions to accelerate the generation
+ of addresses that index into arrays of basic data types.
+
+ If you don't know what to do here, say Y.
+
config RISCV_ISA_ZBB
bool "Zbb extension support for bit manipulation instructions"
depends on TOOLCHAIN_HAS_ZBB
- depends on MMU
depends on RISCV_ALTERNATIVE
default y
help
config RISCV_ISA_ZICBOZ
bool "Zicboz extension support for faster zeroing of memory"
- depends on MMU
depends on RISCV_ALTERNATIVE
default y
help
typedef unsigned char *sk_buff_data_t;
#endif
+ enum skb_tstamp_type {
+ SKB_CLOCK_REALTIME,
+ SKB_CLOCK_MONOTONIC,
+ SKB_CLOCK_TAI,
+ __SKB_CLOCK_MAX = SKB_CLOCK_TAI,
+ };
+
/**
* DOC: Basic sk_buff geometry
*
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required
- * @mono_delivery_time: When set, skb->tstamp has the
- * delivery_time in mono clock base (i.e. EDT). Otherwise, the
- * skb->tstamp has the (rcv) timestamp at ingress and
- * delivery_time at egress.
+ * @tstamp_type: When set, skb->tstamp has the
+ * delivery_time clock base of skb->tstamp.
* @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS
* @alloc_cpu: CPU which did the skb allocation.
/* private: */
__u8 __mono_tc_offset[0];
/* public: */
- __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */
+ __u8 tstamp_type:2; /* See skb_tstamp_type */
#ifdef CONFIG_NET_XGRESS
__u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */
__u8 tc_skip_classify:1;
#endif
#define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset)
- /* if you move tc_at_ingress or mono_delivery_time
+ /* if you move tc_at_ingress or tstamp_type
* around, you also must adapt these constants.
*/
#ifdef __BIG_ENDIAN_BITFIELD
- #define SKB_MONO_DELIVERY_TIME_MASK (1 << 7)
- #define TC_AT_INGRESS_MASK (1 << 6)
+ #define SKB_TSTAMP_TYPE_MASK (3 << 6)
+ #define SKB_TSTAMP_TYPE_RSHIFT (6)
+ #define TC_AT_INGRESS_MASK (1 << 5)
#else
- #define SKB_MONO_DELIVERY_TIME_MASK (1 << 0)
- #define TC_AT_INGRESS_MASK (1 << 1)
+ #define SKB_TSTAMP_TYPE_MASK (3)
+ #define TC_AT_INGRESS_MASK (1 << 2)
#endif
#define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset)
*
* %NULL is returned if there is no free memory.
*/
-static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
+static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask,
unsigned int order)
{
/* This piece of code contains several assumptions.
*/
gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
- return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+ return alloc_pages_node_noprof(NUMA_NO_NODE, gfp_mask, order);
}
+#define __dev_alloc_pages(...) alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__))
-static inline struct page *dev_alloc_pages(unsigned int order)
-{
- return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order);
-}
+#define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order)
/**
* __dev_alloc_page - allocate a page for network Rx
*
* %NULL is returned if there is no free memory.
*/
-static inline struct page *__dev_alloc_page(gfp_t gfp_mask)
+static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask)
{
- return __dev_alloc_pages(gfp_mask, 0);
+ return __dev_alloc_pages_noprof(gfp_mask, 0);
}
+#define __dev_alloc_page(...) alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__))
-static inline struct page *dev_alloc_page(void)
-{
- return dev_alloc_pages(0);
-}
+#define dev_alloc_page() dev_alloc_pages(0)
/**
* dev_page_is_reusable - check whether a page can be reused for network Rx
static inline void __net_timestamp(struct sk_buff *skb)
{
skb->tstamp = ktime_get_real();
- skb->mono_delivery_time = 0;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
}
static inline ktime_t net_timedelta(ktime_t t)
}
static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt,
- bool mono)
+ u8 tstamp_type)
{
skb->tstamp = kt;
- skb->mono_delivery_time = kt && mono;
+
+ if (kt)
+ skb->tstamp_type = tstamp_type;
+ else
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
+ }
+
+ static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb,
+ ktime_t kt, clockid_t clockid)
+ {
+ u8 tstamp_type = SKB_CLOCK_REALTIME;
+
+ switch (clockid) {
+ case CLOCK_REALTIME:
+ break;
+ case CLOCK_MONOTONIC:
+ tstamp_type = SKB_CLOCK_MONOTONIC;
+ break;
+ case CLOCK_TAI:
+ tstamp_type = SKB_CLOCK_TAI;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ kt = 0;
+ }
+
+ skb_set_delivery_time(skb, kt, tstamp_type);
}
DECLARE_STATIC_KEY_FALSE(netstamp_needed_key);
*/
static inline void skb_clear_delivery_time(struct sk_buff *skb)
{
- if (skb->mono_delivery_time) {
- skb->mono_delivery_time = 0;
+ if (skb->tstamp_type) {
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
if (static_branch_unlikely(&netstamp_needed_key))
skb->tstamp = ktime_get_real();
else
static inline void skb_clear_tstamp(struct sk_buff *skb)
{
- if (skb->mono_delivery_time)
+ if (skb->tstamp_type)
return;
skb->tstamp = 0;
static inline ktime_t skb_tstamp(const struct sk_buff *skb)
{
- if (skb->mono_delivery_time)
+ if (skb->tstamp_type)
return 0;
return skb->tstamp;
static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond)
{
- if (!skb->mono_delivery_time && skb->tstamp)
+ if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp)
return skb->tstamp;
if (static_branch_unlikely(&netstamp_needed_key) || cond)
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
- (*rcv_wnd) = min_t(u32, space, U16_MAX);
+ (*rcv_wnd) = space;
if (init_rcv_wnd)
*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
tp = tcp_sk(sk);
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
- skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
if (clone_it) {
oskb = skb;
skb_split(skb, buff, len);
- skb_set_delivery_time(buff, skb->tstamp, true);
+ skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC);
tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
tp->tcp_wstamp_ns = tp->tcp_clock_cache;
- skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */
#ifdef CONFIG_SYN_COOKIES
if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
- true);
+ SKB_CLOCK_MONOTONIC);
else
#endif
{
- skb_set_delivery_time(skb, now, true);
+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
}
bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
synack_type, &opts);
- skb_set_delivery_time(skb, now, true);
+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
tcp_add_tx_delay(skb, tp);
return skb;
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true);
+ skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC);
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)