1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
4 #define BPF_NO_KFUNC_PROTOTYPES
7 #include <bpf/bpf_helpers.h>
8 #include <bpf/bpf_endian.h>
11 #include "bpf_compiler.h"
16 #define NSEC_PER_SEC 1000000000L
19 #define ETH_P_IP 0x0800
20 #define ETH_P_IPV6 0x86DD
22 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
25 #define IP_OFFSET 0x1fff
32 #define TCPOPT_WINDOW 3
33 #define TCPOPT_SACK_PERM 4
34 #define TCPOPT_TIMESTAMP 8
37 #define TCPOLEN_WINDOW 3
38 #define TCPOLEN_SACK_PERM 2
39 #define TCPOLEN_TIMESTAMP 10
41 #define TCP_TS_HZ 1000
42 #define TS_OPT_WSCALE_MASK 0xf
43 #define TS_OPT_SACK (1 << 4)
44 #define TS_OPT_ECN (1 << 5)
46 #define TSMASK (((__u32)1 << TSBITS) - 1)
47 #define TCP_MAX_WSCALE 14U
49 #define IPV4_MAXLEN 60
52 #define DEFAULT_MSS4 1460
53 #define DEFAULT_MSS6 1440
54 #define DEFAULT_WSCALE 7
55 #define DEFAULT_TTL 64
56 #define MAX_ALLOWED_PORTS 8
58 #define MAX_PACKET_OFF 0xffff
61 do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
63 #define __get_unaligned_t(type, ptr) ({ \
64 const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
68 #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
/* Configuration/state array map. Usage visible below: key 0 holds packed
 * TCP/IP options (mss/wscale/ttl, see values_get_tcpipopts()) and key 1 is
 * a SYNACK counter (see values_inc_synacks()).
 * NOTE(review): the struct opening and key/value type lines are not visible
 * in this view — confirm against the full file.
 */
71 __uint(type, BPF_MAP_TYPE_ARRAY);
74 __uint(max_entries, 2);
75 } values SEC(".maps");
/* Array of TCP destination ports on which SYN cookies are issued; consumed
 * by check_port_allowed(), where 0 acts as a list terminator.
 * NOTE(review): struct opening and key/value type lines not visible here.
 */
78 __uint(type, BPF_MAP_TYPE_ARRAY);
81 __uint(max_entries, MAX_ALLOWED_PORTS);
82 } allowed_ports SEC(".maps");
84 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
85 * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
/* Local mirror of the kernel's bpf_ct_opts; preserve_access_index makes
 * CO-RE relocate field offsets against the kernel's own definition at load
 * time, so the local layout need not match exactly. Fields used below:
 * .netns_id, .l4proto, .error (field list itself is not visible here).
 */
88 struct bpf_ct_opts___local {
94 } __attribute__((preserve_access_index));
/* Sentinel netns id meaning "current network namespace" for ct lookups. */
96 #define BPF_F_CURRENT_NETNS (-1)
/* Conntrack lookup kfuncs, resolved from the kernel at load time (__ksym).
 * The XDP flavor takes an xdp_md context, the TC flavor an __sk_buff; both
 * return a referenced nf_conn that must be released with bpf_ct_release().
 */
98 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
99 struct bpf_sock_tuple *bpf_tuple,
101 struct bpf_ct_opts___local *opts,
102 __u32 len_opts) __ksym;
104 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
105 struct bpf_sock_tuple *bpf_tuple,
107 struct bpf_ct_opts___local *opts,
108 u32 len_opts) __ksym;
/* Drops the reference acquired by the lookups above. */
110 extern void bpf_ct_release(struct nf_conn *ct) __ksym;
/* Swap two Ethernet MAC addresses in place via a temporary buffer
 * (used when reflecting a packet back out of the same interface).
 */
112 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
116 __builtin_memcpy(tmp, a, ETH_ALEN);
117 __builtin_memcpy(a, b, ETH_ALEN);
118 __builtin_memcpy(b, tmp, ETH_ALEN);
/* Fold a 32-bit one's-complement checksum accumulator down to 16 bits.
 * Two folding rounds are enough to absorb any carry out of the low 16 bits.
 * NOTE(review): the final return (presumably ~csum truncated to 16 bits)
 * is not visible in this view.
 */
121 static __always_inline __u16 csum_fold(__u32 csum)
123 csum = (csum & 0xffff) + (csum >> 16);
124 csum = (csum & 0xffff) + (csum >> 16);
/* Compute the final TCP/UDP checksum over the IPv4 pseudo-header
 * (saddr, daddr, proto, len) plus a partial sum of the payload.
 * The 64-bit accumulator is folded twice to absorb carries before the
 * final 16-bit fold. The endian-specific branch packs proto and len into
 * the accumulator in network byte order.
 */
128 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
129 __u32 len, __u8 proto,
136 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
138 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
139 s += (proto + len) << 8;
141 #error Unknown endian
143 s = (s & 0xffffffff) + (s >> 32);
144 s = (s & 0xffffffff) + (s >> 32);
146 return csum_fold((__u32)s);
/* Compute the final TCP checksum over the IPv6 pseudo-header: both 128-bit
 * addresses (summed word by word), payload length, and next-header proto,
 * combined with a partial payload sum. Uses a 64-bit accumulator so the
 * individual 32-bit additions cannot overflow, then folds twice.
 */
149 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
150 const struct in6_addr *daddr,
151 __u32 len, __u8 proto, __u32 csum)
157 for (i = 0; i < 4; i++)
158 sum += (__u32)saddr->in6_u.u6_addr32[i];
161 for (i = 0; i < 4; i++)
162 sum += (__u32)daddr->in6_u.u6_addr32[i];
164 /* Don't combine additions to avoid 32-bit overflow. */
165 sum += bpf_htonl(len);
166 sum += bpf_htonl(proto);
168 sum = (sum & 0xffffffff) + (sum >> 32);
169 sum = (sum & 0xffffffff) + (sum >> 32);
171 return csum_fold((__u32)sum);
/* Monotonic clock in nanoseconds, mirroring the kernel's tcp_clock_ns(). */
174 static __always_inline __u64 tcp_clock_ns(void)
176 return bpf_ktime_get_ns();
/* Convert nanoseconds to TCP timestamp ticks (TCP_TS_HZ = 1000 -> ms). */
179 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
181 return ns / (NSEC_PER_SEC / TCP_TS_HZ);
/* Current TCP timestamp value in milliseconds (truncated to 32 bits). */
184 static __always_inline __u32 tcp_clock_ms(void)
186 return tcp_ns_to_ts(tcp_clock_ns());
/* Parsing state threaded through tscookie_tcpopt_parse() via bpf_loop():
 * packet bounds, current option offset, and the wscale/SACK/timestamp
 * results. NOTE(review): only .option_timestamp is visible here; the other
 * fields (data, data_end, off, wscale, option_sack, tsecr) are inferred
 * from their uses below — confirm against the full file.
 */
189 struct tcpopt_context {
194 bool option_timestamp;
/* Return a pointer to sz bytes at the current parse offset, or (per the
 * truncated failure paths) presumably NULL when the read would exceed the
 * verifier's MAX_PACKET_OFF limit or run past data_end.
 */
199 static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
201 __u64 off = ctx->off;
204 /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
205 if (off > MAX_PACKET_OFF - sz)
208 data = ctx->data + off;
210 if (data + sz >= ctx->data_end)
/* Parse one TCP option at ctx->off, recording window scale, timestamp
 * (client tsval -> our tsecr) and SACK-permitted into ctx, then advance
 * ctx->off past the option. EOL/NOP are handled before the length byte
 * since they have no length field. Invoked repeatedly from
 * tscookie_tcpopt_parse_batch(); return-value conventions (stop vs.
 * continue) are on lines not visible in this view.
 */
217 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
219 __u8 *opcode, *opsize, *wscale, *tsecr;
220 __u32 off = ctx->off;
222 opcode = next(ctx, 1);
226 if (*opcode == TCPOPT_EOL)
228 if (*opcode == TCPOPT_NOP)
231 opsize = next(ctx, 1);
232 if (!opsize || *opsize < 2)
237 wscale = next(ctx, 1);
/* Clamp the advertised shift to the RFC 7323 maximum of 14. */
240 if (*opsize == TCPOLEN_WINDOW)
241 ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
243 case TCPOPT_TIMESTAMP:
244 tsecr = next(ctx, 4);
247 if (*opsize == TCPOLEN_TIMESTAMP) {
248 ctx->option_timestamp = true;
249 /* Client's tsval becomes our tsecr. */
250 *ctx->tsecr = get_unaligned((__be32 *)tsecr);
253 case TCPOPT_SACK_PERM:
254 if (*opsize == TCPOLEN_SACK_PERM)
255 ctx->option_sack = true;
/* Advance by the option's own length byte for the next iteration. */
259 ctx->off = off + *opsize;
/* bpf_loop() callback: run up to 7 single-option parse steps per
 * invocation (7 * 6 outer iterations covers the 40-byte option space).
 */
264 static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
268 for (i = 0; i < 7; i++)
269 if (tscookie_tcpopt_parse(context))
/* Build the timestamp cookie for the SYNACK: parse the client SYN's TCP
 * options, then encode wscale (low 4 bits), SACK-permitted (bit 4) and ECN
 * (bit 5) into the low bits of our tsval, keeping the tcp_clock_ms() value
 * in the upper bits (~TSMASK). The client's tsval was stored into *tsecr by
 * the parse step. Returns presumably false when the client sent no
 * timestamp option — the early-return line is not visible here.
 */
274 static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
275 __u16 tcp_len, __be32 *tsval,
276 __be32 *tsecr, void *data, void *data_end)
278 struct tcpopt_context loop_ctx = {
280 .data_end = data_end,
282 .wscale = TS_OPT_WSCALE_MASK,
283 .option_timestamp = false,
284 .option_sack = false,
285 /* Note: currently verifier would track .off as unbound scalar.
286 * In case if verifier would at some point get smarter and
287 * compute bounded value for this var, beware that it might
288 * hinder bpf_loop() convergence validation.
290 .off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
294 bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
296 if (!loop_ctx.option_timestamp)
299 cookie = tcp_clock_ms() & ~TSMASK;
300 cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
301 if (loop_ctx.option_sack)
302 cookie |= TS_OPT_SACK;
303 if (tcp_header->ece && tcp_header->cwr)
304 cookie |= TS_OPT_ECN;
305 *tsval = bpf_htonl(cookie);
/* Read the configured mss/wscale/ttl from the values map (key visible as a
 * lookup below; presumably key 0). Layout of the packed __u64: bits 0-15
 * IPv4 MSS, 16-19 wscale, 24-31 TTL, 32-47 IPv6 MSS. Falls back to the
 * DEFAULT_* constants when the map entry is absent or zero.
 */
310 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
311 __u8 *ttl, bool ipv6)
316 value = bpf_map_lookup_elem(&values, &key);
317 if (value && *value != 0) {
319 *mss = (*value >> 32) & 0xffff;
321 *mss = *value & 0xffff;
322 *wscale = (*value >> 16) & 0xf;
323 *ttl = (*value >> 24) & 0xff;
327 *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
328 *wscale = DEFAULT_WSCALE;
/* Atomically bump the generated-SYNACK counter in the values map
 * (key set on a line not visible here; presumably key 1).
 */
332 static __always_inline void values_inc_synacks(void)
337 value = bpf_map_lookup_elem(&values, &key);
339 __sync_fetch_and_add(value, 1);
/* Return whether the given host-order TCP port appears in the
 * allowed_ports map, scanning until the 0 terminator.
 */
342 static __always_inline bool check_port_allowed(__u16 port)
346 for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
350 value = bpf_map_lookup_elem(&allowed_ports, &key);
354 /* 0 is a terminator value. Check it first to avoid matching on
355 * a forbidden port == 0 and returning true.
/* Parsed packet layout shared between the dissect/lookup/synack stages.
 * Fields used below: eth, ipv4, ipv6 (exactly one of ipv4/ipv6 non-NULL),
 * tcp and tcp_len; only ipv6 is visible in this view.
 */
367 struct header_pointers {
370 struct ipv6hdr *ipv6;
/* Parse Ethernet/IPv4-or-IPv6/TCP headers out of [data, data_end) into
 * hdr, with a verifier-visible bounds check before each access. Non-TCP
 * and malformed packets take the truncated early-exit paths (presumably
 * XDP_PASS/XDP_DROP — the return lines are not visible in this view).
 */
375 static __always_inline int tcp_dissect(void *data, void *data_end,
376 struct header_pointers *hdr)
379 if (hdr->eth + 1 > data_end)
382 switch (bpf_ntohs(hdr->eth->h_proto)) {
386 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
387 if (hdr->ipv4 + 1 > data_end)
389 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
391 if (hdr->ipv4->version != 4)
394 if (hdr->ipv4->protocol != IPPROTO_TCP)
/* TCP starts after the (possibly option-extended) IPv4 header. */
397 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
402 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
403 if (hdr->ipv6 + 1 > data_end)
405 if (hdr->ipv6->version != 6)
408 /* XXX: Extension headers are not supported and could circumvent
409 * XDP SYN flood protection.
411 if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
414 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
417 /* XXX: VLANs will circumvent XDP SYN flood protection. */
421 if (hdr->tcp + 1 > data_end)
423 hdr->tcp_len = hdr->tcp->doff * 4;
424 if (hdr->tcp_len < sizeof(*hdr->tcp))
/* Look up the packet's 4-tuple in conntrack (XDP or TC kfunc flavor
 * selected by the xdp flag). A CONFIRMED connection means the packet
 * belongs to an established flow and should bypass SYN-cookie handling;
 * -ENOENT means no connection yet. The exact return values sit on lines
 * not visible in this view.
 */
430 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
432 struct bpf_ct_opts___local ct_lookup_opts = {
433 .netns_id = BPF_F_CURRENT_NETNS,
434 .l4proto = IPPROTO_TCP,
436 struct bpf_sock_tuple tup = {};
441 /* TCP doesn't normally use fragments, and XDP can't reassemble
444 if ((hdr->ipv4->frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
447 tup.ipv4.saddr = hdr->ipv4->saddr;
448 tup.ipv4.daddr = hdr->ipv4->daddr;
449 tup.ipv4.sport = hdr->tcp->source;
450 tup.ipv4.dport = hdr->tcp->dest;
451 tup_size = sizeof(tup.ipv4);
452 } else if (hdr->ipv6) {
453 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
454 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
455 tup.ipv6.sport = hdr->tcp->source;
456 tup.ipv6.dport = hdr->tcp->dest;
457 tup_size = sizeof(tup.ipv6);
459 /* The verifier can't track that either ipv4 or ipv6 is not
465 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
467 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
/* Snapshot status before the (truncated) bpf_ct_release() path. */
469 unsigned long status = ct->status;
472 if (status & IPS_CONFIRMED)
474 } else if (ct_lookup_opts.error != -ENOENT) {
478 /* error == -ENOENT || !(status & IPS_CONFIRMED) */
/* Write the SYNACK's TCP options (MSS always; timestamp with or without a
 * leading SACK_PERM depending on tsopt[0] bit 4; window scale unless the
 * low nibble of tsopt[0] is 0xf, i.e. no wscale was negotiated) and return
 * the option length in 32-bit words (added to doff by the caller).
 * tsopt[0] encodes the cookie flags produced by tscookie_init().
 */
482 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
487 *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
492 if (tsopt[0] & bpf_htonl(1 << 4))
493 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
494 (TCPOLEN_SACK_PERM << 16) |
495 (TCPOPT_TIMESTAMP << 8) |
498 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
500 (TCPOPT_TIMESTAMP << 8) |
505 if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
506 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
507 (TCPOPT_WINDOW << 16) |
508 (TCPOLEN_WINDOW << 8) |
/* Rewrite the received SYN's TCP header in place into our SYNACK:
 * set SYN|ACK (plus ECE when the cookie's ECN bit — tsopt[0] bit 5 — is
 * set), swap ports, acknowledge the client's seq+1, and use the SYN cookie
 * as our initial sequence number. Options are appended after the fixed
 * header and doff grows by their word count.
 */
514 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
515 __u32 cookie, __be32 *tsopt,
516 __u16 mss, __u8 wscale)
520 tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
521 if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
522 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
523 tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
524 swap(tcp_header->source, tcp_header->dest);
525 tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
526 tcp_header->seq = bpf_htonl(cookie);
527 tcp_header->window = 0;
528 tcp_header->urg_ptr = 0;
529 tcp_header->check = 0; /* Calculate checksum later. */
531 tcp_options = (void *)(tcp_header + 1);
532 tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
/* Turn the received IPv4 SYN into a SYNACK addressed back to the sender:
 * swap MACs and IP addresses, reset TTL to the configured value, build the
 * TCP part, then fix up tot_len for the (now option-free) 20-byte IPv4
 * header plus the new TCP length.
 */
535 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
536 __u32 cookie, __be32 *tsopt)
542 values_get_tcpipopts(&mss, &wscale, &ttl, false);
544 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
546 swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
547 hdr->ipv4->check = 0; /* Calculate checksum later. */
550 hdr->ipv4->ttl = ttl;
552 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
554 hdr->tcp_len = hdr->tcp->doff * 4;
555 hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
/* IPv6 counterpart of tcpv4_gen_synack(): swap MACs and addresses, reset
 * the version/class/flow word to a plain "version 6" (0x60000000), set the
 * hop limit, build the TCP part and update payload_len (TCP only — IPv6
 * payload_len excludes the fixed IPv6 header).
 */
558 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
559 __u32 cookie, __be32 *tsopt)
565 values_get_tcpipopts(&mss, &wscale, &ttl, true);
567 swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
569 swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
570 *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
571 hdr->ipv6->hop_limit = ttl;
573 tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
575 hdr->tcp_len = hdr->tcp->doff * 4;
576 hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
/* Handle a client SYN end to end: validate flags/port/checksums, generate
 * the SYN cookie via bpf_tcp_raw_gen_syncookie_ipv{4,6}, optionally build
 * the timestamp cookie, rewrite the packet into a SYNACK, recompute
 * checksums, trim the packet to its new size and count the SYNACK.
 * Returns XDP_DROP on any validation failure; the success return (and
 * several intermediate lines) are not visible in this truncated view —
 * presumably XDP_TX so the SYNACK is reflected out the ingress interface.
 */
579 static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
581 void *data, void *data_end,
584 __u32 old_pkt_size, new_pkt_size;
585 /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
586 * BPF verifier if tsopt is not volatile. Volatile forces it to store
587 * the pointer value and use it directly, otherwise tcp_mkoptions is
588 * (mis)compiled like this:
590 * return buf - start;
591 * reg = stored_return_value_of_tscookie_init;
598 * It creates a dead branch where tsopt is assigned NULL, but the
599 * verifier can't prove it's dead and blocks the program.
601 __be32 * volatile tsopt = NULL;
602 __be32 tsopt_buf[2] = {};
607 /* Checksum is not yet verified, but both checksum failure and TCP
608 * header checks return XDP_DROP, so the order doesn't matter.
610 if (hdr->tcp->fin || hdr->tcp->rst)
613 /* Issue SYN cookies on allowed ports, drop SYN packets on blocked
616 if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
620 /* Check the IPv4 and TCP checksums before creating a SYNACK. */
621 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
624 if (csum_fold(value) != 0)
625 return XDP_DROP; /* Bad IPv4 checksum. */
627 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
630 if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
631 hdr->tcp_len, IPPROTO_TCP, value) != 0)
632 return XDP_DROP; /* Bad TCP checksum. */
634 ip_len = sizeof(*hdr->ipv4);
636 value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
638 } else if (hdr->ipv6) {
639 /* Check the TCP checksum before creating a SYNACK. */
640 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
643 if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
644 hdr->tcp_len, IPPROTO_TCP, value) != 0)
645 return XDP_DROP; /* Bad TCP checksum. */
647 ip_len = sizeof(*hdr->ipv6);
649 value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
657 cookie = (__u32)value;
/* tsopt_buf[0]=our tsval (cookie flags), tsopt_buf[1]=client tsval. */
659 if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
660 &tsopt_buf[0], &tsopt_buf[1], data, data_end))
663 /* Check that there is enough space for a SYNACK. It also covers
664 * the check that the destination of the __builtin_memmove below
667 if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
/* Strip IPv4 options: slide the TCP header up against a 20-byte iphdr. */
671 if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
672 struct tcphdr *new_tcp_header;
674 new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
675 __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
676 hdr->tcp = new_tcp_header;
678 hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
681 tcpv4_gen_synack(hdr, cookie, tsopt);
682 } else if (hdr->ipv6) {
683 tcpv6_gen_synack(hdr, cookie, tsopt);
688 /* Recalculate checksums. */
690 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
694 hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
700 hdr->ipv4->check = 0;
701 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
704 hdr->ipv4->check = csum_fold(value);
705 } else if (hdr->ipv6) {
706 hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
715 /* Set the new packet size. */
716 old_pkt_size = data_end - data;
717 new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
719 if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
722 if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
726 values_inc_synacks();
/* Handle the client's cookie-carrying ACK: let the kernel validate the
 * SYN cookie via bpf_tcp_raw_check_syncookie_ipv{4,6}. The err-based
 * pass/drop decision sits on lines not visible in this view.
 */
731 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
739 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
741 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
/* Stage 1 (shared by XDP and TC entry points): dissect the packet, bail
 * out for flows already known to conntrack, require exactly one of SYN/ACK
 * set, then grow the packet tail so the TCP header region is a constant
 * TCP_MAXLEN — this lets stage 2 present verifier-provable bounds to
 * bpf_tcp_raw_gen_syncookie_ipv{4,6}. Tail growth invalidates data/data_end,
 * which the callers re-read before syncookie_part2().
 */
750 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
751 struct header_pointers *hdr, bool xdp)
755 ret = tcp_dissect(data, data_end, hdr);
759 ret = tcp_lookup(ctx, hdr, xdp);
763 /* Packet is TCP and doesn't belong to an established connection. */
765 if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
768 /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
769 * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
772 if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
775 /* Without volatile the verifier throws this error:
776 * R9 32-bit pointer arithmetic prohibited
778 volatile u64 old_len = data_end - data;
780 if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
/* Stage 2: re-derive the header pointers against the post-resize packet
 * (the old pointers were invalidated by the tail adjustment in stage 1),
 * re-establish the bounds the verifier needs, then dispatch SYNs to
 * syncookie_handle_syn() and ACKs to syncookie_handle_ack().
 */
787 static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
788 struct header_pointers *hdr, bool xdp)
792 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
793 /* IPV4_MAXLEN is needed when calculating checksum.
794 * At least sizeof(struct iphdr) is needed here to access ihl.
796 if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
798 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
799 } else if (hdr->ipv6) {
801 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
802 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
807 if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
810 /* We run out of registers, tcp_len gets spilled to the stack, and the
811 * verifier forgets its min and max values checked above in tcp_dissect.
813 hdr->tcp_len = hdr->tcp->doff * 4;
814 if (hdr->tcp_len < sizeof(*hdr->tcp))
817 return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
818 syncookie_handle_ack(hdr);
/* XDP entry point: run stage 1, then re-read data/data_end — mandatory
 * after bpf_xdp_adjust_tail() invalidated the old packet pointers — and
 * run stage 2. (The SEC annotation and the early-return between the
 * stages are on lines not visible in this view.)
 */
822 int syncookie_xdp(struct xdp_md *ctx)
824 void *data_end = (void *)(long)ctx->data_end;
825 void *data = (void *)(long)ctx->data;
826 struct header_pointers hdr;
829 ret = syncookie_part1(ctx, data, data_end, &hdr, true);
833 data_end = (void *)(long)ctx->data_end;
834 data = (void *)(long)ctx->data;
836 return syncookie_part2(ctx, data, data_end, &hdr, true);
/* TC entry point: same two-stage flow as syncookie_xdp but translates the
 * internal XDP_* results to TC verdicts (XDP_PASS -> TC_ACT_OK, otherwise
 * TC_ACT_SHOT) and, where XDP would use XDP_TX, reflects the built SYNACK
 * with bpf_redirect() back out the ingress ifindex. Pointers are re-read
 * after stage 1 because bpf_skb_change_tail() invalidated them.
 */
840 int syncookie_tc(struct __sk_buff *skb)
842 void *data_end = (void *)(long)skb->data_end;
843 void *data = (void *)(long)skb->data;
844 struct header_pointers hdr;
847 ret = syncookie_part1(skb, data, data_end, &hdr, false);
849 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
851 data_end = (void *)(long)skb->data_end;
852 data = (void *)(long)skb->data;
854 ret = syncookie_part2(skb, data, data_end, &hdr, false);
859 return bpf_redirect(skb->ifindex, 0);
865 char _license[] SEC("license") = "GPL";