]> Git Repo - J-linux.git/blob - tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
Merge tag 'vfs-6.13-rc7.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
[J-linux.git] / tools / testing / selftests / bpf / progs / xdp_synproxy_kern.c
1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3
4 #define BPF_NO_KFUNC_PROTOTYPES
5 #include "vmlinux.h"
6
7 #include <bpf/bpf_helpers.h>
8 #include <bpf/bpf_endian.h>
9 #include <asm/errno.h>
10
11 #include "bpf_compiler.h"
12
13 #define TC_ACT_OK 0
14 #define TC_ACT_SHOT 2
15
16 #define NSEC_PER_SEC 1000000000L
17
18 #define ETH_ALEN 6
19 #define ETH_P_IP 0x0800
20 #define ETH_P_IPV6 0x86DD
21
22 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
23
24 #define IP_MF 0x2000
25 #define IP_OFFSET 0x1fff
26
27 #define NEXTHDR_TCP 6
28
29 #define TCPOPT_NOP 1
30 #define TCPOPT_EOL 0
31 #define TCPOPT_MSS 2
32 #define TCPOPT_WINDOW 3
33 #define TCPOPT_SACK_PERM 4
34 #define TCPOPT_TIMESTAMP 8
35
36 #define TCPOLEN_MSS 4
37 #define TCPOLEN_WINDOW 3
38 #define TCPOLEN_SACK_PERM 2
39 #define TCPOLEN_TIMESTAMP 10
40
41 #define TCP_TS_HZ 1000
42 #define TS_OPT_WSCALE_MASK 0xf
43 #define TS_OPT_SACK (1 << 4)
44 #define TS_OPT_ECN (1 << 5)
45 #define TSBITS 6
46 #define TSMASK (((__u32)1 << TSBITS) - 1)
47 #define TCP_MAX_WSCALE 14U
48
49 #define IPV4_MAXLEN 60
50 #define TCP_MAXLEN 60
51
52 #define DEFAULT_MSS4 1460
53 #define DEFAULT_MSS6 1440
54 #define DEFAULT_WSCALE 7
55 #define DEFAULT_TTL 64
56 #define MAX_ALLOWED_PORTS 8
57
58 #define MAX_PACKET_OFF 0xffff
59
/* Exchange the values of two lvalues of the same type.
 * Note that each argument is evaluated twice.
 */
#define swap(a, b) \
        do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

/* Load a value of the given type from ptr through a packed-struct pointer,
 * so the access does not rely on ptr being naturally aligned for that type.
 */
#define __get_unaligned_t(type, ptr) ({                                         \
        const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
        __pptr->x;                                                              \
})

/* Unaligned load of *ptr; the value type is deduced from the pointer type. */
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
69
/* Two-slot array of 64-bit values:
 *   key 0 - packed TCP/IP options read by values_get_tcpipopts()
 *           (zero means "use the built-in defaults");
 *   key 1 - counter incremented by values_inc_synacks().
 */
struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __type(key, __u32);
        __type(value, __u64);
        __uint(max_entries, 2);
} values SEC(".maps");
76
/* List of TCP destination ports scanned by check_port_allowed().
 * Ports are compared in host byte order; a zero entry terminates the list.
 */
struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __type(key, __u32);
        __type(value, __u16);
        __uint(max_entries, MAX_ALLOWED_PORTS);
} allowed_ports SEC(".maps");
83
84 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
85  * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
86  */
87
/* Local copy of the options struct passed to bpf_{xdp,skb}_ct_lookup(). */
struct bpf_ct_opts___local {
        s32 netns_id;   /* tcp_lookup() sets this to BPF_F_CURRENT_NETNS */
        s32 error;      /* tcp_lookup() reads this when the lookup returns NULL */
        u8 l4proto;     /* tcp_lookup() sets this to IPPROTO_TCP */
        u8 dir;         /* not set by this file (left zero-initialized) */
        u8 reserved[2]; /* not set by this file (left zero-initialized) */
} __attribute__((preserve_access_index));
95
96 #define BPF_F_CURRENT_NETNS (-1)
97
98 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
99                                          struct bpf_sock_tuple *bpf_tuple,
100                                          __u32 len_tuple,
101                                          struct bpf_ct_opts___local *opts,
102                                          __u32 len_opts) __ksym;
103
104 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
105                                          struct bpf_sock_tuple *bpf_tuple,
106                                          u32 len_tuple,
107                                          struct bpf_ct_opts___local *opts,
108                                          u32 len_opts) __ksym;
109
110 extern void bpf_ct_release(struct nf_conn *ct) __ksym;
111
112 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
113 {
114         __u8 tmp[ETH_ALEN];
115
116         __builtin_memcpy(tmp, a, ETH_ALEN);
117         __builtin_memcpy(a, b, ETH_ALEN);
118         __builtin_memcpy(b, tmp, ETH_ALEN);
119 }
120
121 static __always_inline __u16 csum_fold(__u32 csum)
122 {
123         csum = (csum & 0xffff) + (csum >> 16);
124         csum = (csum & 0xffff) + (csum >> 16);
125         return (__u16)~csum;
126 }
127
128 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
129                                                __u32 len, __u8 proto,
130                                                __u32 csum)
131 {
132         __u64 s = csum;
133
134         s += (__u32)saddr;
135         s += (__u32)daddr;
136 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
137         s += proto + len;
138 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
139         s += (proto + len) << 8;
140 #else
141 #error Unknown endian
142 #endif
143         s = (s & 0xffffffff) + (s >> 32);
144         s = (s & 0xffffffff) + (s >> 32);
145
146         return csum_fold((__u32)s);
147 }
148
149 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
150                                              const struct in6_addr *daddr,
151                                              __u32 len, __u8 proto, __u32 csum)
152 {
153         __u64 sum = csum;
154         int i;
155
156         __pragma_loop_unroll
157         for (i = 0; i < 4; i++)
158                 sum += (__u32)saddr->in6_u.u6_addr32[i];
159
160         __pragma_loop_unroll
161         for (i = 0; i < 4; i++)
162                 sum += (__u32)daddr->in6_u.u6_addr32[i];
163
164         /* Don't combine additions to avoid 32-bit overflow. */
165         sum += bpf_htonl(len);
166         sum += bpf_htonl(proto);
167
168         sum = (sum & 0xffffffff) + (sum >> 32);
169         sum = (sum & 0xffffffff) + (sum >> 32);
170
171         return csum_fold((__u32)sum);
172 }
173
174 static __always_inline __u64 tcp_clock_ns(void)
175 {
176         return bpf_ktime_get_ns();
177 }
178
179 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
180 {
181         return ns / (NSEC_PER_SEC / TCP_TS_HZ);
182 }
183
184 static __always_inline __u32 tcp_clock_ms(void)
185 {
186         return tcp_ns_to_ts(tcp_clock_ns());
187 }
188
/* State shared between tscookie_init() and the bpf_loop() callback that
 * walks the TCP options.
 */
struct tcpopt_context {
        void *data;             /* start of packet data */
        void *data_end;         /* end of packet data */
        __be32 *tsecr;          /* where to store the client's tsval */
        __u8 wscale;            /* parsed window scale, capped at TCP_MAX_WSCALE */
        bool option_timestamp;  /* set when a timestamp option was parsed */
        bool option_sack;       /* set when a SACK-permitted option was parsed */
        __u32 off;              /* current parse offset, relative to data */
};
198
199 static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz)
200 {
201         __u64 off = ctx->off;
202         __u8 *data;
203
204         /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */
205         if (off > MAX_PACKET_OFF - sz)
206                 return NULL;
207
208         data = ctx->data + off;
209         barrier_var(data);
210         if (data + sz >= ctx->data_end)
211                 return NULL;
212
213         ctx->off += sz;
214         return data;
215 }
216
/* Parse one TCP option located at ctx->off and advance ctx->off past it.
 *
 * Window scale, timestamp and SACK-permitted options are recorded in ctx;
 * other option kinds are skipped using their length byte.
 *
 * Returns 0 to continue with the next option, or 1 to stop parsing (end of
 * option list, option length below 2, or the packet bounds were reached).
 */
static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
{
        __u8 *opcode, *opsize, *wscale, *tsecr;
        /* Offset of the option's first byte; used below to skip by *opsize. */
        __u32 off = ctx->off;

        opcode = next(ctx, 1);
        if (!opcode)
                return 1;

        if (*opcode == TCPOPT_EOL)
                return 1;
        /* NOP is a single byte with no length field; next() already
         * advanced ctx->off past it.
         */
        if (*opcode == TCPOPT_NOP)
                return 0;

        /* The length covers the kind and length bytes, so it is at least 2. */
        opsize = next(ctx, 1);
        if (!opsize || *opsize < 2)
                return 1;

        switch (*opcode) {
        case TCPOPT_WINDOW:
                wscale = next(ctx, 1);
                if (!wscale)
                        return 1;
                if (*opsize == TCPOLEN_WINDOW)
                        ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE;
                break;
        case TCPOPT_TIMESTAMP:
                /* Only the first 4 bytes of the option value are read. */
                tsecr = next(ctx, 4);
                if (!tsecr)
                        return 1;
                if (*opsize == TCPOLEN_TIMESTAMP) {
                        ctx->option_timestamp = true;
                        /* Client's tsval becomes our tsecr. */
                        *ctx->tsecr = get_unaligned((__be32 *)tsecr);
                }
                break;
        case TCPOPT_SACK_PERM:
                if (*opsize == TCPOLEN_SACK_PERM)
                        ctx->option_sack = true;
                break;
        }

        /* Skip the whole option, regardless of how much of it was consumed. */
        ctx->off = off + *opsize;

        return 0;
}
263
264 static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
265 {
266         int i;
267
268         for (i = 0; i < 7; i++)
269                 if (tscookie_tcpopt_parse(context))
270                         return 1;
271         return 0;
272 }
273
/* Build the timestamp cookie for a SYNACK from the options of a SYN.
 *
 * Walks the TCP options that follow tcp_header. If a timestamp option is
 * present, stores the client's tsval in *tsecr and writes to *tsval a value
 * whose bits above TSBITS come from the current clock and whose low TSBITS
 * bits encode the window scale, SACK and ECN flags.
 *
 * Returns true when a timestamp option was found (and *tsval / *tsecr were
 * filled), false otherwise. The tcp_len argument is not used by the body.
 */
static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
                                          __u16 tcp_len, __be32 *tsval,
                                          __be32 *tsecr, void *data, void *data_end)
{
        struct tcpopt_context loop_ctx = {
                .data = data,
                .data_end = data_end,
                .tsecr = tsecr,
                /* All-ones wscale means "no window scale option seen". */
                .wscale = TS_OPT_WSCALE_MASK,
                .option_timestamp = false,
                .option_sack = false,
                /* Note: currently verifier would track .off as unbound scalar.
                 *       In case if verifier would at some point get smarter and
                 *       compute bounded value for this var, beware that it might
                 *       hinder bpf_loop() convergence validation.
                 */
                .off = (__u8 *)(tcp_header + 1) - (__u8 *)data,
        };
        u32 cookie;

        /* Up to 6 batches of 7 options each. */
        bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);

        if (!loop_ctx.option_timestamp)
                return false;

        /* Clear the low TSBITS bits of the clock and reuse them for flags. */
        cookie = tcp_clock_ms() & ~TSMASK;
        cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
        if (loop_ctx.option_sack)
                cookie |= TS_OPT_SACK;
        if (tcp_header->ece && tcp_header->cwr)
                cookie |= TS_OPT_ECN;
        *tsval = bpf_htonl(cookie);

        return true;
}
309
310 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
311                                                  __u8 *ttl, bool ipv6)
312 {
313         __u32 key = 0;
314         __u64 *value;
315
316         value = bpf_map_lookup_elem(&values, &key);
317         if (value && *value != 0) {
318                 if (ipv6)
319                         *mss = (*value >> 32) & 0xffff;
320                 else
321                         *mss = *value & 0xffff;
322                 *wscale = (*value >> 16) & 0xf;
323                 *ttl = (*value >> 24) & 0xff;
324                 return;
325         }
326
327         *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
328         *wscale = DEFAULT_WSCALE;
329         *ttl = DEFAULT_TTL;
330 }
331
332 static __always_inline void values_inc_synacks(void)
333 {
334         __u32 key = 1;
335         __u64 *value;
336
337         value = bpf_map_lookup_elem(&values, &key);
338         if (value)
339                 __sync_fetch_and_add(value, 1);
340 }
341
342 static __always_inline bool check_port_allowed(__u16 port)
343 {
344         __u32 i;
345
346         for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
347                 __u32 key = i;
348                 __u16 *value;
349
350                 value = bpf_map_lookup_elem(&allowed_ports, &key);
351
352                 if (!value)
353                         break;
354                 /* 0 is a terminator value. Check it first to avoid matching on
355                  * a forbidden port == 0 and returning true.
356                  */
357                 if (*value == 0)
358                         break;
359
360                 if (*value == port)
361                         return true;
362         }
363
364         return false;
365 }
366
/* Pointers into the packet produced by tcp_dissect() and refreshed by
 * syncookie_part2(). Exactly one of ipv4 / ipv6 is expected to be non-NULL.
 */
struct header_pointers {
        struct ethhdr *eth;
        struct iphdr *ipv4;     /* NULL for IPv6 packets */
        struct ipv6hdr *ipv6;   /* NULL for IPv4 packets */
        struct tcphdr *tcp;
        __u16 tcp_len;          /* TCP header length in bytes (doff * 4) */
};
374
/* Locate the Ethernet, IP and TCP headers in the packet and fill hdr.
 *
 * Returns:
 *   XDP_DROP - a header is truncated or has an invalid version/length;
 *   XDP_PASS - the packet is not IPv4/IPv6, or does not carry TCP directly;
 *   XDP_TX   - all headers were found (syncookie_part1() treats this as
 *              "continue processing").
 */
static __always_inline int tcp_dissect(void *data, void *data_end,
                                       struct header_pointers *hdr)
{
        hdr->eth = data;
        if (hdr->eth + 1 > data_end)
                return XDP_DROP;

        switch (bpf_ntohs(hdr->eth->h_proto)) {
        case ETH_P_IP:
                hdr->ipv6 = NULL;

                hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
                if (hdr->ipv4 + 1 > data_end)
                        return XDP_DROP;
                /* ihl counts 32-bit words and must cover the fixed header. */
                if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
                        return XDP_DROP;
                if (hdr->ipv4->version != 4)
                        return XDP_DROP;

                if (hdr->ipv4->protocol != IPPROTO_TCP)
                        return XDP_PASS;

                /* TCP starts after the IPv4 header including its options. */
                hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
                break;
        case ETH_P_IPV6:
                hdr->ipv4 = NULL;

                hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
                if (hdr->ipv6 + 1 > data_end)
                        return XDP_DROP;
                if (hdr->ipv6->version != 6)
                        return XDP_DROP;

                /* XXX: Extension headers are not supported and could circumvent
                 * XDP SYN flood protection.
                 */
                if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
                        return XDP_PASS;

                hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
                break;
        default:
                /* XXX: VLANs will circumvent XDP SYN flood protection. */
                return XDP_PASS;
        }

        if (hdr->tcp + 1 > data_end)
                return XDP_DROP;
        /* doff counts 32-bit words and must cover the fixed TCP header. */
        hdr->tcp_len = hdr->tcp->doff * 4;
        if (hdr->tcp_len < sizeof(*hdr->tcp))
                return XDP_DROP;

        return XDP_TX;
}
429
/* Look the packet's 4-tuple up in conntrack.
 *
 * ctx is a struct xdp_md * when xdp is true and a struct __sk_buff *
 * otherwise; it selects which lookup kfunc is called.
 *
 * Returns:
 *   XDP_PASS    - a conntrack entry with IPS_CONFIRMED set exists;
 *   XDP_DROP    - the IPv4 packet is a fragment;
 *   XDP_ABORTED - neither IP header is set, or the lookup failed with an
 *                 error other than -ENOENT;
 *   XDP_TX      - no entry, or an entry without IPS_CONFIRMED
 *                 (syncookie_part1() treats this as "continue processing").
 */
static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
{
        struct bpf_ct_opts___local ct_lookup_opts = {
                .netns_id = BPF_F_CURRENT_NETNS,
                .l4proto = IPPROTO_TCP,
        };
        struct bpf_sock_tuple tup = {};
        struct nf_conn *ct;
        __u32 tup_size;

        if (hdr->ipv4) {
                /* TCP doesn't normally use fragments, and XDP can't reassemble
                 * them.
                 */
                if ((hdr->ipv4->frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0)
                        return XDP_DROP;

                tup.ipv4.saddr = hdr->ipv4->saddr;
                tup.ipv4.daddr = hdr->ipv4->daddr;
                tup.ipv4.sport = hdr->tcp->source;
                tup.ipv4.dport = hdr->tcp->dest;
                tup_size = sizeof(tup.ipv4);
        } else if (hdr->ipv6) {
                __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
                __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
                tup.ipv6.sport = hdr->tcp->source;
                tup.ipv6.dport = hdr->tcp->dest;
                tup_size = sizeof(tup.ipv6);
        } else {
                /* The verifier can't track that either ipv4 or ipv6 is not
                 * NULL.
                 */
                return XDP_ABORTED;
        }
        if (xdp)
                ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
        else
                ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
        if (ct) {
                /* Copy the status out before dropping the reference. */
                unsigned long status = ct->status;

                bpf_ct_release(ct);
                if (status & IPS_CONFIRMED)
                        return XDP_PASS;
        } else if (ct_lookup_opts.error != -ENOENT) {
                return XDP_ABORTED;
        }

        /* error == -ENOENT || !(status & IPS_CONFIRMED) */
        return XDP_TX;
}
481
482 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
483                                           __u8 wscale)
484 {
485         __be32 *start = buf;
486
487         *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
488
489         if (!tsopt)
490                 return buf - start;
491
492         if (tsopt[0] & bpf_htonl(1 << 4))
493                 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
494                                    (TCPOLEN_SACK_PERM << 16) |
495                                    (TCPOPT_TIMESTAMP << 8) |
496                                    TCPOLEN_TIMESTAMP);
497         else
498                 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
499                                    (TCPOPT_NOP << 16) |
500                                    (TCPOPT_TIMESTAMP << 8) |
501                                    TCPOLEN_TIMESTAMP);
502         *buf++ = tsopt[0];
503         *buf++ = tsopt[1];
504
505         if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
506                 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
507                                    (TCPOPT_WINDOW << 16) |
508                                    (TCPOLEN_WINDOW << 8) |
509                                    wscale);
510
511         return buf - start;
512 }
513
514 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
515                                            __u32 cookie, __be32 *tsopt,
516                                            __u16 mss, __u8 wscale)
517 {
518         void *tcp_options;
519
520         tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
521         if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
522                 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
523         tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
524         swap(tcp_header->source, tcp_header->dest);
525         tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
526         tcp_header->seq = bpf_htonl(cookie);
527         tcp_header->window = 0;
528         tcp_header->urg_ptr = 0;
529         tcp_header->check = 0; /* Calculate checksum later. */
530
531         tcp_options = (void *)(tcp_header + 1);
532         tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
533 }
534
535 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
536                                              __u32 cookie, __be32 *tsopt)
537 {
538         __u8 wscale;
539         __u16 mss;
540         __u8 ttl;
541
542         values_get_tcpipopts(&mss, &wscale, &ttl, false);
543
544         swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
545
546         swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
547         hdr->ipv4->check = 0; /* Calculate checksum later. */
548         hdr->ipv4->tos = 0;
549         hdr->ipv4->id = 0;
550         hdr->ipv4->ttl = ttl;
551
552         tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
553
554         hdr->tcp_len = hdr->tcp->doff * 4;
555         hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
556 }
557
558 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
559                                              __u32 cookie, __be32 *tsopt)
560 {
561         __u8 wscale;
562         __u16 mss;
563         __u8 ttl;
564
565         values_get_tcpipopts(&mss, &wscale, &ttl, true);
566
567         swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
568
569         swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
570         *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
571         hdr->ipv6->hop_limit = ttl;
572
573         tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
574
575         hdr->tcp_len = hdr->tcp->doff * 4;
576         hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
577 }
578
/* Answer a SYN with a SYN cookie: validate the packet, generate the cookie,
 * rewrite the packet in place into a SYNACK and trim it to its new size.
 *
 * ctx is a struct xdp_md * when xdp is true and a struct __sk_buff *
 * otherwise. data / data_end must describe the packet after it was grown
 * by syncookie_part1().
 *
 * Returns:
 *   XDP_DROP    - FIN or RST set, destination port not allowed, or a bad
 *                 IPv4/TCP checksum;
 *   XDP_ABORTED - a helper failed or the packet is too short;
 *   XDP_TX      - the packet now holds the SYNACK to be sent back.
 */
static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
                                                void *ctx,
                                                void *data, void *data_end,
                                                bool xdp)
{
        __u32 old_pkt_size, new_pkt_size;
        /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
         * BPF verifier if tsopt is not volatile. Volatile forces it to store
         * the pointer value and use it directly, otherwise tcp_mkoptions is
         * (mis)compiled like this:
         *   if (!tsopt)
         *       return buf - start;
         *   reg = stored_return_value_of_tscookie_init;
         *   if (reg)
         *       tsopt = tsopt_buf;
         *   else
         *       tsopt = NULL;
         *   ...
         *   *buf++ = tsopt[1];
         * It creates a dead branch where tsopt is assigned NULL, but the
         * verifier can't prove it's dead and blocks the program.
         */
        __be32 * volatile tsopt = NULL;
        /* tsopt_buf[0] receives tsval, tsopt_buf[1] receives tsecr. */
        __be32 tsopt_buf[2] = {};
        __u16 ip_len;
        __u32 cookie;
        __s64 value;

        /* Checksum is not yet verified, but both checksum failure and TCP
         * header checks return XDP_DROP, so the order doesn't matter.
         */
        if (hdr->tcp->fin || hdr->tcp->rst)
                return XDP_DROP;

        /* Issue SYN cookies on allowed ports, drop SYN packets on blocked
         * ports.
         */
        if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
                return XDP_DROP;

        if (hdr->ipv4) {
                /* Check the IPv4 and TCP checksums before creating a SYNACK. */
                value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
                if (value < 0)
                        return XDP_ABORTED;
                if (csum_fold(value) != 0)
                        return XDP_DROP; /* Bad IPv4 checksum. */

                value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
                if (value < 0)
                        return XDP_ABORTED;
                if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
                                      hdr->tcp_len, IPPROTO_TCP, value) != 0)
                        return XDP_DROP; /* Bad TCP checksum. */

                /* The SYNACK is built without IPv4 options. */
                ip_len = sizeof(*hdr->ipv4);

                value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
                                                       hdr->tcp_len);
        } else if (hdr->ipv6) {
                /* Check the TCP checksum before creating a SYNACK. */
                value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
                if (value < 0)
                        return XDP_ABORTED;
                if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
                                    hdr->tcp_len, IPPROTO_TCP, value) != 0)
                        return XDP_DROP; /* Bad TCP checksum. */

                ip_len = sizeof(*hdr->ipv6);

                value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
                                                       hdr->tcp_len);
        } else {
                return XDP_ABORTED;
        }

        /* A negative value from the syncookie helper is an error. */
        if (value < 0)
                return XDP_ABORTED;
        cookie = (__u32)value;

        /* tsopt stays NULL unless the SYN carried a timestamp option. */
        if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
                          &tsopt_buf[0], &tsopt_buf[1], data, data_end))
                tsopt = tsopt_buf;

        /* Check that there is enough space for a SYNACK. It also covers
         * the check that the destination of the __builtin_memmove below
         * doesn't overflow.
         */
        if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
                return XDP_ABORTED;

        if (hdr->ipv4) {
                if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
                        struct tcphdr *new_tcp_header;

                        /* Drop the IPv4 options: move the fixed TCP header
                         * to directly follow the fixed IPv4 header.
                         */
                        new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
                        __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
                        hdr->tcp = new_tcp_header;

                        hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
                }

                tcpv4_gen_synack(hdr, cookie, tsopt);
        } else if (hdr->ipv6) {
                tcpv6_gen_synack(hdr, cookie, tsopt);
        } else {
                return XDP_ABORTED;
        }

        /* Recalculate checksums. */
        hdr->tcp->check = 0;
        value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
        if (value < 0)
                return XDP_ABORTED;
        if (hdr->ipv4) {
                hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
                                                    hdr->ipv4->daddr,
                                                    hdr->tcp_len,
                                                    IPPROTO_TCP,
                                                    value);

                hdr->ipv4->check = 0;
                value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
                if (value < 0)
                        return XDP_ABORTED;
                hdr->ipv4->check = csum_fold(value);
        } else if (hdr->ipv6) {
                hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
                                                  &hdr->ipv6->daddr,
                                                  hdr->tcp_len,
                                                  IPPROTO_TCP,
                                                  value);
        } else {
                return XDP_ABORTED;
        }

        /* Set the new packet size. */
        old_pkt_size = data_end - data;
        new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
        if (xdp) {
                /* bpf_xdp_adjust_tail() takes a delta relative to the
                 * current size.
                 */
                if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
                        return XDP_ABORTED;
        } else {
                /* bpf_skb_change_tail() takes the new absolute length. */
                if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
                        return XDP_ABORTED;
        }

        values_inc_synacks();

        return XDP_TX;
}
730
731 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
732 {
733         int err;
734
735         if (hdr->tcp->rst)
736                 return XDP_DROP;
737
738         if (hdr->ipv4)
739                 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
740         else if (hdr->ipv6)
741                 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
742         else
743                 return XDP_ABORTED;
744         if (err)
745                 return XDP_DROP;
746
747         return XDP_PASS;
748 }
749
/* First stage: dissect the packet, consult conntrack and grow the packet so
 * that TCP_MAXLEN bytes are available for the TCP header.
 *
 * ctx is a struct xdp_md * when xdp is true and a struct __sk_buff *
 * otherwise. Because the packet is resized here, the caller must re-read
 * data / data_end from ctx before calling syncookie_part2().
 *
 * Returns XDP_TX when processing should continue with syncookie_part2();
 * any other value is the final verdict for the packet.
 */
static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
                                           struct header_pointers *hdr, bool xdp)
{
        int ret;

        ret = tcp_dissect(data, data_end, hdr);
        if (ret != XDP_TX)
                return ret;

        ret = tcp_lookup(ctx, hdr, xdp);
        if (ret != XDP_TX)
                return ret;

        /* Packet is TCP and doesn't belong to an established connection. */

        /* Exactly one of SYN and ACK must be set. */
        if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
                return XDP_DROP;

        /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
         * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
         */
        if (xdp) {
                if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
                        return XDP_ABORTED;
        } else {
                /* Without volatile the verifier throws this error:
                 * R9 32-bit pointer arithmetic prohibited
                 */
                volatile u64 old_len = data_end - data;

                if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
                        return XDP_ABORTED;
        }

        return XDP_TX;
}
786
/* Second stage: recompute the header pointers from the refreshed data /
 * data_end, re-establish the bounds needed by the verifier, and dispatch to
 * the SYN or ACK handler.
 *
 * hdr->ipv4 / hdr->ipv6 are only used here to tell which IP version
 * tcp_dissect() found; the pointers themselves are rebuilt from data.
 *
 * Returns XDP_ABORTED when the packet is shorter than expected, otherwise
 * the verdict of syncookie_handle_syn() or syncookie_handle_ack().
 */
static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
                                           struct header_pointers *hdr, bool xdp)
{
        if (hdr->ipv4) {
                hdr->eth = data;
                hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
                /* IPV4_MAXLEN is needed when calculating checksum.
                 * At least sizeof(struct iphdr) is needed here to access ihl.
                 */
                if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
                        return XDP_ABORTED;
                hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
        } else if (hdr->ipv6) {
                hdr->eth = data;
                hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
                hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
        } else {
                return XDP_ABORTED;
        }

        /* syncookie_part1() grew the packet so that this holds. */
        if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
                return XDP_ABORTED;

        /* We run out of registers, tcp_len gets spilled to the stack, and the
         * verifier forgets its min and max values checked above in tcp_dissect.
         */
        hdr->tcp_len = hdr->tcp->doff * 4;
        if (hdr->tcp_len < sizeof(*hdr->tcp))
                return XDP_ABORTED;

        return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
                               syncookie_handle_ack(hdr);
}
820
/* XDP entry point: runs both stages and returns the resulting XDP action. */
SEC("xdp")
int syncookie_xdp(struct xdp_md *ctx)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data = (void *)(long)ctx->data;
        struct header_pointers hdr;
        int ret;

        ret = syncookie_part1(ctx, data, data_end, &hdr, true);
        if (ret != XDP_TX)
                return ret;

        /* syncookie_part1() resized the packet, so reload the pointers. */
        data_end = (void *)(long)ctx->data_end;
        data = (void *)(long)ctx->data;

        return syncookie_part2(ctx, data, data_end, &hdr, true);
}
838
839 SEC("tc")
840 int syncookie_tc(struct __sk_buff *skb)
841 {
842         void *data_end = (void *)(long)skb->data_end;
843         void *data = (void *)(long)skb->data;
844         struct header_pointers hdr;
845         int ret;
846
847         ret = syncookie_part1(skb, data, data_end, &hdr, false);
848         if (ret != XDP_TX)
849                 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
850
851         data_end = (void *)(long)skb->data_end;
852         data = (void *)(long)skb->data;
853
854         ret = syncookie_part2(skb, data, data_end, &hdr, false);
855         switch (ret) {
856         case XDP_PASS:
857                 return TC_ACT_OK;
858         case XDP_TX:
859                 return bpf_redirect(skb->ifindex, 0);
860         default:
861                 return TC_ACT_SHOT;
862         }
863 }
864
865 char _license[] SEC("license") = "GPL";
This page took 0.123182 seconds and 4 git commands to generate.