1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 tcp_hdr(skb)->source);
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
122 #if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
144 Actually, the idea is close to VJ's one, only timestamp cache is
145 held not per host, but per port pair, and the TW bucket is used as the state holder.
148 If TW bucket has been already destroyed we fall back to VJ's scheme
149 and use initial timestamp retrieved from peer table.
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
165 if (likely(!tp->repair)) {
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
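/* Inheriting ts_recent/ts_recent_stamp from the TIME-WAIT socket keeps
 * PAWS effective against stray segments from the previous incarnation
 * of this connection.
 */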
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent the BPF program called below from accessing bytes that are out
187 * of the bounds specified by the user in addr_len.
189 if (addr_len < sizeof(struct sockaddr_in))
192 sock_owned_by_me(sk);
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
211 if (addr_len < sizeof(struct sockaddr_in))
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
220 if (inet_opt && inet_opt->opt.srr) {
223 nexthop = inet_opt->opt.faddr;
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
232 orig_sport, orig_dport, sk);
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 if (!inet_opt || !inet_opt->opt.srr)
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 /* Socket identity is still unknown (sport may be zero).
270 * However we set state to SYN-SENT and, without releasing the socket
271 * lock, select a source port, enter ourselves into the hash tables and
272 * complete initialization after this.
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
293 if (likely(!tp->repair)) {
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
305 inet->inet_id = prandom_u32();
307 if (tcp_fastopen_defer_connect(sk, &err))
312 err = tcp_connect(sk);
321 * This unhashes the socket and releases the local port,
324 tcp_set_state(sk, TCP_CLOSE);
326 sk->sk_route_caps = 0;
327 inet->inet_dport = 0;
330 EXPORT_SYMBOL(tcp_v4_connect);
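/* For reference, a minimal sketch of the userspace path that reaches
 * tcp_v4_connect() via the connect(2) system call (illustrative only,
 * error handling omitted):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */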
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
337 void tcp_v4_mtu_reduced(struct sock *sk)
339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 mtu = tcp_sk(sk)->mtu_info;
346 dst = inet_csk_update_pmtu(sk, mtu);
350 /* Something is about to be wrong... Remember the soft error
351 * in case this connection is not able to recover.
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 ip_sk_accept_pmtu(sk) &&
360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
378 dst->ops->redirect(dst, sk, skb);
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
391 if (seq != tcp_rsk(req)->snt_isn) {
392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 tcp_listendrop(req->rsk_listener);
405 EXPORT_SYMBOL(tcp_req_err);
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
416 if (sock_owned_by_user(sk))
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
442 tcp_retransmit_timer(sk);
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
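/* Summary of the RFC 6069 revert above: one exponential-backoff step is
 * undone, the RTO is recomputed from srtt, and the retransmit timer is
 * re-armed with whatever time remains; if that time has already elapsed
 * we retransmit immediately via tcp_retransmit_timer().
 */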
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
472 struct request_sock *fastopen;
475 struct net *net = dev_net(skb->dev);
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
484 if (sk->sk_state == TCP_TIME_WAIT) {
485 inet_twsk_put(inet_twsk(sk));
488 seq = ntohl(th->seq);
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
501 * We do take care of PMTU discovery (RFC1191) special case :
502 * we can receive locally generated ICMP messages while socket is held.
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 if (sk->sk_state == TCP_CLOSE)
511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518 fastopen = rcu_dereference(tp->fastopen_rsk);
519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 if (sk->sk_state != TCP_LISTEN &&
521 !between(seq, snd_una, tp->snd_nxt)) {
522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
528 if (!sock_owned_by_user(sk))
529 do_redirect(skb, sk);
531 case ICMP_SOURCE_QUENCH:
532 /* Just silently ignore these. */
534 case ICMP_PARAMETERPROB:
537 case ICMP_DEST_UNREACH:
538 if (code > NR_ICMP_UNREACH)
541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 /* We are not interested in TCP_LISTEN and open_requests
543 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
544 * they should go through unfragmented).
546 if (sk->sk_state == TCP_LISTEN)
550 if (!sock_owned_by_user(sk)) {
551 tcp_v4_mtu_reduced(sk);
553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
559 err = icmp_err_convert[code].errno;
560 /* check if this ICMP message allows revert of backoff.
564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 tcp_ld_RTO_revert(sk, seq);
567 case ICMP_TIME_EXCEEDED:
574 switch (sk->sk_state) {
577 /* Only in fast or simultaneous open. If a fast open socket is
578 * already accepted it is treated as a connected one below.
580 if (fastopen && !fastopen->sk)
583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
585 if (!sock_owned_by_user(sk)) {
588 sk->sk_error_report(sk);
592 sk->sk_err_soft = err;
597 /* If we've already connected we will keep trying
598 * until we time out, or the user gives up.
600 * RFC 1122 4.2.3.9 allows us to consider as hard errors
601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 * but it is obsoleted by PMTU discovery).
604 * Note that in the modern internet, where routing is unreliable
605 * and broken firewalls sit in every dark corner sending random
606 * errors ordered by their masters, even these two messages finally lose
607 * their original sense (even Linux sends invalid PORT_UNREACHs)
609 * Now we are in compliance with RFCs.
614 if (!sock_owned_by_user(sk) && inet->recverr) {
616 sk->sk_error_report(sk);
617 } else { /* Only an error on timeout */
618 sk->sk_err_soft = err;
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
629 struct tcphdr *th = tcp_hdr(skb);
631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 skb->csum_start = skb_transport_header(skb) - skb->head;
633 skb->csum_offset = offsetof(struct tcphdr, check);
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
639 const struct inet_sock *inet = inet_sk(sk);
641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
643 EXPORT_SYMBOL(tcp_v4_send_check);
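/* Note: __tcp_v4_send_check() only stores the complemented pseudo-header
 * checksum in th->check and records csum_start/csum_offset, so either the
 * NIC or the software fallback can complete the checksum over the TCP
 * header and payload.
 */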
646 * This routine will send an RST to the other TCP endpoint.
648 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
650 * Answer: if a packet caused RST, it is not for a socket
651 * existing in our system, if it is matched to a socket,
652 * it is just duplicate segment or bug in other side's TCP.
653 * So we build the reply based only on the parameters
654 * that arrived with the segment.
655 * Exception: precedence violation. We do not implement it in any case.
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
660 const struct tcphdr *th = tcp_hdr(skb);
663 #ifdef CONFIG_TCP_MD5SIG
664 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
667 struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669 struct tcp_md5sig_key *key = NULL;
670 const __u8 *hash_location = NULL;
671 unsigned char newhash[16];
673 struct sock *sk1 = NULL;
675 u64 transmit_time = 0;
679 /* Never send a reset in response to a reset. */
683 /* If sk is not NULL, it means we did a successful lookup and the incoming
684 * route had to be correct. prequeue might have dropped our dst.
686 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
689 /* Swap the send and the receive. */
690 memset(&rep, 0, sizeof(rep));
691 rep.th.dest = th->source;
692 rep.th.source = th->dest;
693 rep.th.doff = sizeof(struct tcphdr) / 4;
697 rep.th.seq = th->ack_seq;
700 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 skb->len - (th->doff << 2));
704 memset(&arg, 0, sizeof(arg));
705 arg.iov[0].iov_base = (unsigned char *)&rep;
706 arg.iov[0].iov_len = sizeof(rep.th);
708 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
711 hash_location = tcp_parse_md5sig_option(th);
712 if (sk && sk_fullsock(sk)) {
713 const union tcp_md5_addr *addr;
716 /* sdif set, means packet ingressed via a device
717 * in an L3 domain and inet_iif is set to it.
719 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 } else if (hash_location) {
723 const union tcp_md5_addr *addr;
724 int sdif = tcp_v4_sdif(skb);
725 int dif = inet_iif(skb);
729 * active side is lost. Try to find listening socket through
730 * source port, and then find md5 key through listening socket.
731 * We are not losing security here:
732 * the incoming packet is checked against the MD5 hash of the found key,
733 * and no RST is generated if the MD5 hash doesn't match.
735 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
737 th->source, ip_hdr(skb)->daddr,
738 ntohs(th->source), dif, sdif);
739 /* don't send an RST if we can't find a key */
743 /* sdif set, means packet ingressed via a device
744 * in an L3 domain and dif is set to it.
746 l3index = sdif ? dif : 0;
747 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 if (genhash || memcmp(hash_location, newhash, 16) != 0)
760 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
762 (TCPOPT_MD5SIG << 8) |
764 /* Update length and the length the header thinks exists */
765 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 rep.th.doff = arg.iov[0].iov_len / 4;
768 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 key, ip_hdr(skb)->saddr,
770 ip_hdr(skb)->daddr, &rep.th);
773 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 ip_hdr(skb)->saddr, /* XXX */
775 arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
779 /* When socket is gone, all binding information is lost.
780 * Routing might fail in this case. No choice here: if we choose to force
781 * the input interface, we will misroute in the case of an asymmetric route.
784 arg.bound_dev_if = sk->sk_bound_dev_if;
786 trace_tcp_send_reset(sk, skb);
789 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
792 arg.tos = ip_hdr(skb)->tos;
793 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
795 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
797 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 inet_twsk(sk)->tw_mark : sk->sk_mark;
799 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 inet_twsk(sk)->tw_priority : sk->sk_priority;
801 transmit_time = tcp_transmit_time(sk);
803 ip_send_unicast_reply(ctl_sk,
804 skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 &arg, arg.iov[0].iov_len,
810 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
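/* The RST above is emitted through the per-cpu control socket
 * (net->ipv4.tcp_sk) with ip_send_unicast_reply(), so no full socket is
 * required; sk may be NULL, a timewait socket or a full socket.
 */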
814 #ifdef CONFIG_TCP_MD5SIG
820 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
821 outside of socket context, is certainly ugly. What can I do?
824 static void tcp_v4_send_ack(const struct sock *sk,
825 struct sk_buff *skb, u32 seq, u32 ack,
826 u32 win, u32 tsval, u32 tsecr, int oif,
827 struct tcp_md5sig_key *key,
828 int reply_flags, u8 tos)
830 const struct tcphdr *th = tcp_hdr(skb);
833 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
839 struct net *net = sock_net(sk);
840 struct ip_reply_arg arg;
844 memset(&rep.th, 0, sizeof(struct tcphdr));
845 memset(&arg, 0, sizeof(arg));
847 arg.iov[0].iov_base = (unsigned char *)&rep;
848 arg.iov[0].iov_len = sizeof(rep.th);
850 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 (TCPOPT_TIMESTAMP << 8) |
853 rep.opt[1] = htonl(tsval);
854 rep.opt[2] = htonl(tsecr);
855 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
858 /* Swap the send and the receive. */
859 rep.th.dest = th->source;
860 rep.th.source = th->dest;
861 rep.th.doff = arg.iov[0].iov_len / 4;
862 rep.th.seq = htonl(seq);
863 rep.th.ack_seq = htonl(ack);
865 rep.th.window = htons(win);
867 #ifdef CONFIG_TCP_MD5SIG
869 int offset = (tsecr) ? 3 : 0;
871 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
873 (TCPOPT_MD5SIG << 8) |
875 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 rep.th.doff = arg.iov[0].iov_len/4;
878 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 key, ip_hdr(skb)->saddr,
880 ip_hdr(skb)->daddr, &rep.th);
883 arg.flags = reply_flags;
884 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 ip_hdr(skb)->saddr, /* XXX */
886 arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
889 arg.bound_dev_if = oif;
891 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
893 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 inet_twsk(sk)->tw_mark : sk->sk_mark;
896 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 inet_twsk(sk)->tw_priority : sk->sk_priority;
898 transmit_time = tcp_transmit_time(sk);
899 ip_send_unicast_reply(ctl_sk,
900 skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 &arg, arg.iov[0].iov_len,
906 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
912 struct inet_timewait_sock *tw = inet_twsk(sk);
913 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
915 tcp_v4_send_ack(sk, skb,
916 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
921 tcp_twsk_md5_key(tcptw),
922 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 struct request_sock *req)
932 const union tcp_md5_addr *addr;
935 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
938 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
942 * The window field (SEG.WND) of every outgoing segment, with the
943 * exception of <SYN> segments, MUST be right-shifted by
944 * Rcv.Wind.Shift bits:
946 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 tcp_v4_send_ack(sk, skb, seq,
949 tcp_rsk(req)->rcv_nxt,
950 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
954 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
960 * Send a SYN-ACK after having received a SYN.
961 * This still operates on a request_sock only, not on a big
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
966 struct request_sock *req,
967 struct tcp_fastopen_cookie *foc,
968 enum tcp_synack_type synack_type,
969 struct sk_buff *syn_skb)
971 const struct inet_request_sock *ireq = inet_rsk(req);
977 /* First, grab a route. */
978 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
981 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
983 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
984 tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
987 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
990 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
992 rcu_dereference(ireq->ireq_opt),
993 tos & ~INET_ECN_MASK);
995 err = net_xmit_eval(err);
1002 * IPv4 request_sock destructor.
1004 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1006 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1009 #ifdef CONFIG_TCP_MD5SIG
1011 * RFC2385 MD5 checksumming requires a mapping of
1012 * IP address->MD5 Key.
1013 * We need to maintain these in the sk structure.
1016 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1017 EXPORT_SYMBOL(tcp_md5_needed);
1019 /* Find the Key structure for an address. */
1020 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1021 const union tcp_md5_addr *addr,
1024 const struct tcp_sock *tp = tcp_sk(sk);
1025 struct tcp_md5sig_key *key;
1026 const struct tcp_md5sig_info *md5sig;
1028 struct tcp_md5sig_key *best_match = NULL;
1031 /* caller either holds rcu_read_lock() or socket lock */
1032 md5sig = rcu_dereference_check(tp->md5sig_info,
1033 lockdep_sock_is_held(sk));
1037 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1038 lockdep_sock_is_held(sk)) {
1039 if (key->family != family)
1041 if (key->l3index && key->l3index != l3index)
1043 if (family == AF_INET) {
1044 mask = inet_make_mask(key->prefixlen);
1045 match = (key->addr.a4.s_addr & mask) ==
1046 (addr->a4.s_addr & mask);
1047 #if IS_ENABLED(CONFIG_IPV6)
1048 } else if (family == AF_INET6) {
1049 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1056 if (match && (!best_match ||
1057 key->prefixlen > best_match->prefixlen))
1062 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1064 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1065 const union tcp_md5_addr *addr,
1066 int family, u8 prefixlen,
1069 const struct tcp_sock *tp = tcp_sk(sk);
1070 struct tcp_md5sig_key *key;
1071 unsigned int size = sizeof(struct in_addr);
1072 const struct tcp_md5sig_info *md5sig;
1074 /* caller either holds rcu_read_lock() or socket lock */
1075 md5sig = rcu_dereference_check(tp->md5sig_info,
1076 lockdep_sock_is_held(sk));
1079 #if IS_ENABLED(CONFIG_IPV6)
1080 if (family == AF_INET6)
1081 size = sizeof(struct in6_addr);
1083 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1084 lockdep_sock_is_held(sk)) {
1085 if (key->family != family)
1087 if (key->l3index && key->l3index != l3index)
1089 if (!memcmp(&key->addr, addr, size) &&
1090 key->prefixlen == prefixlen)
1096 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1097 const struct sock *addr_sk)
1099 const union tcp_md5_addr *addr;
1102 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1103 addr_sk->sk_bound_dev_if);
1104 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1105 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1107 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1109 /* This can be called on a newly created socket, from other files */
1110 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1111 int family, u8 prefixlen, int l3index,
1112 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1114 /* Add Key to the list */
1115 struct tcp_md5sig_key *key;
1116 struct tcp_sock *tp = tcp_sk(sk);
1117 struct tcp_md5sig_info *md5sig;
1119 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1121 /* Pre-existing entry - just update that one.
1122 * Note that the key might be used concurrently.
1123 * data_race() is telling KCSAN that we do not care about
1124 * key mismatches, since changing MD5 key on live flows
1125 * can lead to packet drops.
1127 data_race(memcpy(key->key, newkey, newkeylen));
1129 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1130 * Also note that a reader could catch new key->keylen value
1131 * but old key->key[], this is the reason we use __GFP_ZERO
1132 * at sock_kmalloc() time below these lines.
1134 WRITE_ONCE(key->keylen, newkeylen);
1139 md5sig = rcu_dereference_protected(tp->md5sig_info,
1140 lockdep_sock_is_held(sk));
1142 md5sig = kmalloc(sizeof(*md5sig), gfp);
1146 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1147 INIT_HLIST_HEAD(&md5sig->head);
1148 rcu_assign_pointer(tp->md5sig_info, md5sig);
1151 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1154 if (!tcp_alloc_md5sig_pool()) {
1155 sock_kfree_s(sk, key, sizeof(*key));
1159 memcpy(key->key, newkey, newkeylen);
1160 key->keylen = newkeylen;
1161 key->family = family;
1162 key->prefixlen = prefixlen;
1163 key->l3index = l3index;
1164 memcpy(&key->addr, addr,
1165 (family == AF_INET6) ? sizeof(struct in6_addr) :
1166 sizeof(struct in_addr));
1167 hlist_add_head_rcu(&key->node, &md5sig->head);
1170 EXPORT_SYMBOL(tcp_md5_do_add);
1172 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1173 u8 prefixlen, int l3index)
1175 struct tcp_md5sig_key *key;
1177 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1180 hlist_del_rcu(&key->node);
1181 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1182 kfree_rcu(key, rcu);
1185 EXPORT_SYMBOL(tcp_md5_do_del);
1187 static void tcp_clear_md5_list(struct sock *sk)
1189 struct tcp_sock *tp = tcp_sk(sk);
1190 struct tcp_md5sig_key *key;
1191 struct hlist_node *n;
1192 struct tcp_md5sig_info *md5sig;
1194 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1196 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1197 hlist_del_rcu(&key->node);
1198 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1199 kfree_rcu(key, rcu);
1203 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1204 sockptr_t optval, int optlen)
1206 struct tcp_md5sig cmd;
1207 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1208 const union tcp_md5_addr *addr;
1212 if (optlen < sizeof(cmd))
1215 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1218 if (sin->sin_family != AF_INET)
1221 if (optname == TCP_MD5SIG_EXT &&
1222 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1223 prefixlen = cmd.tcpm_prefixlen;
1228 if (optname == TCP_MD5SIG_EXT &&
1229 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1230 struct net_device *dev;
1233 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1234 if (dev && netif_is_l3_master(dev))
1235 l3index = dev->ifindex;
1239 /* ok to reference set/not set outside of rcu;
1240 * right now device MUST be an L3 master
1242 if (!dev || !l3index)
1246 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1248 if (!cmd.tcpm_keylen)
1249 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1251 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1254 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1255 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
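/* For reference, a minimal sketch of how userspace installs a key that
 * ends up in tcp_v4_parse_md5_keys() (illustrative only; error handling
 * omitted):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */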
1258 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1259 __be32 daddr, __be32 saddr,
1260 const struct tcphdr *th, int nbytes)
1262 struct tcp4_pseudohdr *bp;
1263 struct scatterlist sg;
1270 bp->protocol = IPPROTO_TCP;
1271 bp->len = cpu_to_be16(nbytes);
1273 _th = (struct tcphdr *)(bp + 1);
1274 memcpy(_th, th, sizeof(*th));
1277 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1278 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1279 sizeof(*bp) + sizeof(*th));
1280 return crypto_ahash_update(hp->md5_req);
1283 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1284 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1286 struct tcp_md5sig_pool *hp;
1287 struct ahash_request *req;
1289 hp = tcp_get_md5sig_pool();
1291 goto clear_hash_noput;
1294 if (crypto_ahash_init(req))
1296 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1298 if (tcp_md5_hash_key(hp, key))
1300 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1301 if (crypto_ahash_final(req))
1304 tcp_put_md5sig_pool();
1308 tcp_put_md5sig_pool();
1310 memset(md5_hash, 0, 16);
1314 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1315 const struct sock *sk,
1316 const struct sk_buff *skb)
1318 struct tcp_md5sig_pool *hp;
1319 struct ahash_request *req;
1320 const struct tcphdr *th = tcp_hdr(skb);
1321 __be32 saddr, daddr;
1323 if (sk) { /* valid for establish/request sockets */
1324 saddr = sk->sk_rcv_saddr;
1325 daddr = sk->sk_daddr;
1327 const struct iphdr *iph = ip_hdr(skb);
1332 hp = tcp_get_md5sig_pool();
1334 goto clear_hash_noput;
1337 if (crypto_ahash_init(req))
1340 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1342 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1344 if (tcp_md5_hash_key(hp, key))
1346 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1347 if (crypto_ahash_final(req))
1350 tcp_put_md5sig_pool();
1354 tcp_put_md5sig_pool();
1356 memset(md5_hash, 0, 16);
1359 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
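/* Per RFC 2385 the digest computed above covers the IPv4 pseudo-header,
 * the TCP header (with its checksum field treated as zero), the segment
 * payload and the key; tcp_v4_md5_hash_headers() supplies the first two
 * pieces.
 */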
1363 /* Called with rcu_read_lock() */
1364 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1365 const struct sk_buff *skb,
1368 #ifdef CONFIG_TCP_MD5SIG
1370 * This gets called for each TCP segment that arrives
1371 * so we want to be efficient.
1372 * We have 3 drop cases:
1373 * o No MD5 hash and one expected.
1374 * o MD5 hash and we're not expecting one.
1375 * o MD5 hash and it's wrong.
1377 const __u8 *hash_location = NULL;
1378 struct tcp_md5sig_key *hash_expected;
1379 const struct iphdr *iph = ip_hdr(skb);
1380 const struct tcphdr *th = tcp_hdr(skb);
1381 const union tcp_md5_addr *addr;
1382 unsigned char newhash[16];
1383 int genhash, l3index;
1385 /* sdif set, means packet ingressed via a device
1386 * in an L3 domain and dif is set to the l3mdev
1388 l3index = sdif ? dif : 0;
1390 addr = (union tcp_md5_addr *)&iph->saddr;
1391 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1392 hash_location = tcp_parse_md5sig_option(th);
1394 /* We've parsed the options - do we have a hash? */
1395 if (!hash_expected && !hash_location)
1398 if (hash_expected && !hash_location) {
1399 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1403 if (!hash_expected && hash_location) {
1404 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1408 /* Okay, so this is hash_expected and hash_location -
1409 * so we need to calculate the checksum.
1411 genhash = tcp_v4_md5_hash_skb(newhash,
1415 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1416 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1417 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1418 &iph->saddr, ntohs(th->source),
1419 &iph->daddr, ntohs(th->dest),
1420 genhash ? " tcp_v4_calc_md5_hash failed"
1429 static void tcp_v4_init_req(struct request_sock *req,
1430 const struct sock *sk_listener,
1431 struct sk_buff *skb)
1433 struct inet_request_sock *ireq = inet_rsk(req);
1434 struct net *net = sock_net(sk_listener);
1436 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1437 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1438 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1441 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1443 const struct request_sock *req)
1445 return inet_csk_route_req(sk, &fl->u.ip4, req);
1448 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1450 .obj_size = sizeof(struct tcp_request_sock),
1451 .rtx_syn_ack = tcp_rtx_synack,
1452 .send_ack = tcp_v4_reqsk_send_ack,
1453 .destructor = tcp_v4_reqsk_destructor,
1454 .send_reset = tcp_v4_send_reset,
1455 .syn_ack_timeout = tcp_syn_ack_timeout,
1458 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1459 .mss_clamp = TCP_MSS_DEFAULT,
1460 #ifdef CONFIG_TCP_MD5SIG
1461 .req_md5_lookup = tcp_v4_md5_lookup,
1462 .calc_md5_hash = tcp_v4_md5_hash_skb,
1464 .init_req = tcp_v4_init_req,
1465 #ifdef CONFIG_SYN_COOKIES
1466 .cookie_init_seq = cookie_v4_init_sequence,
1468 .route_req = tcp_v4_route_req,
1469 .init_seq = tcp_v4_init_seq,
1470 .init_ts_off = tcp_v4_init_ts_off,
1471 .send_synack = tcp_v4_send_synack,
1474 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1476 /* Never answer SYNs sent to broadcast or multicast */
1477 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1480 return tcp_conn_request(&tcp_request_sock_ops,
1481 &tcp_request_sock_ipv4_ops, sk, skb);
1487 EXPORT_SYMBOL(tcp_v4_conn_request);
1491 * The three way handshake has completed - we got a valid synack -
1492 * now create the new socket.
1494 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1495 struct request_sock *req,
1496 struct dst_entry *dst,
1497 struct request_sock *req_unhash,
1500 struct inet_request_sock *ireq;
1501 struct inet_sock *newinet;
1502 struct tcp_sock *newtp;
1504 #ifdef CONFIG_TCP_MD5SIG
1505 const union tcp_md5_addr *addr;
1506 struct tcp_md5sig_key *key;
1509 struct ip_options_rcu *inet_opt;
1511 if (sk_acceptq_is_full(sk))
1514 newsk = tcp_create_openreq_child(sk, req, skb);
1518 newsk->sk_gso_type = SKB_GSO_TCPV4;
1519 inet_sk_rx_dst_set(newsk, skb);
1521 newtp = tcp_sk(newsk);
1522 newinet = inet_sk(newsk);
1523 ireq = inet_rsk(req);
1524 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1525 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1526 newsk->sk_bound_dev_if = ireq->ir_iif;
1527 newinet->inet_saddr = ireq->ir_loc_addr;
1528 inet_opt = rcu_dereference(ireq->ireq_opt);
1529 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1530 newinet->mc_index = inet_iif(skb);
1531 newinet->mc_ttl = ip_hdr(skb)->ttl;
1532 newinet->rcv_tos = ip_hdr(skb)->tos;
1533 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1535 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1536 newinet->inet_id = prandom_u32();
1538 /* Set ToS of the new socket based upon the value of incoming SYN. */
1539 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1540 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1543 dst = inet_csk_route_child_sock(sk, newsk, req);
1547 /* syncookie case : see end of cookie_v4_check() */
1549 sk_setup_caps(newsk, dst);
1551 tcp_ca_openreq_child(newsk, dst);
1553 tcp_sync_mss(newsk, dst_mtu(dst));
1554 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1556 tcp_initialize_rcv_mss(newsk);
1558 #ifdef CONFIG_TCP_MD5SIG
1559 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1560 /* Copy over the MD5 key from the original socket */
1561 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1562 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1565 * We're using one, so create a matching key
1566 * on the newsk structure. If we fail to get
1567 * memory, then we end up not copying the key
1570 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1571 key->key, key->keylen, GFP_ATOMIC);
1572 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1576 if (__inet_inherit_port(sk, newsk) < 0)
1578 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1579 if (likely(*own_req)) {
1580 tcp_move_syn(newtp, req);
1581 ireq->ireq_opt = NULL;
1583 newinet->inet_opt = NULL;
1588 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1595 newinet->inet_opt = NULL;
1596 inet_csk_prepare_forced_close(newsk);
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
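/* Note: the child socket created above inherits the listener's IP options
 * and (when configured) its MD5 key, and only becomes visible to lookups
 * once inet_ehash_nolisten() has hashed it and *own_req is set.
 */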
1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1604 #ifdef CONFIG_SYN_COOKIES
1605 const struct tcphdr *th = tcp_hdr(skb);
1608 sk = cookie_v4_check(sk, skb);
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614 struct tcphdr *th, u32 *cookie)
1617 #ifdef CONFIG_SYN_COOKIES
1618 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619 &tcp_request_sock_ipv4_ops, sk, th);
1621 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622 tcp_synq_overflow(sk);
1628 /* The socket must have its spinlock held when we get
1629 * here, unless it is a TCP_LISTEN socket.
1631 * We have a potential double-lock case here, so even when
1632 * doing backlog processing we use the BH locking scheme.
1633 * This is because we cannot sleep with the original spinlock
1636 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1640 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1641 struct dst_entry *dst = sk->sk_rx_dst;
1643 sock_rps_save_rxhash(sk, skb);
1644 sk_mark_napi_id(sk, skb);
1646 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1647 !dst->ops->check(dst, 0)) {
1649 sk->sk_rx_dst = NULL;
1652 tcp_rcv_established(sk, skb);
1656 if (tcp_checksum_complete(skb))
1659 if (sk->sk_state == TCP_LISTEN) {
1660 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1665 if (tcp_child_process(sk, nsk, skb)) {
1672 sock_rps_save_rxhash(sk, skb);
1674 if (tcp_rcv_state_process(sk, skb)) {
1681 tcp_v4_send_reset(rsk, skb);
1684 /* Be careful here. If this function gets more complicated and
1685 * gcc suffers from register pressure on the x86, sk (in %ebx)
1686 * might be destroyed here. This current version compiles correctly,
1687 * but you have been warned.
1692 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1693 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1696 EXPORT_SYMBOL(tcp_v4_do_rcv);
1698 int tcp_v4_early_demux(struct sk_buff *skb)
1700 const struct iphdr *iph;
1701 const struct tcphdr *th;
1704 if (skb->pkt_type != PACKET_HOST)
1707 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1713 if (th->doff < sizeof(struct tcphdr) / 4)
1716 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1717 iph->saddr, th->source,
1718 iph->daddr, ntohs(th->dest),
1719 skb->skb_iif, inet_sdif(skb));
1722 skb->destructor = sock_edemux;
1723 if (sk_fullsock(sk)) {
1724 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1727 dst = dst_check(dst, 0);
1729 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1730 skb_dst_set_noref(skb, dst);
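/* Early demux runs before routing: if an established socket is found,
 * its cached rx_dst is attached to the skb so the normal route lookup
 * can be skipped for this packet.
 */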
1736 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1738 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1739 struct skb_shared_info *shinfo;
1740 const struct tcphdr *th;
1741 struct tcphdr *thtail;
1742 struct sk_buff *tail;
1743 unsigned int hdrlen;
1748 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1749 * we can fix skb->truesize to its real value to avoid future drops.
1750 * This is valid because skb is not yet charged to the socket.
1751 * It has been noticed that pure SACK packets were sometimes dropped
1752 * (if cooked by drivers without copybreak feature).
1758 if (unlikely(tcp_checksum_complete(skb))) {
1760 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1761 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1765 /* Attempt coalescing to last skb in backlog, even if we are
1767 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1769 th = (const struct tcphdr *)skb->data;
1770 hdrlen = th->doff * 4;
1771 shinfo = skb_shinfo(skb);
1773 if (!shinfo->gso_size)
1774 shinfo->gso_size = skb->len - hdrlen;
1776 if (!shinfo->gso_segs)
1777 shinfo->gso_segs = 1;
1779 tail = sk->sk_backlog.tail;
1782 thtail = (struct tcphdr *)tail->data;
1784 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1785 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1786 ((TCP_SKB_CB(tail)->tcp_flags |
1787 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1788 !((TCP_SKB_CB(tail)->tcp_flags &
1789 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1790 ((TCP_SKB_CB(tail)->tcp_flags ^
1791 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1792 #ifdef CONFIG_TLS_DEVICE
1793 tail->decrypted != skb->decrypted ||
1795 thtail->doff != th->doff ||
1796 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1799 __skb_pull(skb, hdrlen);
1800 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1801 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1803 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1804 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1805 thtail->window = th->window;
1808 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1809 * thtail->fin, so that the fast path in tcp_rcv_established()
1810 * is not entered if we append a packet with a FIN.
1811 * SYN, RST, URG are not present.
1812 * ACK is set on both packets.
1813 * PSH : we do not really care in TCP stack,
1814 * at least for 'GRO' packets.
1816 thtail->fin |= th->fin;
1817 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1819 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1820 TCP_SKB_CB(tail)->has_rxtstamp = true;
1821 tail->tstamp = skb->tstamp;
1822 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1825 /* Not as strict as GRO. We only need to carry mss max value */
1826 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1827 skb_shinfo(tail)->gso_size);
1829 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1830 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1832 sk->sk_backlog.len += delta;
1833 __NET_INC_STATS(sock_net(sk),
1834 LINUX_MIB_TCPBACKLOGCOALESCE);
1835 kfree_skb_partial(skb, fragstolen);
1838 __skb_push(skb, hdrlen);
1841 /* Only the socket owner can try to collapse/prune rx queues
1842 * to reduce memory overhead, so add a little headroom here.
1843 * Only a few socket backlogs are likely to be non-empty concurrently.
1847 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1849 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1854 EXPORT_SYMBOL(tcp_add_backlog);
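/* Backlog coalescing above only merges segments that are contiguous in
 * sequence space, carry compatible flags (ACK set, no SYN/RST/URG) and
 * identical header options; this keeps the backlog short under bursts of
 * small packets without changing what the receiver ultimately sees.
 */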
1856 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1858 struct tcphdr *th = (struct tcphdr *)skb->data;
1860 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1862 EXPORT_SYMBOL(tcp_filter);
1864 static void tcp_v4_restore_cb(struct sk_buff *skb)
1866 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1867 sizeof(struct inet_skb_parm));
1870 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1871 const struct tcphdr *th)
1873 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1874 * barrier() makes sure the compiler won't play fool^Waliasing games.
1876 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1877 sizeof(struct inet_skb_parm));
1880 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1881 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1882 skb->len - th->doff * 4);
1883 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1884 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1885 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1886 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1887 TCP_SKB_CB(skb)->sacked = 0;
1888 TCP_SKB_CB(skb)->has_rxtstamp =
1889 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1896 int tcp_v4_rcv(struct sk_buff *skb)
1898 struct net *net = dev_net(skb->dev);
1899 struct sk_buff *skb_to_free;
1900 int sdif = inet_sdif(skb);
1901 int dif = inet_iif(skb);
1902 const struct iphdr *iph;
1903 const struct tcphdr *th;
1908 if (skb->pkt_type != PACKET_HOST)
1911 /* Count it even if it's bad */
1912 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1914 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1917 th = (const struct tcphdr *)skb->data;
1919 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1921 if (!pskb_may_pull(skb, th->doff * 4))
1924 /* An explanation is required here, I think.
1925 * Packet length and doff are validated by header prediction,
1926 * provided case of th->doff==0 is eliminated.
1927 * So, we defer the checks. */
1929 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1932 th = (const struct tcphdr *)skb->data;
1935 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1936 th->dest, sdif, &refcounted);
1941 if (sk->sk_state == TCP_TIME_WAIT)
1944 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1945 struct request_sock *req = inet_reqsk(sk);
1946 bool req_stolen = false;
1949 sk = req->rsk_listener;
1950 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1951 sk_drops_add(sk, skb);
1955 if (tcp_checksum_complete(skb)) {
1959 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1960 inet_csk_reqsk_queue_drop_and_put(sk, req);
1963 /* We own a reference on the listener, increase it again
1964 * as we might lose it too soon.
1969 if (!tcp_filter(sk, skb)) {
1970 th = (const struct tcphdr *)skb->data;
1972 tcp_v4_fill_cb(skb, iph, th);
1973 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1978 /* Another cpu got exclusive access to req
1979 * and created a full blown socket.
1980 * Try to feed this packet to this socket
1981 * instead of discarding it.
1983 tcp_v4_restore_cb(skb);
1987 goto discard_and_relse;
1991 tcp_v4_restore_cb(skb);
1992 } else if (tcp_child_process(sk, nsk, skb)) {
1993 tcp_v4_send_reset(nsk, skb);
1994 goto discard_and_relse;
2000 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2001 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2002 goto discard_and_relse;
2005 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2006 goto discard_and_relse;
2008 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2009 goto discard_and_relse;
2013 if (tcp_filter(sk, skb))
2014 goto discard_and_relse;
2015 th = (const struct tcphdr *)skb->data;
2017 tcp_v4_fill_cb(skb, iph, th);
2021 if (sk->sk_state == TCP_LISTEN) {
2022 ret = tcp_v4_do_rcv(sk, skb);
2023 goto put_and_return;
2026 sk_incoming_cpu_update(sk);
2028 bh_lock_sock_nested(sk);
2029 tcp_segs_in(tcp_sk(sk), skb);
2031 if (!sock_owned_by_user(sk)) {
2032 skb_to_free = sk->sk_rx_skb_cache;
2033 sk->sk_rx_skb_cache = NULL;
2034 ret = tcp_v4_do_rcv(sk, skb);
2036 if (tcp_add_backlog(sk, skb))
2037 goto discard_and_relse;
2042 __kfree_skb(skb_to_free);
2051 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2054 tcp_v4_fill_cb(skb, iph, th);
2056 if (tcp_checksum_complete(skb)) {
2058 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2060 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2062 tcp_v4_send_reset(NULL, skb);
2066 /* Discard frame. */
2071 sk_drops_add(sk, skb);
2077 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2078 inet_twsk_put(inet_twsk(sk));
2082 tcp_v4_fill_cb(skb, iph, th);
2084 if (tcp_checksum_complete(skb)) {
2085 inet_twsk_put(inet_twsk(sk));
2088 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2090 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2093 iph->saddr, th->source,
2094 iph->daddr, th->dest,
2098 inet_twsk_deschedule_put(inet_twsk(sk));
2100 tcp_v4_restore_cb(skb);
2108 tcp_v4_timewait_ack(sk, skb);
2111 tcp_v4_send_reset(sk, skb);
2112 inet_twsk_deschedule_put(inet_twsk(sk));
2114 case TCP_TW_SUCCESS:;
2119 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2120 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2121 .twsk_unique = tcp_twsk_unique,
2122 .twsk_destructor= tcp_twsk_destructor,
2125 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2127 struct dst_entry *dst = skb_dst(skb);
2129 if (dst && dst_hold_safe(dst)) {
2130 sk->sk_rx_dst = dst;
2131 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2134 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2136 const struct inet_connection_sock_af_ops ipv4_specific = {
2137 .queue_xmit = ip_queue_xmit,
2138 .send_check = tcp_v4_send_check,
2139 .rebuild_header = inet_sk_rebuild_header,
2140 .sk_rx_dst_set = inet_sk_rx_dst_set,
2141 .conn_request = tcp_v4_conn_request,
2142 .syn_recv_sock = tcp_v4_syn_recv_sock,
2143 .net_header_len = sizeof(struct iphdr),
2144 .setsockopt = ip_setsockopt,
2145 .getsockopt = ip_getsockopt,
2146 .addr2sockaddr = inet_csk_addr2sockaddr,
2147 .sockaddr_len = sizeof(struct sockaddr_in),
2148 .mtu_reduced = tcp_v4_mtu_reduced,
2150 EXPORT_SYMBOL(ipv4_specific);
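/* ipv4_specific is the address-family glue: protocol-independent TCP code
 * reaches the IPv4 implementation through icsk->icsk_af_ops, which
 * tcp_v4_init_sock() points at this structure.
 */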
2152 #ifdef CONFIG_TCP_MD5SIG
2153 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2154 .md5_lookup = tcp_v4_md5_lookup,
2155 .calc_md5_hash = tcp_v4_md5_hash_skb,
2156 .md5_parse = tcp_v4_parse_md5_keys,
2160 /* NOTE: A lot of things are set to zero explicitly by the call to
2161 * sk_alloc(), so they need not be done here.
2163 static int tcp_v4_init_sock(struct sock *sk)
2165 struct inet_connection_sock *icsk = inet_csk(sk);
2169 icsk->icsk_af_ops = &ipv4_specific;
2171 #ifdef CONFIG_TCP_MD5SIG
2172 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2178 void tcp_v4_destroy_sock(struct sock *sk)
2180 struct tcp_sock *tp = tcp_sk(sk);
2182 trace_tcp_destroy_sock(sk);
2184 tcp_clear_xmit_timers(sk);
2186 tcp_cleanup_congestion_control(sk);
2188 tcp_cleanup_ulp(sk);
2190 /* Clean up the write buffer. */
2191 tcp_write_queue_purge(sk);
2193 /* Check if we want to disable active TFO */
2194 tcp_fastopen_active_disable_ofo_check(sk);
2196 /* Cleans up our, hopefully empty, out_of_order_queue. */
2197 skb_rbtree_purge(&tp->out_of_order_queue);
2199 #ifdef CONFIG_TCP_MD5SIG
2200 /* Clean up the MD5 key list, if any */
2201 if (tp->md5sig_info) {
2202 tcp_clear_md5_list(sk);
2203 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2204 tp->md5sig_info = NULL;
2208 /* Clean up a referenced TCP bind bucket. */
2209 if (inet_csk(sk)->icsk_bind_hash)
2212 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2214 /* If socket is aborted during connect operation */
2215 tcp_free_fastopen_req(tp);
2216 tcp_fastopen_destroy_cipher(sk);
2217 tcp_saved_syn_free(tp);
2219 sk_sockets_allocated_dec(sk);
2221 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2223 #ifdef CONFIG_PROC_FS
2224 /* Proc filesystem TCP sock list dumping. */
2227 * Get the next listener socket following cur. If cur is NULL, get the first socket
2228 * starting from bucket given in st->bucket; when st->bucket is zero the
2229 * very first socket in the hash table is returned.
2231 static void *listening_get_next(struct seq_file *seq, void *cur)
2233 struct tcp_seq_afinfo *afinfo;
2234 struct tcp_iter_state *st = seq->private;
2235 struct net *net = seq_file_net(seq);
2236 struct inet_listen_hashbucket *ilb;
2237 struct hlist_nulls_node *node;
2238 struct sock *sk = cur;
2240 if (st->bpf_seq_afinfo)
2241 afinfo = st->bpf_seq_afinfo;
2243 afinfo = PDE_DATA(file_inode(seq->file));
2247 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2248 spin_lock(&ilb->lock);
2249 sk = sk_nulls_head(&ilb->nulls_head);
2253 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2257 sk = sk_nulls_next(sk);
2259 sk_nulls_for_each_from(sk, node) {
2260 if (!net_eq(sock_net(sk), net))
2262 if (afinfo->family == AF_UNSPEC ||
2263 sk->sk_family == afinfo->family)
2266 spin_unlock(&ilb->lock);
2268 if (++st->bucket < INET_LHTABLE_SIZE)
2273 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2275 struct tcp_iter_state *st = seq->private;
2280 rc = listening_get_next(seq, NULL);
2282 while (rc && *pos) {
2283 rc = listening_get_next(seq, rc);
2289 static inline bool empty_bucket(const struct tcp_iter_state *st)
2291 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2295 * Get first established socket starting from bucket given in st->bucket.
2296 * If st->bucket is zero, the very first socket in the hash is returned.
2298 static void *established_get_first(struct seq_file *seq)
2300 struct tcp_seq_afinfo *afinfo;
2301 struct tcp_iter_state *st = seq->private;
2302 struct net *net = seq_file_net(seq);
2305 if (st->bpf_seq_afinfo)
2306 afinfo = st->bpf_seq_afinfo;
2308 afinfo = PDE_DATA(file_inode(seq->file));
2311 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2313 struct hlist_nulls_node *node;
2314 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2316 /* Lockless fast path for the common case of empty buckets */
2317 if (empty_bucket(st))
2321 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2322 if ((afinfo->family != AF_UNSPEC &&
2323 sk->sk_family != afinfo->family) ||
2324 !net_eq(sock_net(sk), net)) {
2330 spin_unlock_bh(lock);
2336 static void *established_get_next(struct seq_file *seq, void *cur)
2338 struct tcp_seq_afinfo *afinfo;
2339 struct sock *sk = cur;
2340 struct hlist_nulls_node *node;
2341 struct tcp_iter_state *st = seq->private;
2342 struct net *net = seq_file_net(seq);
2344 if (st->bpf_seq_afinfo)
2345 afinfo = st->bpf_seq_afinfo;
2347 afinfo = PDE_DATA(file_inode(seq->file));
2352 sk = sk_nulls_next(sk);
2354 sk_nulls_for_each_from(sk, node) {
2355 if ((afinfo->family == AF_UNSPEC ||
2356 sk->sk_family == afinfo->family) &&
2357 net_eq(sock_net(sk), net))
2361 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2363 return established_get_first(seq);
2366 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2368 struct tcp_iter_state *st = seq->private;
2372 rc = established_get_first(seq);
2375 rc = established_get_next(seq, rc);
2381 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2382 {
2383 void *rc;
2384 struct tcp_iter_state *st = seq->private;
2386 st->state = TCP_SEQ_STATE_LISTENING;
2387 rc = listening_get_idx(seq, &pos);
2389 if (!rc) {
2390 st->state = TCP_SEQ_STATE_ESTABLISHED;
2391 rc = established_get_idx(seq, pos);
2392 }
2394 return rc;
2395 }
2397 static void *tcp_seek_last_pos(struct seq_file *seq)
2399 struct tcp_iter_state *st = seq->private;
2400 int offset = st->offset;
2401 int orig_num = st->num;
2402 void *rc = NULL;
2404 switch (st->state) {
2405 case TCP_SEQ_STATE_LISTENING:
2406 if (st->bucket >= INET_LHTABLE_SIZE)
2407 break;
2408 st->state = TCP_SEQ_STATE_LISTENING;
2409 rc = listening_get_next(seq, NULL);
2410 while (offset-- && rc)
2411 rc = listening_get_next(seq, rc);
2412 if (rc)
2413 break;
2414 st->bucket = 0;
2415 st->state = TCP_SEQ_STATE_ESTABLISHED;
2416 fallthrough;
2417 case TCP_SEQ_STATE_ESTABLISHED:
2418 if (st->bucket > tcp_hashinfo.ehash_mask)
2419 break;
2420 rc = established_get_first(seq);
2421 while (offset-- && rc)
2422 rc = established_get_next(seq, rc);
2423 }
2425 st->num = orig_num;
2427 return rc;
2428 }
2430 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2431 {
2432 struct tcp_iter_state *st = seq->private;
2433 void *rc;
2435 if (*pos && *pos == st->last_pos) {
2436 rc = tcp_seek_last_pos(seq);
2437 if (rc)
2438 goto out;
2439 }
2441 st->state = TCP_SEQ_STATE_LISTENING;
2442 st->num = 0;
2443 st->bucket = 0;
2444 st->offset = 0;
2445 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2447 out:
2448 st->last_pos = *pos;
2449 return rc;
2450 }
2451 EXPORT_SYMBOL(tcp_seq_start);
2453 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2454 {
2455 struct tcp_iter_state *st = seq->private;
2456 void *rc = NULL;
2458 if (v == SEQ_START_TOKEN) {
2459 rc = tcp_get_idx(seq, 0);
2460 goto out;
2461 }
2463 switch (st->state) {
2464 case TCP_SEQ_STATE_LISTENING:
2465 rc = listening_get_next(seq, v);
2466 if (!rc) {
2467 st->state = TCP_SEQ_STATE_ESTABLISHED;
2468 st->bucket = 0;
2469 st->offset = 0;
2470 rc = established_get_first(seq);
2471 }
2472 break;
2473 case TCP_SEQ_STATE_ESTABLISHED:
2474 rc = established_get_next(seq, v);
2475 break;
2476 }
2477 out:
2478 ++*pos;
2479 st->last_pos = *pos;
2480 return rc;
2481 }
2482 EXPORT_SYMBOL(tcp_seq_next);
2484 void tcp_seq_stop(struct seq_file *seq, void *v)
2486 struct tcp_iter_state *st = seq->private;
2488 switch (st->state) {
2489 case TCP_SEQ_STATE_LISTENING:
2490 if (v != SEQ_START_TOKEN)
2491 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2492 break;
2493 case TCP_SEQ_STATE_ESTABLISHED:
2494 if (v)
2495 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2496 break;
2497 }
2498 }
2499 EXPORT_SYMBOL(tcp_seq_stop);
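/*
 * Reader's note (simplified sketch, not taken from this file): the seq_file
 * core in fs/seq_file.c drives the three exported hooks above roughly like
 *
 *	p = start(seq, &pos);
 *	while (p && !seq_has_overflowed(seq)) {
 *		show(seq, p);
 *		p = next(seq, p, &pos);
 *	}
 *	stop(seq, p);
 *
 * for every read() of the proc file, which is why tcp_seq_start() records
 * st->last_pos and tries to resume from it on the following read.
 */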
2501 static void get_openreq4(const struct request_sock *req,
2502 struct seq_file *f, int i)
2503 {
2504 const struct inet_request_sock *ireq = inet_rsk(req);
2505 long delta = req->rsk_timer.expires - jiffies;
2507 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2508 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2513 ntohs(ireq->ir_rmt_port),
2515 0, 0, /* could print option size, but that is af dependent. */
2516 1, /* timers active (only the expire timer) */
2517 jiffies_delta_to_clock_t(delta),
2519 from_kuid_munged(seq_user_ns(f),
2520 sock_i_uid(req->rsk_listener)),
2521 0, /* non standard timer */
2522 0, /* open_requests have no inode */
2527 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2528 {
2529 int timer_active;
2530 unsigned long timer_expires;
2531 const struct tcp_sock *tp = tcp_sk(sk);
2532 const struct inet_connection_sock *icsk = inet_csk(sk);
2533 const struct inet_sock *inet = inet_sk(sk);
2534 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2535 __be32 dest = inet->inet_daddr;
2536 __be32 src = inet->inet_rcv_saddr;
2537 __u16 destp = ntohs(inet->inet_dport);
2538 __u16 srcp = ntohs(inet->inet_sport);
2539 int rx_queue;
2540 int state;
2542 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2543 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2544 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2545 timer_active = 1;
2546 timer_expires = icsk->icsk_timeout;
2547 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2548 timer_active = 4;
2549 timer_expires = icsk->icsk_timeout;
2550 } else if (timer_pending(&sk->sk_timer)) {
2551 timer_active = 2;
2552 timer_expires = sk->sk_timer.expires;
2553 } else {
2554 timer_active = 0;
2555 timer_expires = jiffies;
2556 }
2558 state = inet_sk_state_load(sk);
2559 if (state == TCP_LISTEN)
2560 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2561 else
2562 /* Because we don't lock the socket,
2563 * we might find a transient negative value.
2564 */
2565 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2566 READ_ONCE(tp->copied_seq), 0);
2568 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2569 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2570 i, src, srcp, dest, destp, state,
2571 READ_ONCE(tp->write_seq) - tp->snd_una,
2572 rx_queue,
2573 timer_active,
2574 jiffies_delta_to_clock_t(timer_expires - jiffies),
2575 icsk->icsk_retransmits,
2576 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2577 icsk->icsk_probes_out,
2578 sock_i_ino(sk),
2579 refcount_read(&sk->sk_refcnt), sk,
2580 jiffies_to_clock_t(icsk->icsk_rto),
2581 jiffies_to_clock_t(icsk->icsk_ack.ato),
2582 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2583 tp->snd_cwnd,
2584 state == TCP_LISTEN ?
2585 fastopenq->max_qlen :
2586 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2587 }
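/*
 * Reader's note (sketch): the line built above matches the header emitted in
 * tcp4_seq_show(): "sl", hex local address:port, hex remote address:port,
 * state, tx_queue:rx_queue, tr:tm->when (timer kind and expiry), retrnsmt,
 * uid, timeout (probes out), inode, then the unlabeled trailing fields
 * (refcount, socket pointer, rto, ato, quickack/pingpong bits, snd_cwnd and
 * the ssthresh/fastopen column). A minimal userspace parse of the leading
 * fields could look like this (illustrative only, assumes ordinary stdio):
 */
#if 0	/* userspace example only */
#include <stdio.h>

static void parse_tcp_line(const char *line)
{
	unsigned int sl, laddr, lport, raddr, rport, state;

	/* mirrors the "%4d: %08X:%04X %08X:%04X %02X ..." format above */
	if (sscanf(line, "%u: %x:%x %x:%x %x",
		   &sl, &laddr, &lport, &raddr, &rport, &state) == 6)
		printf("entry %u: state %#x, local port %u\n", sl, state, lport);
}
#endif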
2589 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2590 struct seq_file *f, int i)
2591 {
2592 long delta = tw->tw_timer.expires - jiffies;
2593 __be32 dest, src;
2594 __u16 destp, srcp;
2596 dest = tw->tw_daddr;
2597 src = tw->tw_rcv_saddr;
2598 destp = ntohs(tw->tw_dport);
2599 srcp = ntohs(tw->tw_sport);
2601 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2602 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2603 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2604 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2605 refcount_read(&tw->tw_refcnt), tw);
2606 }
2608 #define TMPSZ 150
2610 static int tcp4_seq_show(struct seq_file *seq, void *v)
2611 {
2612 struct tcp_iter_state *st;
2613 struct sock *sk = v;
2615 seq_setwidth(seq, TMPSZ - 1);
2616 if (v == SEQ_START_TOKEN) {
2617 seq_puts(seq, " sl local_address rem_address st tx_queue "
2618 "rx_queue tr tm->when retrnsmt uid timeout "
2624 if (sk->sk_state == TCP_TIME_WAIT)
2625 get_timewait4_sock(v, seq, st->num);
2626 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2627 get_openreq4(v, seq, st->num);
2629 get_tcp4_sock(v, seq, st->num);
2635 #ifdef CONFIG_BPF_SYSCALL
2636 struct bpf_iter__tcp {
2637 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2638 __bpf_md_ptr(struct sock_common *, sk_common);
2639 uid_t uid __aligned(8);
2640 };
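/*
 * Illustrative sketch (not part of this file): a BPF program attached to the
 * "tcp" iterator target registered below receives the context above once per
 * socket. The helper names follow the usual libbpf/selftests conventions
 * (SEC(), BPF_SEQ_PRINTF from the tracing helpers); treat this as a hedged
 * example rather than a reference implementation.
 */
#if 0	/* BPF program source, built separately against vmlinux.h + libbpf */
SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!skc)
		return 0;
	BPF_SEQ_PRINTF(seq, "family=%d state=%d uid=%u\n",
		       skc->skc_family, skc->skc_state, ctx->uid);
	return 0;
}
#endif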
2642 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2643 struct sock_common *sk_common, uid_t uid)
2644 {
2645 struct bpf_iter__tcp ctx;
2647 meta->seq_num--; /* skip SEQ_START_TOKEN */
2648 ctx.meta = meta;
2649 ctx.sk_common = sk_common;
2650 ctx.uid = uid;
2651 return bpf_iter_run_prog(prog, &ctx);
2652 }
2654 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2656 struct bpf_iter_meta meta;
2657 struct bpf_prog *prog;
2658 struct sock *sk = v;
2659 uid_t uid;
2661 if (v == SEQ_START_TOKEN)
2662 return 0;
2664 if (sk->sk_state == TCP_TIME_WAIT) {
2665 uid = 0;
2666 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2667 const struct request_sock *req = v;
2669 uid = from_kuid_munged(seq_user_ns(seq),
2670 sock_i_uid(req->rsk_listener));
2671 } else {
2672 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2673 }
2675 meta.seq = seq;
2676 prog = bpf_iter_get_info(&meta, false);
2677 return tcp_prog_seq_show(prog, &meta, v, uid);
2678 }
2680 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2682 struct bpf_iter_meta meta;
2683 struct bpf_prog *prog;
2685 if (v != SEQ_START_TOKEN) {
2686 meta.seq = seq;
2687 prog = bpf_iter_get_info(&meta, true);
2688 if (prog)
2689 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2690 }
2692 tcp_seq_stop(seq, v);
2693 }
2695 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2696 .show = bpf_iter_tcp_seq_show,
2697 .start = tcp_seq_start,
2698 .next = tcp_seq_next,
2699 .stop = bpf_iter_tcp_seq_stop,
2700 };
2703 static const struct seq_operations tcp4_seq_ops = {
2704 .show = tcp4_seq_show,
2705 .start = tcp_seq_start,
2706 .next = tcp_seq_next,
2707 .stop = tcp_seq_stop,
2708 };
2710 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2711 .family = AF_INET,
2712 };
2714 static int __net_init tcp4_proc_init_net(struct net *net)
2715 {
2716 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2717 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2718 return -ENOMEM;
2720 return 0;
2721 }
2722 static void __net_exit tcp4_proc_exit_net(struct net *net)
2723 {
2724 remove_proc_entry("tcp", net->proc_net);
2725 }
2727 static struct pernet_operations tcp4_net_ops = {
2728 .init = tcp4_proc_init_net,
2729 .exit = tcp4_proc_exit_net,
2730 };
2732 int __init tcp4_proc_init(void)
2733 {
2734 return register_pernet_subsys(&tcp4_net_ops);
2735 }
2737 void tcp4_proc_exit(void)
2738 {
2739 unregister_pernet_subsys(&tcp4_net_ops);
2740 }
2741 #endif /* CONFIG_PROC_FS */
2743 struct proto tcp_prot = {
2744 .name = "TCP",
2745 .owner = THIS_MODULE,
2746 .close = tcp_close,
2747 .pre_connect = tcp_v4_pre_connect,
2748 .connect = tcp_v4_connect,
2749 .disconnect = tcp_disconnect,
2750 .accept = inet_csk_accept,
2751 .ioctl = tcp_ioctl,
2752 .init = tcp_v4_init_sock,
2753 .destroy = tcp_v4_destroy_sock,
2754 .shutdown = tcp_shutdown,
2755 .setsockopt = tcp_setsockopt,
2756 .getsockopt = tcp_getsockopt,
2757 .keepalive = tcp_set_keepalive,
2758 .recvmsg = tcp_recvmsg,
2759 .sendmsg = tcp_sendmsg,
2760 .sendpage = tcp_sendpage,
2761 .backlog_rcv = tcp_v4_do_rcv,
2762 .release_cb = tcp_release_cb,
2763 .hash = inet_hash,
2764 .unhash = inet_unhash,
2765 .get_port = inet_csk_get_port,
2766 .enter_memory_pressure = tcp_enter_memory_pressure,
2767 .leave_memory_pressure = tcp_leave_memory_pressure,
2768 .stream_memory_free = tcp_stream_memory_free,
2769 .sockets_allocated = &tcp_sockets_allocated,
2770 .orphan_count = &tcp_orphan_count,
2771 .memory_allocated = &tcp_memory_allocated,
2772 .memory_pressure = &tcp_memory_pressure,
2773 .sysctl_mem = sysctl_tcp_mem,
2774 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2775 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2776 .max_header = MAX_TCP_HEADER,
2777 .obj_size = sizeof(struct tcp_sock),
2778 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2779 .twsk_prot = &tcp_timewait_sock_ops,
2780 .rsk_prot = &tcp_request_sock_ops,
2781 .h.hashinfo = &tcp_hashinfo,
2782 .no_autobind = true,
2783 .diag_destroy = tcp_abort,
2784 };
2785 EXPORT_SYMBOL(tcp_prot);
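/*
 * Reader's note (condensed sketch): tcp_prot is not wired up here; it is
 * registered from net/ipv4/af_inet.c during inet_init(), roughly via
 * proto_register(&tcp_prot, 1) plus an inetsw_array[] entry pairing it with
 * inet_stream_ops, which is how SOCK_STREAM/IPPROTO_TCP sockets reach the
 * handlers in this file:
 */
#if 0	/* condensed from af_inet.c, for illustration only */
static struct inet_protosw inetsw_array[] = {
	{
		.type     = SOCK_STREAM,
		.protocol = IPPROTO_TCP,
		.prot     = &tcp_prot,
		.ops      = &inet_stream_ops,
		.flags    = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
	},
	/* ... */
};
#endif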
2787 static void __net_exit tcp_sk_exit(struct net *net)
2788 {
2789 int cpu;
2791 if (net->ipv4.tcp_congestion_control)
2792 bpf_module_put(net->ipv4.tcp_congestion_control,
2793 net->ipv4.tcp_congestion_control->owner);
2795 for_each_possible_cpu(cpu)
2796 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2797 free_percpu(net->ipv4.tcp_sk);
2798 }
2800 static int __net_init tcp_sk_init(struct net *net)
2801 {
2802 int res, cpu, cnt;
2804 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2805 if (!net->ipv4.tcp_sk)
2806 return -ENOMEM;
2808 for_each_possible_cpu(cpu) {
2809 struct sock *sk;
2811 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2812 IPPROTO_TCP, net);
2813 if (res)
2814 goto fail;
2815 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2817 /* Please enforce IP_DF and IPID==0 for RST and
2818 * ACK sent in SYN-RECV and TIME-WAIT state.
2819 */
2820 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2822 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2823 }
2825 net->ipv4.sysctl_tcp_ecn = 2;
2826 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2828 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2829 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2830 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2831 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2832 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2834 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2835 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2836 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2838 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2839 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2840 net->ipv4.sysctl_tcp_syncookies = 1;
2841 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2842 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2843 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2844 net->ipv4.sysctl_tcp_orphan_retries = 0;
2845 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2846 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2847 net->ipv4.sysctl_tcp_tw_reuse = 2;
2848 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2850 cnt = tcp_hashinfo.ehash_mask + 1;
2851 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2852 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2854 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2855 net->ipv4.sysctl_tcp_sack = 1;
2856 net->ipv4.sysctl_tcp_window_scaling = 1;
2857 net->ipv4.sysctl_tcp_timestamps = 1;
2858 net->ipv4.sysctl_tcp_early_retrans = 3;
2859 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2860 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2861 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2862 net->ipv4.sysctl_tcp_max_reordering = 300;
2863 net->ipv4.sysctl_tcp_dsack = 1;
2864 net->ipv4.sysctl_tcp_app_win = 31;
2865 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2866 net->ipv4.sysctl_tcp_frto = 2;
2867 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2868 /* This limits the percentage of the congestion window which we
2869 * will allow a single TSO frame to consume. Building TSO frames
2870 * which are too large can cause TCP streams to be bursty.
2871 */
2872 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
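/* Worked example (reader's note): with the default divisor of 3, a flow whose
 * congestion window is 30 packets will not let a single TSO frame consume more
 * than roughly 30 / 3 = 10 packets' worth of that window.
 */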
2873 /* Default TSQ limit of 16 TSO segments */
2874 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
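/* Reader's note: 16 * 65536 = 1,048,576 bytes, i.e. roughly 1 MiB of
 * not-yet-transmitted data allowed to sit in the qdisc/device queues per
 * socket before TCP Small Queues throttles the sender.
 */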
2875 /* rfc5961 challenge ack rate limiting */
2876 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2877 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2878 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2879 net->ipv4.sysctl_tcp_autocorking = 1;
2880 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2881 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2882 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2883 if (net != &init_net) {
2884 memcpy(net->ipv4.sysctl_tcp_rmem,
2885 init_net.ipv4.sysctl_tcp_rmem,
2886 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2887 memcpy(net->ipv4.sysctl_tcp_wmem,
2888 init_net.ipv4.sysctl_tcp_wmem,
2889 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2891 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2892 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2893 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2894 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2895 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2896 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2897 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2899 /* Reno is always built in */
2900 if (!net_eq(net, &init_net) &&
2901 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2902 init_net.ipv4.tcp_congestion_control->owner))
2903 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2904 else
2905 net->ipv4.tcp_congestion_control = &tcp_reno;
2907 return 0;
2908 fail:
2909 tcp_sk_exit(net);
2911 return res;
2912 }
2914 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2915 {
2916 struct net *net;
2918 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2920 list_for_each_entry(net, net_exit_list, exit_list)
2921 tcp_fastopen_ctx_destroy(net);
2922 }
2924 static struct pernet_operations __net_initdata tcp_sk_ops = {
2925 .init = tcp_sk_init,
2926 .exit = tcp_sk_exit,
2927 .exit_batch = tcp_sk_exit_batch,
2928 };
2930 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2931 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2932 struct sock_common *sk_common, uid_t uid)
2934 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2936 struct tcp_iter_state *st = priv_data;
2937 struct tcp_seq_afinfo *afinfo;
2938 int ret;
2940 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2941 if (!afinfo)
2942 return -ENOMEM;
2944 afinfo->family = AF_UNSPEC;
2945 st->bpf_seq_afinfo = afinfo;
2946 ret = bpf_iter_init_seq_net(priv_data, aux);
2947 if (ret)
2948 kfree(afinfo);
2949 return ret;
2950 }
2952 static void bpf_iter_fini_tcp(void *priv_data)
2953 {
2954 struct tcp_iter_state *st = priv_data;
2956 kfree(st->bpf_seq_afinfo);
2957 bpf_iter_fini_seq_net(priv_data);
2958 }
2960 static const struct bpf_iter_seq_info tcp_seq_info = {
2961 .seq_ops = &bpf_iter_tcp_seq_ops,
2962 .init_seq_private = bpf_iter_init_tcp,
2963 .fini_seq_private = bpf_iter_fini_tcp,
2964 .seq_priv_size = sizeof(struct tcp_iter_state),
2965 };
2967 static struct bpf_iter_reg tcp_reg_info = {
2968 .target = "tcp",
2969 .ctx_arg_info_size = 1,
2970 .ctx_arg_info = {
2971 { offsetof(struct bpf_iter__tcp, sk_common),
2972 PTR_TO_BTF_ID_OR_NULL },
2973 },
2974 .seq_info = &tcp_seq_info,
2975 };
2977 static void __init bpf_iter_register(void)
2978 {
2979 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
2980 if (bpf_iter_reg_target(&tcp_reg_info))
2981 pr_warn("Warning: could not register bpf iterator tcp\n");
2982 }
2984 #endif
2986 void __init tcp_v4_init(void)
2987 {
2988 if (register_pernet_subsys(&tcp_sk_ops))
2989 panic("Failed to create the TCP control socket.\n");
2991 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2992 bpf_iter_register();
2993 #endif
2994 }