1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
84 #include <trace/events/tcp.h>
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
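/* Per-CPU kernel control socket: used below as "ctl_sk" by
 * tcp_v4_send_reset() and tcp_v4_send_ack() to emit RST/ACK replies
 * outside full socket context, without taking any socket lock.
 */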
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 const struct inet_timewait_sock *tw = inet_twsk(sktw);
112 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
113 struct tcp_sock *tp = tcp_sk(sk);
114 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 Actually, the idea is close to VJ's one, only timestamp cache is
147 held not per host, but per port pair, and the TW bucket is used as state holder.
150 If TW bucket has been already destroyed we fall back to VJ's scheme
151 and use initial timestamp retrieved from peer table.
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
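/* Note: the TIME-WAIT reuse path above is gated by the tcp_tw_reuse sysctl
 * read at the top of tcp_twsk_unique(). Illustrative admin usage (not part
 * of this file):
 *
 *	# allow reusing TIME-WAIT sockets for new outgoing connections
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * A value of 2 is expected to restrict reuse to loopback traffic, which is
 * what the "loopback" test above feeds into.
 */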
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent the BPF program called below from accessing bytes that are
189 * outside the bound specified by the user in addr_len.
191 if (addr_len < sizeof(struct sockaddr_in))
194 sock_owned_by_me(sk);
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
213 if (addr_len < sizeof(struct sockaddr_in))
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
225 nexthop = inet_opt->opt.faddr;
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
236 if (err == -ENETUNREACH)
237 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
241 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
246 if (!inet_opt || !inet_opt->opt.srr)
249 if (!inet->inet_saddr)
250 inet->inet_saddr = fl4->saddr;
251 sk_rcv_saddr_set(sk, inet->inet_saddr);
253 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 /* Reset inherited state */
255 tp->rx_opt.ts_recent = 0;
256 tp->rx_opt.ts_recent_stamp = 0;
257 if (likely(!tp->repair))
258 WRITE_ONCE(tp->write_seq, 0);
261 inet->inet_dport = usin->sin_port;
262 sk_daddr_set(sk, daddr);
264 inet_csk(sk)->icsk_ext_hdr_len = 0;
266 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
268 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
270 /* Socket identity is still unknown (sport may be zero).
271 * However we set state to SYN-SENT and, without releasing the socket
272 * lock, select a source port, enter ourselves into the hash tables and
273 * complete initialization after this.
275 tcp_set_state(sk, TCP_SYN_SENT);
276 err = inet_hash_connect(tcp_death_row, sk);
282 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 inet->inet_sport, inet->inet_dport, sk);
289 /* OK, now commit destination to socket. */
290 sk->sk_gso_type = SKB_GSO_TCPV4;
291 sk_setup_caps(sk, &rt->dst);
294 if (likely(!tp->repair)) {
296 WRITE_ONCE(tp->write_seq,
297 secure_tcp_seq(inet->inet_saddr,
301 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
306 inet->inet_id = prandom_u32();
308 if (tcp_fastopen_defer_connect(sk, &err))
313 err = tcp_connect(sk);
322 * This unhashes the socket and releases the local port,
325 tcp_set_state(sk, TCP_CLOSE);
327 sk->sk_route_caps = 0;
328 inet->inet_dport = 0;
331 EXPORT_SYMBOL(tcp_v4_connect);
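/* For context (illustrative only, not part of this file): userspace reaches
 * tcp_v4_connect() through connect(2) on an AF_INET/SOCK_STREAM socket, e.g.
 *
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * which ends up here via the AF_INET stream ops (inet_stream_connect() and
 * then sk->sk_prot->connect()).
 */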
334 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
335 * It can be called through tcp_release_cb() if socket was owned by user
336 * at the time tcp_v4_err() was called to handle ICMP message.
338 void tcp_v4_mtu_reduced(struct sock *sk)
340 struct inet_sock *inet = inet_sk(sk);
341 struct dst_entry *dst;
344 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
346 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347 dst = inet_csk_update_pmtu(sk, mtu);
351 /* Something is about to go wrong... Remember the soft error
352 * in case this connection is not able to recover.
354 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355 sk->sk_err_soft = EMSGSIZE;
359 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360 ip_sk_accept_pmtu(sk) &&
361 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362 tcp_sync_mss(sk, mtu);
364 /* Resend the TCP packet because it's
365 * clear that the old packet has been
366 * dropped. This is the new "fast" path mtu discovery.
369 tcp_simple_retransmit(sk);
370 } /* else let the usual retransmit timer handle it */
372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
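/* Rough numeric sketch (illustrative assumptions, not from this file): if an
 * ICMP_FRAG_NEEDED quotes an MTU of 1400 on a path we believed to be 1500,
 * tcp_sync_mss() above ends up clamping the MSS to roughly
 * 1400 - 40 = 1360 bytes (20-byte IPv4 header + 20-byte TCP header, ignoring
 * TCP options), and tcp_simple_retransmit() resends the segments already in
 * flight that no longer fit.
 */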
374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
376 struct dst_entry *dst = __sk_dst_check(sk, 0);
378 if (dst)
379 dst->ops->redirect(dst, sk, skb);
383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
386 struct request_sock *req = inet_reqsk(sk);
387 struct net *net = sock_net(sk);
389 /* ICMPs are not backlogged, hence we cannot get
390 * an established socket here.
392 if (seq != tcp_rsk(req)->snt_isn) {
393 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
396 * Still in SYN_RECV, just remove it silently.
397 * There is no good way to pass the error to the newly
398 * created socket, and POSIX does not want network
399 * errors returned from accept().
401 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402 tcp_listendrop(req->rsk_listener);
406 EXPORT_SYMBOL(tcp_req_err);
408 /* TCP-LD (RFC 6069) logic */
409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
411 struct inet_connection_sock *icsk = inet_csk(sk);
412 struct tcp_sock *tp = tcp_sk(sk);
417 if (sock_owned_by_user(sk))
420 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
424 skb = tcp_rtx_queue_head(sk);
425 if (WARN_ON_ONCE(!skb))
428 icsk->icsk_backoff--;
429 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
432 tcp_mstamp_refresh(tp);
433 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
437 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 remaining, TCP_RTO_MAX);
440 /* RTO revert clocked out retransmission.
441 * Will retransmit now.
443 tcp_retransmit_timer(sk);
446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
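/* Worked example for the timer math above (illustrative numbers): with
 * icsk_rto back at, say, 400 ms after undoing one backoff step, and the ICMP
 * arriving 150 ms after the head-of-queue skb was (re)transmitted,
 * "remaining" is about 250 ms and the retransmit timer is simply re-armed.
 * If the elapsed time already exceeds the reverted RTO, remaining is <= 0
 * and tcp_retransmit_timer() is called immediately instead.
 */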
449 * This routine is called by the ICMP module when it gets some
450 * sort of error condition. If err < 0 then the socket should
451 * be closed and the error returned to the user. If err > 0
452 * it's just the icmp type << 8 | icmp code. After adjustment
453 * header points to the first 8 bytes of the tcp header. We need
454 * to find the appropriate port.
456 * The locking strategy used here is very "optimistic". When
457 * someone else accesses the socket the ICMP is just dropped
458 * and for some paths there is no check at all.
459 * A more general error queue to queue errors for later handling
460 * is probably better.
464 int tcp_v4_err(struct sk_buff *skb, u32 info)
466 const struct iphdr *iph = (const struct iphdr *)skb->data;
467 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
469 struct inet_sock *inet;
470 const int type = icmp_hdr(skb)->type;
471 const int code = icmp_hdr(skb)->code;
473 struct request_sock *fastopen;
476 struct net *net = dev_net(skb->dev);
478 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479 th->dest, iph->saddr, ntohs(th->source),
482 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
485 if (sk->sk_state == TCP_TIME_WAIT) {
486 inet_twsk_put(inet_twsk(sk));
489 seq = ntohl(th->seq);
490 if (sk->sk_state == TCP_NEW_SYN_RECV) {
491 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492 type == ICMP_TIME_EXCEEDED ||
493 (type == ICMP_DEST_UNREACH &&
494 (code == ICMP_NET_UNREACH ||
495 code == ICMP_HOST_UNREACH)));
500 /* If too many ICMPs get dropped on busy
501 * servers this needs to be solved differently.
502 * We do take care of PMTU discovery (RFC1191) special case :
503 * we can receive locally generated ICMP messages while socket is held.
505 if (sock_owned_by_user(sk)) {
506 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
509 if (sk->sk_state == TCP_CLOSE)
512 if (static_branch_unlikely(&ip4_min_ttl)) {
513 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
514 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
521 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
538 case ICMP_PARAMETERPROB:
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 /* We are not interested in TCP_LISTEN and open_requests
547 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
548 * they should go through unfragmented).
550 if (sk->sk_state == TCP_LISTEN)
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
563 err = icmp_err_convert[code].errno;
564 /* check if this ICMP message allows revert of backoff.
568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
571 case ICMP_TIME_EXCEEDED:
578 switch (sk->sk_state) {
581 /* Only in fast or simultaneous open. If a fast open socket is
582 * already accepted it is treated as a connected one below.
584 if (fastopen && !fastopen->sk)
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
589 if (!sock_owned_by_user(sk)) {
596 sk->sk_err_soft = err;
601 /* If we've already connected we will keep trying
602 * until we time out, or the user gives up.
604 * RFC 1122 4.2.3.9 allows us to treat as hard errors
605 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 * but it is obsoleted by PMTU discovery).
608 * Note that in the modern internet, where routing is unreliable
609 * and broken firewalls sit in every dark corner sending random
610 * errors ordered by their masters, even these two messages finally lose
611 * their original sense (even Linux sends invalid PORT_UNREACHs).
613 * Now we are in compliance with the RFCs.
618 if (!sock_owned_by_user(sk) && inet->recverr) {
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
633 struct tcphdr *th = tcp_hdr(skb);
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
643 const struct inet_sock *inet = inet_sk(sk);
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
647 EXPORT_SYMBOL(tcp_v4_send_check);
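/* Note: the partial checksum set up above follows the CHECKSUM_PARTIAL
 * convention: th->check holds only the folded pseudo-header sum, while
 * csum_start/csum_offset tell the NIC (or the software fallback in the
 * stack) where to fold in the one's-complement sum over the TCP header and
 * payload before transmission.
 */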
650 * This routine will send an RST to the other tcp.
652 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
654 * Answer: if a packet caused an RST, it is not for a socket
655 * existing in our system; if it is matched to a socket,
656 * it is just a duplicate segment or a bug in the other side's TCP.
657 * So we build the reply based only on the parameters
658 * that arrived with the segment.
659 * Exception: precedence violation. We do not implement it in any case.
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
665 #define OPTION_BYTES sizeof(__be32)
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
670 const struct tcphdr *th = tcp_hdr(skb);
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
675 struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
681 struct sock *sk1 = NULL;
683 u64 transmit_time = 0;
687 /* Never send a reset in response to a reset. */
691 /* If sk not NULL, it means we did a successful lookup and incoming
692 * route had to be correct. prequeue might have dropped our dst.
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
697 /* Swap the send and the receive. */
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
705 rep.th.seq = th->ack_seq;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
724 /* sdif set, means packet ingressed via a device
725 * in an L3 domain and inet_iif is set to it.
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
737 * active side is lost. Try to find listening socket through
738 * source port, and then find md5 key through listening socket.
739 * We do not lose security here:
740 * the incoming packet is checked against the MD5 hash of the found key;
741 * no RST is generated if the MD5 hash doesn't match.
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747 /* don't send rst if it can't find key */
751 /* sdif set, means packet ingressed via a device
752 * in an L3 domain and dif is set to it.
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
768 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
770 (TCPOPT_MD5SIG << 8) |
772 /* Update length and the length the header thinks exists */
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
781 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr, /* XXX */
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
798 /* When socket is gone, all binding information is lost.
799 * Routing might fail in this case. No choice here: if we choose to force
800 * the input interface, we will misroute in case of an asymmetric route.
803 arg.bound_dev_if = sk->sk_bound_dev_if;
805 trace_tcp_send_reset(sk, skb);
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
823 ip_send_unicast_reply(ctl_sk,
824 skb, &TCP_SKB_CB(skb)->header.h4.opt,
825 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
826 &arg, arg.iov[0].iov_len,
830 sock_net_set(ctl_sk, &init_net);
831 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
835 #ifdef CONFIG_TCP_MD5SIG
841 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
842 outside socket context, is certainly ugly. What can I do?
845 static void tcp_v4_send_ack(const struct sock *sk,
846 struct sk_buff *skb, u32 seq, u32 ack,
847 u32 win, u32 tsval, u32 tsecr, int oif,
848 struct tcp_md5sig_key *key,
849 int reply_flags, u8 tos)
851 const struct tcphdr *th = tcp_hdr(skb);
854 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855 #ifdef CONFIG_TCP_MD5SIG
856 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
860 struct net *net = sock_net(sk);
861 struct ip_reply_arg arg;
865 memset(&rep.th, 0, sizeof(struct tcphdr));
866 memset(&arg, 0, sizeof(arg));
868 arg.iov[0].iov_base = (unsigned char *)&rep;
869 arg.iov[0].iov_len = sizeof(rep.th);
871 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872 (TCPOPT_TIMESTAMP << 8) |
874 rep.opt[1] = htonl(tsval);
875 rep.opt[2] = htonl(tsecr);
876 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
879 /* Swap the send and the receive. */
880 rep.th.dest = th->source;
881 rep.th.source = th->dest;
882 rep.th.doff = arg.iov[0].iov_len / 4;
883 rep.th.seq = htonl(seq);
884 rep.th.ack_seq = htonl(ack);
886 rep.th.window = htons(win);
888 #ifdef CONFIG_TCP_MD5SIG
890 int offset = (tsecr) ? 3 : 0;
892 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
894 (TCPOPT_MD5SIG << 8) |
896 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897 rep.th.doff = arg.iov[0].iov_len/4;
899 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900 key, ip_hdr(skb)->saddr,
901 ip_hdr(skb)->daddr, &rep.th);
904 arg.flags = reply_flags;
905 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906 ip_hdr(skb)->saddr, /* XXX */
907 arg.iov[0].iov_len, IPPROTO_TCP, 0);
908 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
910 arg.bound_dev_if = oif;
912 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
914 ctl_sk = this_cpu_read(ipv4_tcp_sk);
915 sock_net_set(ctl_sk, net);
916 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
917 inet_twsk(sk)->tw_mark : sk->sk_mark;
918 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
919 inet_twsk(sk)->tw_priority : sk->sk_priority;
920 transmit_time = tcp_transmit_time(sk);
921 ip_send_unicast_reply(ctl_sk,
922 skb, &TCP_SKB_CB(skb)->header.h4.opt,
923 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
924 &arg, arg.iov[0].iov_len,
928 sock_net_set(ctl_sk, &init_net);
929 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
933 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
935 struct inet_timewait_sock *tw = inet_twsk(sk);
936 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
938 tcp_v4_send_ack(sk, skb,
939 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
944 tcp_twsk_md5_key(tcptw),
945 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
952 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953 struct request_sock *req)
955 const union tcp_md5_addr *addr;
958 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
959 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
961 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
965 * The window field (SEG.WND) of every outgoing segment, with the
966 * exception of <SYN> segments, MUST be right-shifted by
967 * Rcv.Wind.Shift bits:
969 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971 tcp_v4_send_ack(sk, skb, seq,
972 tcp_rsk(req)->rcv_nxt,
973 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
977 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
983 * Send a SYN-ACK after having received a SYN.
984 * This still operates on a request_sock only, not on a big
987 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
989 struct request_sock *req,
990 struct tcp_fastopen_cookie *foc,
991 enum tcp_synack_type synack_type,
992 struct sk_buff *syn_skb)
994 const struct inet_request_sock *ireq = inet_rsk(req);
1000 /* First, grab a route. */
1001 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1004 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1007 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1009 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1010 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011 (inet_sk(sk)->tos & INET_ECN_MASK) :
1014 if (!INET_ECN_is_capable(tos) &&
1015 tcp_bpf_ca_needs_ecn((struct sock *)req))
1016 tos |= INET_ECN_ECT_0;
1019 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1021 rcu_dereference(ireq->ireq_opt),
1024 err = net_xmit_eval(err);
1031 * IPv4 request_sock destructor.
1033 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1035 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1038 #ifdef CONFIG_TCP_MD5SIG
1040 * RFC2385 MD5 checksumming requires a mapping of
1041 * IP address->MD5 Key.
1042 * We need to maintain these in the sk structure.
1045 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046 EXPORT_SYMBOL(tcp_md5_needed);
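/* tcp_md5_needed is a static branch so that sockets which never configure an
 * MD5 key pay no per-packet lookup cost; it is expected to be flipped on when
 * the first key is installed (the enabling call itself is outside this
 * excerpt).
 */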
1048 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1053 /* l3index always overrides non-l3index */
1054 if (old->l3index && new->l3index == 0)
1056 if (old->l3index == 0 && new->l3index)
1059 return old->prefixlen < new->prefixlen;
1062 /* Find the Key structure for an address. */
1063 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064 const union tcp_md5_addr *addr,
1067 const struct tcp_sock *tp = tcp_sk(sk);
1068 struct tcp_md5sig_key *key;
1069 const struct tcp_md5sig_info *md5sig;
1071 struct tcp_md5sig_key *best_match = NULL;
1074 /* caller either holds rcu_read_lock() or socket lock */
1075 md5sig = rcu_dereference_check(tp->md5sig_info,
1076 lockdep_sock_is_held(sk));
1080 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081 lockdep_sock_is_held(sk)) {
1082 if (key->family != family)
1084 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1086 if (family == AF_INET) {
1087 mask = inet_make_mask(key->prefixlen);
1088 match = (key->addr.a4.s_addr & mask) ==
1089 (addr->a4.s_addr & mask);
1090 #if IS_ENABLED(CONFIG_IPV6)
1091 } else if (family == AF_INET6) {
1092 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1099 if (match && better_md5_match(best_match, key))
1104 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1106 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107 const union tcp_md5_addr *addr,
1108 int family, u8 prefixlen,
1109 int l3index, u8 flags)
1111 const struct tcp_sock *tp = tcp_sk(sk);
1112 struct tcp_md5sig_key *key;
1113 unsigned int size = sizeof(struct in_addr);
1114 const struct tcp_md5sig_info *md5sig;
1116 /* caller either holds rcu_read_lock() or socket lock */
1117 md5sig = rcu_dereference_check(tp->md5sig_info,
1118 lockdep_sock_is_held(sk));
1121 #if IS_ENABLED(CONFIG_IPV6)
1122 if (family == AF_INET6)
1123 size = sizeof(struct in6_addr);
1125 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126 lockdep_sock_is_held(sk)) {
1127 if (key->family != family)
1129 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1131 if (key->l3index != l3index)
1133 if (!memcmp(&key->addr, addr, size) &&
1134 key->prefixlen == prefixlen)
1140 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1141 const struct sock *addr_sk)
1143 const union tcp_md5_addr *addr;
1146 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1147 addr_sk->sk_bound_dev_if);
1148 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1149 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1151 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1153 /* This can be called on a newly created socket, from other files */
1154 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1155 int family, u8 prefixlen, int l3index, u8 flags,
1156 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1158 /* Add Key to the list */
1159 struct tcp_md5sig_key *key;
1160 struct tcp_sock *tp = tcp_sk(sk);
1161 struct tcp_md5sig_info *md5sig;
1163 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1165 /* Pre-existing entry - just update that one.
1166 * Note that the key might be used concurrently.
1167 * data_race() tells KCSAN that we do not care about
1168 * key mismatches, since changing the MD5 key on live flows
1169 * can lead to packet drops.
1171 data_race(memcpy(key->key, newkey, newkeylen));
1173 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1174 * Also note that a reader could catch new key->keylen value
1175 * but old key->key[], this is the reason we use __GFP_ZERO
1176 * at sock_kmalloc() time below these lines.
1178 WRITE_ONCE(key->keylen, newkeylen);
1183 md5sig = rcu_dereference_protected(tp->md5sig_info,
1184 lockdep_sock_is_held(sk));
1186 md5sig = kmalloc(sizeof(*md5sig), gfp);
1191 INIT_HLIST_HEAD(&md5sig->head);
1192 rcu_assign_pointer(tp->md5sig_info, md5sig);
1195 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1198 if (!tcp_alloc_md5sig_pool()) {
1199 sock_kfree_s(sk, key, sizeof(*key));
1203 memcpy(key->key, newkey, newkeylen);
1204 key->keylen = newkeylen;
1205 key->family = family;
1206 key->prefixlen = prefixlen;
1207 key->l3index = l3index;
1209 memcpy(&key->addr, addr,
1210 (family == AF_INET6) ? sizeof(struct in6_addr) :
1211 sizeof(struct in_addr));
1212 hlist_add_head_rcu(&key->node, &md5sig->head);
1215 EXPORT_SYMBOL(tcp_md5_do_add);
1217 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1218 u8 prefixlen, int l3index, u8 flags)
1220 struct tcp_md5sig_key *key;
1222 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1225 hlist_del_rcu(&key->node);
1226 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1227 kfree_rcu(key, rcu);
1230 EXPORT_SYMBOL(tcp_md5_do_del);
1232 static void tcp_clear_md5_list(struct sock *sk)
1234 struct tcp_sock *tp = tcp_sk(sk);
1235 struct tcp_md5sig_key *key;
1236 struct hlist_node *n;
1237 struct tcp_md5sig_info *md5sig;
1239 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1241 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1242 hlist_del_rcu(&key->node);
1243 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1244 kfree_rcu(key, rcu);
1248 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1249 sockptr_t optval, int optlen)
1251 struct tcp_md5sig cmd;
1252 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1253 const union tcp_md5_addr *addr;
1258 if (optlen < sizeof(cmd))
1261 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1264 if (sin->sin_family != AF_INET)
1267 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1269 if (optname == TCP_MD5SIG_EXT &&
1270 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1271 prefixlen = cmd.tcpm_prefixlen;
1276 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1277 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1278 struct net_device *dev;
1281 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1282 if (dev && netif_is_l3_master(dev))
1283 l3index = dev->ifindex;
1287 /* ok to reference set/not set outside of rcu;
1288 * right now device MUST be an L3 master
1290 if (!dev || !l3index)
1294 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1296 if (!cmd.tcpm_keylen)
1297 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1299 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1302 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1303 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
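/* Illustrative userspace counterpart (not part of this file): a TCP-MD5
 * (RFC 2385) key is installed with the TCP_MD5SIG socket option, which is
 * what the parser above consumes:
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT additionally honours tcpm_flags, tcpm_prefixlen and
 * tcpm_ifindex, as handled above.
 */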
1306 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1307 __be32 daddr, __be32 saddr,
1308 const struct tcphdr *th, int nbytes)
1310 struct tcp4_pseudohdr *bp;
1311 struct scatterlist sg;
1318 bp->protocol = IPPROTO_TCP;
1319 bp->len = cpu_to_be16(nbytes);
1321 _th = (struct tcphdr *)(bp + 1);
1322 memcpy(_th, th, sizeof(*th));
1325 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1326 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1327 sizeof(*bp) + sizeof(*th));
1328 return crypto_ahash_update(hp->md5_req);
1331 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1332 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1334 struct tcp_md5sig_pool *hp;
1335 struct ahash_request *req;
1337 hp = tcp_get_md5sig_pool();
1339 goto clear_hash_noput;
1342 if (crypto_ahash_init(req))
1344 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1346 if (tcp_md5_hash_key(hp, key))
1348 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1349 if (crypto_ahash_final(req))
1352 tcp_put_md5sig_pool();
1356 tcp_put_md5sig_pool();
1358 memset(md5_hash, 0, 16);
1362 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1363 const struct sock *sk,
1364 const struct sk_buff *skb)
1366 struct tcp_md5sig_pool *hp;
1367 struct ahash_request *req;
1368 const struct tcphdr *th = tcp_hdr(skb);
1369 __be32 saddr, daddr;
1371 if (sk) { /* valid for establish/request sockets */
1372 saddr = sk->sk_rcv_saddr;
1373 daddr = sk->sk_daddr;
1375 const struct iphdr *iph = ip_hdr(skb);
1380 hp = tcp_get_md5sig_pool();
1382 goto clear_hash_noput;
1385 if (crypto_ahash_init(req))
1388 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1390 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1392 if (tcp_md5_hash_key(hp, key))
1394 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1395 if (crypto_ahash_final(req))
1398 tcp_put_md5sig_pool();
1402 tcp_put_md5sig_pool();
1404 memset(md5_hash, 0, 16);
1407 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1411 static void tcp_v4_init_req(struct request_sock *req,
1412 const struct sock *sk_listener,
1413 struct sk_buff *skb)
1415 struct inet_request_sock *ireq = inet_rsk(req);
1416 struct net *net = sock_net(sk_listener);
1418 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1419 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1420 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1423 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1424 struct sk_buff *skb,
1426 struct request_sock *req)
1428 tcp_v4_init_req(req, sk, skb);
1430 if (security_inet_conn_request(sk, skb, req))
1433 return inet_csk_route_req(sk, &fl->u.ip4, req);
1436 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1438 .obj_size = sizeof(struct tcp_request_sock),
1439 .rtx_syn_ack = tcp_rtx_synack,
1440 .send_ack = tcp_v4_reqsk_send_ack,
1441 .destructor = tcp_v4_reqsk_destructor,
1442 .send_reset = tcp_v4_send_reset,
1443 .syn_ack_timeout = tcp_syn_ack_timeout,
1446 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1447 .mss_clamp = TCP_MSS_DEFAULT,
1448 #ifdef CONFIG_TCP_MD5SIG
1449 .req_md5_lookup = tcp_v4_md5_lookup,
1450 .calc_md5_hash = tcp_v4_md5_hash_skb,
1452 #ifdef CONFIG_SYN_COOKIES
1453 .cookie_init_seq = cookie_v4_init_sequence,
1455 .route_req = tcp_v4_route_req,
1456 .init_seq = tcp_v4_init_seq,
1457 .init_ts_off = tcp_v4_init_ts_off,
1458 .send_synack = tcp_v4_send_synack,
1461 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1463 /* Never answer to SYNs send to broadcast or multicast */
1464 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1467 return tcp_conn_request(&tcp_request_sock_ops,
1468 &tcp_request_sock_ipv4_ops, sk, skb);
1474 EXPORT_SYMBOL(tcp_v4_conn_request);
1478 * The three way handshake has completed - we got a valid synack -
1479 * now create the new socket.
1481 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1482 struct request_sock *req,
1483 struct dst_entry *dst,
1484 struct request_sock *req_unhash,
1487 struct inet_request_sock *ireq;
1488 bool found_dup_sk = false;
1489 struct inet_sock *newinet;
1490 struct tcp_sock *newtp;
1492 #ifdef CONFIG_TCP_MD5SIG
1493 const union tcp_md5_addr *addr;
1494 struct tcp_md5sig_key *key;
1497 struct ip_options_rcu *inet_opt;
1499 if (sk_acceptq_is_full(sk))
1502 newsk = tcp_create_openreq_child(sk, req, skb);
1506 newsk->sk_gso_type = SKB_GSO_TCPV4;
1507 inet_sk_rx_dst_set(newsk, skb);
1509 newtp = tcp_sk(newsk);
1510 newinet = inet_sk(newsk);
1511 ireq = inet_rsk(req);
1512 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1513 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1514 newsk->sk_bound_dev_if = ireq->ir_iif;
1515 newinet->inet_saddr = ireq->ir_loc_addr;
1516 inet_opt = rcu_dereference(ireq->ireq_opt);
1517 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1518 newinet->mc_index = inet_iif(skb);
1519 newinet->mc_ttl = ip_hdr(skb)->ttl;
1520 newinet->rcv_tos = ip_hdr(skb)->tos;
1521 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1523 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1524 newinet->inet_id = prandom_u32();
1526 /* Set ToS of the new socket based upon the value of incoming SYN.
1527 * ECT bits are set later in tcp_init_transfer().
1529 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1530 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1533 dst = inet_csk_route_child_sock(sk, newsk, req);
1537 /* syncookie case : see end of cookie_v4_check() */
1539 sk_setup_caps(newsk, dst);
1541 tcp_ca_openreq_child(newsk, dst);
1543 tcp_sync_mss(newsk, dst_mtu(dst));
1544 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1546 tcp_initialize_rcv_mss(newsk);
1548 #ifdef CONFIG_TCP_MD5SIG
1549 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550 /* Copy over the MD5 key from the original socket */
1551 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1555 * We're using one, so create a matching key
1556 * on the newsk structure. If we fail to get
1557 * memory, then we end up not copying the key across.
1560 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1561 key->key, key->keylen, GFP_ATOMIC);
1562 sk_gso_disable(newsk);
1566 if (__inet_inherit_port(sk, newsk) < 0)
1568 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1570 if (likely(*own_req)) {
1571 tcp_move_syn(newtp, req);
1572 ireq->ireq_opt = NULL;
1574 newinet->inet_opt = NULL;
1576 if (!req_unhash && found_dup_sk) {
1577 /* This code path should only be executed in the
1578 * syncookie case only
1580 bh_unlock_sock(newsk);
1588 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1595 newinet->inet_opt = NULL;
1596 inet_csk_prepare_forced_close(newsk);
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1604 #ifdef CONFIG_SYN_COOKIES
1605 const struct tcphdr *th = tcp_hdr(skb);
1608 sk = cookie_v4_check(sk, skb);
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614 struct tcphdr *th, u32 *cookie)
1617 #ifdef CONFIG_SYN_COOKIES
1618 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619 &tcp_request_sock_ipv4_ops, sk, th);
1621 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622 tcp_synq_overflow(sk);
1628 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1630 /* The socket must have its spinlock held when we get
1631 * here, unless it is a TCP_LISTEN socket.
1633 * We have a potential double-lock case here, so even when
1634 * doing backlog processing we use the BH locking scheme.
1635 * This is because we cannot sleep with the original spinlock
1638 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1640 enum skb_drop_reason reason;
1643 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1644 struct dst_entry *dst;
1646 dst = rcu_dereference_protected(sk->sk_rx_dst,
1647 lockdep_sock_is_held(sk));
1649 sock_rps_save_rxhash(sk, skb);
1650 sk_mark_napi_id(sk, skb);
1652 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1653 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1655 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1659 tcp_rcv_established(sk, skb);
1663 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1664 if (tcp_checksum_complete(skb))
1667 if (sk->sk_state == TCP_LISTEN) {
1668 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1673 if (tcp_child_process(sk, nsk, skb)) {
1680 sock_rps_save_rxhash(sk, skb);
1682 if (tcp_rcv_state_process(sk, skb)) {
1689 tcp_v4_send_reset(rsk, skb);
1691 kfree_skb_reason(skb, reason);
1692 /* Be careful here. If this function gets more complicated and
1693 * gcc suffers from register pressure on the x86, sk (in %ebx)
1694 * might be destroyed here. This current version compiles correctly,
1695 * but you have been warned.
1700 reason = SKB_DROP_REASON_TCP_CSUM;
1701 trace_tcp_bad_csum(skb);
1702 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1703 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1706 EXPORT_SYMBOL(tcp_v4_do_rcv);
1708 int tcp_v4_early_demux(struct sk_buff *skb)
1710 const struct iphdr *iph;
1711 const struct tcphdr *th;
1714 if (skb->pkt_type != PACKET_HOST)
1717 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1723 if (th->doff < sizeof(struct tcphdr) / 4)
1726 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1727 iph->saddr, th->source,
1728 iph->daddr, ntohs(th->dest),
1729 skb->skb_iif, inet_sdif(skb));
1732 skb->destructor = sock_edemux;
1733 if (sk_fullsock(sk)) {
1734 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1737 dst = dst_check(dst, 0);
1739 sk->sk_rx_dst_ifindex == skb->skb_iif)
1740 skb_dst_set_noref(skb, dst);
1746 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1747 enum skb_drop_reason *reason)
1749 u32 limit, tail_gso_size, tail_gso_segs;
1750 struct skb_shared_info *shinfo;
1751 const struct tcphdr *th;
1752 struct tcphdr *thtail;
1753 struct sk_buff *tail;
1754 unsigned int hdrlen;
1760 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1761 * we can fix skb->truesize to its real value to avoid future drops.
1762 * This is valid because skb is not yet charged to the socket.
1763 * It has been noticed that pure SACK packets were sometimes dropped
1764 * (if cooked by drivers without copybreak feature).
1770 if (unlikely(tcp_checksum_complete(skb))) {
1772 trace_tcp_bad_csum(skb);
1773 *reason = SKB_DROP_REASON_TCP_CSUM;
1774 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1779 /* Attempt coalescing to the last skb in the backlog, even if we are
1781 * above the limits. This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1783 th = (const struct tcphdr *)skb->data;
1784 hdrlen = th->doff * 4;
1786 tail = sk->sk_backlog.tail;
1789 thtail = (struct tcphdr *)tail->data;
1791 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1792 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1793 ((TCP_SKB_CB(tail)->tcp_flags |
1794 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1795 !((TCP_SKB_CB(tail)->tcp_flags &
1796 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1797 ((TCP_SKB_CB(tail)->tcp_flags ^
1798 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1799 #ifdef CONFIG_TLS_DEVICE
1800 tail->decrypted != skb->decrypted ||
1802 thtail->doff != th->doff ||
1803 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1806 __skb_pull(skb, hdrlen);
1808 shinfo = skb_shinfo(skb);
1809 gso_size = shinfo->gso_size ?: skb->len;
1810 gso_segs = shinfo->gso_segs ?: 1;
1812 shinfo = skb_shinfo(tail);
1813 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1814 tail_gso_segs = shinfo->gso_segs ?: 1;
1816 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1819 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821 thtail->window = th->window;
1824 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825 * thtail->fin, so that the fast path in tcp_rcv_established()
1826 * is not entered if we append a packet with a FIN.
1827 * SYN, RST, URG are not present.
1828 * ACK is set on both packets.
1829 * PSH : we do not really care in TCP stack,
1830 * at least for 'GRO' packets.
1832 thtail->fin |= th->fin;
1833 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1835 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836 TCP_SKB_CB(tail)->has_rxtstamp = true;
1837 tail->tstamp = skb->tstamp;
1838 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841 /* Not as strict as GRO. We only need to carry mss max value */
1842 shinfo->gso_size = max(gso_size, tail_gso_size);
1843 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1845 sk->sk_backlog.len += delta;
1846 __NET_INC_STATS(sock_net(sk),
1847 LINUX_MIB_TCPBACKLOGCOALESCE);
1848 kfree_skb_partial(skb, fragstolen);
1851 __skb_push(skb, hdrlen);
1854 /* Only the socket owner can try to collapse/prune rx queues
1855 * to reduce memory overhead, so add a little headroom here.
1856 * Only a few socket backlogs are likely to be concurrently non-empty.
1858 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1860 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1862 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1863 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1868 EXPORT_SYMBOL(tcp_add_backlog);
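/* Back-of-the-envelope for the limit above (illustrative numbers only):
 * with sk_rcvbuf = 131072 and sk_sndbuf = 16384, the backlog may grow to
 * 131072 + 16384 + 65536 = 212992 bytes of truesize before packets are
 * dropped with SKB_DROP_REASON_SOCKET_BACKLOG.
 */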
1870 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1872 struct tcphdr *th = (struct tcphdr *)skb->data;
1874 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1876 EXPORT_SYMBOL(tcp_filter);
1878 static void tcp_v4_restore_cb(struct sk_buff *skb)
1880 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1881 sizeof(struct inet_skb_parm));
1884 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1885 const struct tcphdr *th)
1887 /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1888 * barrier() makes sure the compiler won't play fool^Waliasing games.
1890 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1891 sizeof(struct inet_skb_parm));
1894 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1895 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1896 skb->len - th->doff * 4);
1897 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1898 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1899 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1900 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1901 TCP_SKB_CB(skb)->sacked = 0;
1902 TCP_SKB_CB(skb)->has_rxtstamp =
1903 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1910 int tcp_v4_rcv(struct sk_buff *skb)
1912 struct net *net = dev_net(skb->dev);
1913 enum skb_drop_reason drop_reason;
1914 int sdif = inet_sdif(skb);
1915 int dif = inet_iif(skb);
1916 const struct iphdr *iph;
1917 const struct tcphdr *th;
1922 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1923 if (skb->pkt_type != PACKET_HOST)
1926 /* Count it even if it's bad */
1927 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1929 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1932 th = (const struct tcphdr *)skb->data;
1934 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1935 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1938 if (!pskb_may_pull(skb, th->doff * 4))
1941 /* An explanation is required here, I think.
1942 * Packet length and doff are validated by header prediction,
1943 * provided the case of th->doff == 0 is eliminated.
1944 * So, we defer the checks. */
1946 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1949 th = (const struct tcphdr *)skb->data;
1952 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1953 th->dest, sdif, &refcounted);
1958 if (sk->sk_state == TCP_TIME_WAIT)
1961 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1962 struct request_sock *req = inet_reqsk(sk);
1963 bool req_stolen = false;
1966 sk = req->rsk_listener;
1967 drop_reason = tcp_inbound_md5_hash(sk, skb,
1968 &iph->saddr, &iph->daddr,
1969 AF_INET, dif, sdif);
1970 if (unlikely(drop_reason)) {
1971 sk_drops_add(sk, skb);
1975 if (tcp_checksum_complete(skb)) {
1979 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1980 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1982 inet_csk_reqsk_queue_drop_and_put(sk, req);
1986 /* reuseport_migrate_sock() has already held one sk_refcnt before returning. */
1990 /* We own a reference on the listener, increase it again
1991 * as we might lose it too soon.
1997 if (!tcp_filter(sk, skb)) {
1998 th = (const struct tcphdr *)skb->data;
2000 tcp_v4_fill_cb(skb, iph, th);
2001 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2003 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2008 /* Another cpu got exclusive access to req
2009 * and created a full blown socket.
2010 * Try to feed this packet to this socket
2011 * instead of discarding it.
2013 tcp_v4_restore_cb(skb);
2017 goto discard_and_relse;
2021 tcp_v4_restore_cb(skb);
2022 } else if (tcp_child_process(sk, nsk, skb)) {
2023 tcp_v4_send_reset(nsk, skb);
2024 goto discard_and_relse;
2031 if (static_branch_unlikely(&ip4_min_ttl)) {
2032 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2033 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2034 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2035 goto discard_and_relse;
2039 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2040 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2041 goto discard_and_relse;
2044 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2045 &iph->daddr, AF_INET, dif, sdif);
2047 goto discard_and_relse;
2051 if (tcp_filter(sk, skb)) {
2052 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2053 goto discard_and_relse;
2055 th = (const struct tcphdr *)skb->data;
2057 tcp_v4_fill_cb(skb, iph, th);
2061 if (sk->sk_state == TCP_LISTEN) {
2062 ret = tcp_v4_do_rcv(sk, skb);
2063 goto put_and_return;
2066 sk_incoming_cpu_update(sk);
2068 bh_lock_sock_nested(sk);
2069 tcp_segs_in(tcp_sk(sk), skb);
2071 if (!sock_owned_by_user(sk)) {
2072 ret = tcp_v4_do_rcv(sk, skb);
2074 if (tcp_add_backlog(sk, skb, &drop_reason))
2075 goto discard_and_relse;
2086 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2087 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2090 tcp_v4_fill_cb(skb, iph, th);
2092 if (tcp_checksum_complete(skb)) {
2094 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2095 trace_tcp_bad_csum(skb);
2096 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2098 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2100 tcp_v4_send_reset(NULL, skb);
2104 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2105 /* Discard frame. */
2106 kfree_skb_reason(skb, drop_reason);
2110 sk_drops_add(sk, skb);
2116 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2117 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2118 inet_twsk_put(inet_twsk(sk));
2122 tcp_v4_fill_cb(skb, iph, th);
2124 if (tcp_checksum_complete(skb)) {
2125 inet_twsk_put(inet_twsk(sk));
2128 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2130 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2133 iph->saddr, th->source,
2134 iph->daddr, th->dest,
2138 inet_twsk_deschedule_put(inet_twsk(sk));
2140 tcp_v4_restore_cb(skb);
2148 tcp_v4_timewait_ack(sk, skb);
2151 tcp_v4_send_reset(sk, skb);
2152 inet_twsk_deschedule_put(inet_twsk(sk));
2154 case TCP_TW_SUCCESS:;
2159 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2160 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2161 .twsk_unique = tcp_twsk_unique,
2162 .twsk_destructor= tcp_twsk_destructor,
2165 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2167 struct dst_entry *dst = skb_dst(skb);
2169 if (dst && dst_hold_safe(dst)) {
2170 rcu_assign_pointer(sk->sk_rx_dst, dst);
2171 sk->sk_rx_dst_ifindex = skb->skb_iif;
2174 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2176 const struct inet_connection_sock_af_ops ipv4_specific = {
2177 .queue_xmit = ip_queue_xmit,
2178 .send_check = tcp_v4_send_check,
2179 .rebuild_header = inet_sk_rebuild_header,
2180 .sk_rx_dst_set = inet_sk_rx_dst_set,
2181 .conn_request = tcp_v4_conn_request,
2182 .syn_recv_sock = tcp_v4_syn_recv_sock,
2183 .net_header_len = sizeof(struct iphdr),
2184 .setsockopt = ip_setsockopt,
2185 .getsockopt = ip_getsockopt,
2186 .addr2sockaddr = inet_csk_addr2sockaddr,
2187 .sockaddr_len = sizeof(struct sockaddr_in),
2188 .mtu_reduced = tcp_v4_mtu_reduced,
2190 EXPORT_SYMBOL(ipv4_specific);
2192 #ifdef CONFIG_TCP_MD5SIG
2193 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2194 .md5_lookup = tcp_v4_md5_lookup,
2195 .calc_md5_hash = tcp_v4_md5_hash_skb,
2196 .md5_parse = tcp_v4_parse_md5_keys,
2200 /* NOTE: A lot of things set to zero explicitly by call to
2201 * sk_alloc() so need not be done here.
2203 static int tcp_v4_init_sock(struct sock *sk)
2205 struct inet_connection_sock *icsk = inet_csk(sk);
2209 icsk->icsk_af_ops = &ipv4_specific;
2211 #ifdef CONFIG_TCP_MD5SIG
2212 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2218 void tcp_v4_destroy_sock(struct sock *sk)
2220 struct tcp_sock *tp = tcp_sk(sk);
2222 trace_tcp_destroy_sock(sk);
2224 tcp_clear_xmit_timers(sk);
2226 tcp_cleanup_congestion_control(sk);
2228 tcp_cleanup_ulp(sk);
2230 /* Cleanup up the write buffer. */
2231 tcp_write_queue_purge(sk);
2233 /* Check if we want to disable active TFO */
2234 tcp_fastopen_active_disable_ofo_check(sk);
2236 /* Cleans up our, hopefully empty, out_of_order_queue. */
2237 skb_rbtree_purge(&tp->out_of_order_queue);
2239 #ifdef CONFIG_TCP_MD5SIG
2240 /* Clean up the MD5 key list, if any */
2241 if (tp->md5sig_info) {
2242 tcp_clear_md5_list(sk);
2243 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2244 tp->md5sig_info = NULL;
2248 /* Clean up a referenced TCP bind bucket. */
2249 if (inet_csk(sk)->icsk_bind_hash)
2252 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2254 /* If socket is aborted during connect operation */
2255 tcp_free_fastopen_req(tp);
2256 tcp_fastopen_destroy_cipher(sk);
2257 tcp_saved_syn_free(tp);
2259 sk_sockets_allocated_dec(sk);
2261 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2263 #ifdef CONFIG_PROC_FS
2264 /* Proc filesystem TCP sock list dumping. */
2266 static unsigned short seq_file_family(const struct seq_file *seq);
2268 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2270 unsigned short family = seq_file_family(seq);
2272 /* AF_UNSPEC is used as a match all */
2273 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2274 net_eq(sock_net(sk), seq_file_net(seq)));
2277 /* Find a non empty bucket (starting from st->bucket)
2278 * and return the first sk from it.
2280 static void *listening_get_first(struct seq_file *seq)
2282 struct tcp_iter_state *st = seq->private;
2285 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2286 struct inet_listen_hashbucket *ilb2;
2287 struct hlist_nulls_node *node;
2290 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2291 if (hlist_nulls_empty(&ilb2->nulls_head))
2294 spin_lock(&ilb2->lock);
2295 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2296 if (seq_sk_match(seq, sk))
2299 spin_unlock(&ilb2->lock);
2305 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2306 * If "cur" is the last one in the st->bucket,
2307 * call listening_get_first() to return the first sk of the next
2310 static void *listening_get_next(struct seq_file *seq, void *cur)
2312 struct tcp_iter_state *st = seq->private;
2313 struct inet_listen_hashbucket *ilb2;
2314 struct hlist_nulls_node *node;
2315 struct sock *sk = cur;
2320 sk = sk_nulls_next(sk);
2321 sk_nulls_for_each_from(sk, node) {
2322 if (seq_sk_match(seq, sk))
2326 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2327 spin_unlock(&ilb2->lock);
2329 return listening_get_first(seq);
2332 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2334 struct tcp_iter_state *st = seq->private;
2339 rc = listening_get_first(seq);
2341 while (rc && *pos) {
2342 rc = listening_get_next(seq, rc);
2348 static inline bool empty_bucket(const struct tcp_iter_state *st)
2350 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2354 * Get first established socket starting from bucket given in st->bucket.
2355 * If st->bucket is zero, the very first socket in the hash is returned.
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}
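/* Resume an interrupted walk: find st->bucket again and skip st->offset
 * sockets inside it.  st->num is restored afterwards so the numbering of
 * the remaining entries does not change.
 */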
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > tcp_hashinfo.lhash2_mask)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
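/* Format one SYN_RECV request socket as a /proc/net/tcp line. */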
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
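/* Format one full TCP socket as a /proc/net/tcp line.  The receive queue
 * is sampled without the socket lock, so the reported value can be
 * transiently inaccurate.
 */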
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active = 1;
		timer_expires = icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active = 4;
		timer_expires = icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active = 2;
		timer_expires = sk->sk_timer.expires;
	} else {
		timer_active = 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
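/* bpf_iter support: sockets are collected one hash bucket at a time into
 * a batch, with a reference held on each, so the BPF program can run on
 * them without the bucket lock being held.
 */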
2673 #ifdef CONFIG_BPF_SYSCALL
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_tcp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}
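/* Collect every matching socket in the current listener bucket, starting
 * at start_sk, taking a reference on each one that fits in the batch.
 * The return value is the number of matching sockets seen, so the caller
 * can grow the batch and retry when the array was too small.
 */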
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);

	return expected;
}

static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock *start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	sk = sk_nulls_next(start_sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++] = sk;
			}
			expected++;
		}
	}
	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));

	return expected;
}
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > tcp_hashinfo.lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
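/* seq_ops for the "tcp" bpf_iter target: start()/next() hand out sockets
 * from the current batch and refill it from the hash tables once it runs
 * dry.
 */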
static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_tcp_batch(seq);

	return SEQ_START_TOKEN;
}

static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Move st->offset to the next sk in the bucket such that
		 * the future start() will resume at st->offset in
		 * st->bucket.  See tcp_seek_last_pos().
		 */
		st->offset++;
		sock_put(iter->batch[iter->cur_sk++]);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sock_i_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		unlock_sock_fast(sk, slow);
	return ret;
}
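/* stop() runs the program one final time with a NULL socket so it can
 * emit any trailing output, then drops the references still held in the
 * batch.
 */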
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_tcp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};

static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
2999 #endif /* CONFIG_PROC_FS */
/* @wake is one when sk_stream_write_space() calls us.
 * This sends EPOLLOUT only if notsent_bytes is half the limit.
 * This mimics the strategy used in sock_def_write_space().
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
			    READ_ONCE(tp->snd_nxt);

	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);
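/* The proto operations used by every IPv4 TCP socket (AF_INET, SOCK_STREAM). */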
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
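/* Per-netns teardown: drop this namespace's reference on its congestion
 * control module and on its TIME-WAIT death row; the latter is freed once
 * the last timewait socket using it is gone.
 */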
static void __net_exit tcp_sk_exit(struct net *net)
{
	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
		kfree(tcp_death_row);
}
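/* Per-netns setup: allocate the TIME-WAIT death row and fill in the
 * default values of this namespace's TCP sysctls.
 */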
static int __net_init tcp_sk_init(struct net *net)
{
	int cnt;

	net->ipv4.sysctl_tcp_ecn = 2;
3080 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3082 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3083 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3084 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3085 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3086 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3088 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3089 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3090 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3092 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3093 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3094 net->ipv4.sysctl_tcp_syncookies = 1;
3095 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3096 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3097 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3098 net->ipv4.sysctl_tcp_orphan_retries = 0;
3099 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3100 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3101 net->ipv4.sysctl_tcp_tw_reuse = 2;
3102 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3104 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
	if (!net->ipv4.tcp_death_row)
		return -ENOMEM;
	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3108 cnt = tcp_hashinfo.ehash_mask + 1;
3109 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3110 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3112 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3113 net->ipv4.sysctl_tcp_sack = 1;
3114 net->ipv4.sysctl_tcp_window_scaling = 1;
3115 net->ipv4.sysctl_tcp_timestamps = 1;
3116 net->ipv4.sysctl_tcp_early_retrans = 3;
3117 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3118 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3119 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3120 net->ipv4.sysctl_tcp_max_reordering = 300;
3121 net->ipv4.sysctl_tcp_dsack = 1;
3122 net->ipv4.sysctl_tcp_app_win = 31;
3123 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3124 net->ipv4.sysctl_tcp_frto = 2;
3125 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3126 /* This limits the percentage of the congestion window which we
3127 * will allow a single TSO frame to consume. Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
3130 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3131 /* Default TSQ limit of 16 TSO segments */
3132 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3133 /* rfc5961 challenge ack rate limiting */
3134 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3135 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3136 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3137 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3138 net->ipv4.sysctl_tcp_autocorking = 1;
3139 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3140 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3141 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3142 if (net != &init_net) {
3143 memcpy(net->ipv4.sysctl_tcp_rmem,
3144 init_net.ipv4.sysctl_tcp_rmem,
3145 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3146 memcpy(net->ipv4.sysctl_tcp_wmem,
3147 init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
3150 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3151 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3152 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3153 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3154 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3155 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
3184 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3185 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3186 struct sock_common *sk_common, uid_t uid)
3188 #define INIT_BATCH_SZ 16
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};

static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
#endif
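/* Boot-time initialisation: create one control socket per possible CPU
 * (used when sending RSTs and ACKs on behalf of SYN-RECV and TIME-WAIT
 * sockets) and register the per-netns operations above.
 */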
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		per_cpu(ipv4_tcp_sk, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
3280 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3281 bpf_iter_register();