1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Implementation of the Transmission Control Protocol(TCP).
9 * IPv4 specific functions
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
16 * See tcp.c for author information
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
48 #define pr_fmt(fmt) "TCP: " fmt
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
61 #include <net/net_namespace.h>
63 #include <net/inet_hashtables.h>
65 #include <net/transp_v6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
83 #include <trace/events/tcp.h>
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
98 tcp_hdr(skb)->source);
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 struct tcp_sock *tp = tcp_sk(sk);
111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
114 /* Still does not detect *everything* that goes through
115 * lo, since we require a loopback src or dst address
116 * or direct binding to 'lo' interface.
118 bool loopback = false;
119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 #if IS_ENABLED(CONFIG_IPV6)
122 if (tw->tw_family == AF_INET6) {
123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131 if (ipv4_is_loopback(tw->tw_daddr) ||
132 ipv4_is_loopback(tw->tw_rcv_saddr))
139 /* With PAWS, it is safe from the viewpoint
140 of data integrity. Even without PAWS it is safe provided sequence
141 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 Actually, the idea is close to VJ's one, only timestamp cache is
144 held not per host, but per port pair and the TW bucket is used as state holder.
147 If the TW bucket has already been destroyed we fall back to VJ's scheme
148 and use initial timestamp retrieved from peer table.
150 if (tcptw->tw_ts_recent_stamp &&
151 (!twp || (reuse && time_after32(ktime_get_seconds(),
152 tcptw->tw_ts_recent_stamp)))) {
153 /* In case of repair and re-using TIME-WAIT sockets we still
154 * want to be sure that it is safe as above but honor the
155 * sequence numbers and time stamps set as part of the repair process.
158 * Without this check re-using a TIME-WAIT socket with TCP
159 * repair would accumulate a -1 on the repair assigned
160 * sequence number. The first time it is reused the sequence
161 * is -1, the second time -2, etc. This fixes that issue
162 * without appearing to create any others.
164 if (likely(!tp->repair)) {
165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
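/* Step well past the old connection's sequence space: tw_snd_nxt plus
 * the largest unscaled window (65535) plus a margin, so segments of the
 * new connection cannot be confused with old duplicates.
 */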
169 WRITE_ONCE(tp->write_seq, seq);
170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
184 /* This check is replicated from tcp_v4_connect() and intended to
185 * prevent the BPF program called below from accessing bytes that are
186 * outside the bound specified by the user in addr_len.
188 if (addr_len < sizeof(struct sockaddr_in))
191 sock_owned_by_me(sk);
193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 struct inet_sock *inet = inet_sk(sk);
201 struct tcp_sock *tp = tcp_sk(sk);
202 __be16 orig_sport, orig_dport;
203 __be32 daddr, nexthop;
207 struct ip_options_rcu *inet_opt;
208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 if (addr_len < sizeof(struct sockaddr_in))
213 if (usin->sin_family != AF_INET)
214 return -EAFNOSUPPORT;
216 nexthop = daddr = usin->sin_addr.s_addr;
217 inet_opt = rcu_dereference_protected(inet->inet_opt,
218 lockdep_sock_is_held(sk));
219 if (inet_opt && inet_opt->opt.srr) {
222 nexthop = inet_opt->opt.faddr;
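/* With IP source routing (SRR) the connect route goes to the first
 * hop in the option (faddr) rather than to the final destination.
 */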
225 orig_sport = inet->inet_sport;
226 orig_dport = usin->sin_port;
227 fl4 = &inet->cork.fl.u.ip4;
228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 orig_sport, orig_dport, sk);
234 if (err == -ENETUNREACH)
235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244 if (!inet_opt || !inet_opt->opt.srr)
247 if (!inet->inet_saddr)
248 inet->inet_saddr = fl4->saddr;
249 sk_rcv_saddr_set(sk, inet->inet_saddr);
251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 /* Reset inherited state */
253 tp->rx_opt.ts_recent = 0;
254 tp->rx_opt.ts_recent_stamp = 0;
255 if (likely(!tp->repair))
256 WRITE_ONCE(tp->write_seq, 0);
259 inet->inet_dport = usin->sin_port;
260 sk_daddr_set(sk, daddr);
262 inet_csk(sk)->icsk_ext_hdr_len = 0;
264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 /* Socket identity is still unknown (sport may be zero).
269 * However we set state to SYN-SENT and, without releasing the socket
270 * lock, select a source port, enter ourselves into the hash tables and
271 * complete initialization after this.
273 tcp_set_state(sk, TCP_SYN_SENT);
274 err = inet_hash_connect(tcp_death_row, sk);
280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 inet->inet_sport, inet->inet_dport, sk);
287 /* OK, now commit destination to socket. */
288 sk->sk_gso_type = SKB_GSO_TCPV4;
289 sk_setup_caps(sk, &rt->dst);
292 if (likely(!tp->repair)) {
294 WRITE_ONCE(tp->write_seq,
295 secure_tcp_seq(inet->inet_saddr,
299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
304 inet->inet_id = prandom_u32();
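/* Start the IP ID counter for this connection at a random value. */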
306 if (tcp_fastopen_defer_connect(sk, &err))
311 err = tcp_connect(sk);
320 * This unhashes the socket and releases the local port,
323 tcp_set_state(sk, TCP_CLOSE);
325 sk->sk_route_caps = 0;
326 inet->inet_dport = 0;
329 EXPORT_SYMBOL(tcp_v4_connect);
332 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333 * It can be called through tcp_release_cb() if socket was owned by user
334 * at the time tcp_v4_err() was called to handle ICMP message.
336 void tcp_v4_mtu_reduced(struct sock *sk)
338 struct inet_sock *inet = inet_sk(sk);
339 struct dst_entry *dst;
342 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 mtu = tcp_sk(sk)->mtu_info;
345 dst = inet_csk_update_pmtu(sk, mtu);
349 /* Something is about to go wrong... Remember the soft error
350 * for the case where this connection is not able to recover.
352 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 sk->sk_err_soft = EMSGSIZE;
357 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 ip_sk_accept_pmtu(sk) &&
359 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 tcp_sync_mss(sk, mtu);
362 /* Resend the TCP packet because it's
363 * clear that the old packet has been
364 * dropped. This is the new "fast" path mtu discovery.
367 tcp_simple_retransmit(sk);
368 } /* else let the usual retransmit timer handle it */
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 struct dst_entry *dst = __sk_dst_check(sk, 0);
377 dst->ops->redirect(dst, sk, skb);
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 struct request_sock *req = inet_reqsk(sk);
385 struct net *net = sock_net(sk);
387 /* ICMPs are not backlogged, hence we cannot get
388 * an established socket here.
390 if (seq != tcp_rsk(req)->snt_isn) {
391 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 * Still in SYN_RECV, just remove it silently.
395 * There is no good way to pass the error to the newly
396 * created socket, and POSIX does not want network
397 * errors returned from accept().
399 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 tcp_listendrop(req->rsk_listener);
404 EXPORT_SYMBOL(tcp_req_err);
407 * This routine is called by the ICMP module when it gets some
408 * sort of error condition. If err < 0 then the socket should
409 * be closed and the error returned to the user. If err > 0
410 * it's just the icmp type << 8 | icmp code. After adjustment
411 * header points to the first 8 bytes of the tcp header. We need
412 * to find the appropriate port.
414 * The locking strategy used here is very "optimistic". When
415 * someone else accesses the socket the ICMP is just dropped
416 * and for some paths there is no check at all.
417 * A more general error queue to queue errors for later handling
418 * is probably better.
422 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
424 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
425 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
426 struct inet_connection_sock *icsk;
428 struct inet_sock *inet;
429 const int type = icmp_hdr(icmp_skb)->type;
430 const int code = icmp_hdr(icmp_skb)->code;
433 struct request_sock *fastopen;
438 struct net *net = dev_net(icmp_skb->dev);
440 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
441 th->dest, iph->saddr, ntohs(th->source),
442 inet_iif(icmp_skb), 0);
444 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
447 if (sk->sk_state == TCP_TIME_WAIT) {
448 inet_twsk_put(inet_twsk(sk));
451 seq = ntohl(th->seq);
452 if (sk->sk_state == TCP_NEW_SYN_RECV) {
453 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
454 type == ICMP_TIME_EXCEEDED ||
455 (type == ICMP_DEST_UNREACH &&
456 (code == ICMP_NET_UNREACH ||
457 code == ICMP_HOST_UNREACH)));
462 /* If too many ICMPs get dropped on busy
463 * servers this needs to be solved differently.
464 * We do take care of PMTU discovery (RFC1191) special case :
465 * we can receive locally generated ICMP messages while socket is held.
467 if (sock_owned_by_user(sk)) {
468 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
469 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
471 if (sk->sk_state == TCP_CLOSE)
474 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
475 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
481 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
482 fastopen = rcu_dereference(tp->fastopen_rsk);
483 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
484 if (sk->sk_state != TCP_LISTEN &&
485 !between(seq, snd_una, tp->snd_nxt)) {
486 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
492 if (!sock_owned_by_user(sk))
493 do_redirect(icmp_skb, sk);
495 case ICMP_SOURCE_QUENCH:
496 /* Just silently ignore these. */
498 case ICMP_PARAMETERPROB:
501 case ICMP_DEST_UNREACH:
502 if (code > NR_ICMP_UNREACH)
505 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
506 /* We are not interested in TCP_LISTEN and open_requests
507 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
508 * they should go through unfragmented).
510 if (sk->sk_state == TCP_LISTEN)
514 if (!sock_owned_by_user(sk)) {
515 tcp_v4_mtu_reduced(sk);
517 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
523 err = icmp_err_convert[code].errno;
524 /* check if icmp_skb allows revert of backoff
525 * (see draft-zimmermann-tcp-lcd) */
526 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
528 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
529 !icsk->icsk_backoff || fastopen)
532 if (sock_owned_by_user(sk))
535 skb = tcp_rtx_queue_head(sk);
536 if (WARN_ON_ONCE(!skb))
539 icsk->icsk_backoff--;
540 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
542 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
545 tcp_mstamp_refresh(tp);
546 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
547 remaining = icsk->icsk_rto -
548 usecs_to_jiffies(delta_us);
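/* remaining = the part of the freshly recomputed RTO that has not yet
 * elapsed since the head of the retransmit queue was last sent; re-arm
 * the retransmit timer if some time remains, otherwise retransmit now.
 */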
551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 remaining, TCP_RTO_MAX);
554 /* RTO revert clocked out retransmission.
555 * Will retransmit now */
556 tcp_retransmit_timer(sk);
560 case ICMP_TIME_EXCEEDED:
567 switch (sk->sk_state) {
570 /* Only in fast or simultaneous open. If a fast open socket is
571 * already accepted it is treated as a connected one below.
573 if (fastopen && !fastopen->sk)
576 if (!sock_owned_by_user(sk)) {
579 sk->sk_error_report(sk);
583 sk->sk_err_soft = err;
588 /* If we've already connected we will keep trying
589 * until we time out, or the user gives up.
591 * rfc1122 4.2.3.9 allows us to consider as hard errors
592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 * but it is obsoleted by pmtu discovery).
595 * Note that in the modern internet, where routing is unreliable
596 * and broken firewalls sit in every dark corner, sending random
597 * errors ordered by their masters, even these two messages finally lose
598 * their original sense (even Linux sends invalid PORT_UNREACHs).
600 * Now we are in compliance with RFCs.
605 if (!sock_owned_by_user(sk) && inet->recverr) {
607 sk->sk_error_report(sk);
608 } else { /* Only an error on timeout */
609 sk->sk_err_soft = err;
618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
620 struct tcphdr *th = tcp_hdr(skb);
622 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 skb->csum_start = skb_transport_header(skb) - skb->head;
624 skb->csum_offset = offsetof(struct tcphdr, check);
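/* Only the pseudo-header checksum is seeded here; csum_start/csum_offset
 * tell the device (or the software fallback) where to complete the
 * checksum, CHECKSUM_PARTIAL style.
 */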
627 /* This routine computes an IPv4 TCP checksum. */
628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
630 const struct inet_sock *inet = inet_sk(sk);
632 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
634 EXPORT_SYMBOL(tcp_v4_send_check);
637 * This routine will send an RST to the other tcp.
639 * Someone asks: why do we never use socket parameters (TOS, TTL, etc.)?
641 * Answer: if a packet caused an RST, it is not for a socket
642 * existing in our system; if it is matched to a socket,
643 * it is just a duplicate segment or a bug in the other side's TCP.
644 * So we build the reply based only on the parameters
645 * that arrived with the segment.
646 * Exception: precedence violation. We do not implement it in any case.
649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
651 const struct tcphdr *th = tcp_hdr(skb);
654 #ifdef CONFIG_TCP_MD5SIG
655 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
658 struct ip_reply_arg arg;
659 #ifdef CONFIG_TCP_MD5SIG
660 struct tcp_md5sig_key *key = NULL;
661 const __u8 *hash_location = NULL;
662 unsigned char newhash[16];
664 struct sock *sk1 = NULL;
666 u64 transmit_time = 0;
670 /* Never send a reset in response to a reset. */
674 /* If sk not NULL, it means we did a successful lookup and incoming
675 * route had to be correct. prequeue might have dropped our dst.
677 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
680 /* Swap the send and the receive. */
681 memset(&rep, 0, sizeof(rep));
682 rep.th.dest = th->source;
683 rep.th.source = th->dest;
684 rep.th.doff = sizeof(struct tcphdr) / 4;
688 rep.th.seq = th->ack_seq;
691 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 skb->len - (th->doff << 2));
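/* Reset generation per RFC 793: if the offending segment carried an
 * ACK, the RST takes its sequence number from that ACK; otherwise it
 * acknowledges everything the segment consumed (SEQ + SYN + FIN +
 * payload length).
 */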
695 memset(&arg, 0, sizeof(arg));
696 arg.iov[0].iov_base = (unsigned char *)&rep;
697 arg.iov[0].iov_len = sizeof(rep.th);
699 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
700 #ifdef CONFIG_TCP_MD5SIG
702 hash_location = tcp_parse_md5sig_option(th);
703 if (sk && sk_fullsock(sk)) {
704 const union tcp_md5_addr *addr;
707 /* sdif set, means packet ingressed via a device
708 * in an L3 domain and inet_iif is set to it.
710 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
711 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
712 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
713 } else if (hash_location) {
714 const union tcp_md5_addr *addr;
715 int sdif = tcp_v4_sdif(skb);
716 int dif = inet_iif(skb);
720 * active side is lost. Try to find listening socket through
721 * source port, and then find md5 key through listening socket.
722 * we do not lose security here:
723 * the incoming packet is checked against the md5 hash of the found key;
724 * no RST generated if md5 hash doesn't match.
726 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
728 th->source, ip_hdr(skb)->daddr,
729 ntohs(th->source), dif, sdif);
730 /* don't send rst if it can't find key */
734 /* sdif set, means packet ingressed via a device
735 * in an L3 domain and dif is set to it.
737 l3index = sdif ? dif : 0;
738 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
739 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
744 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
745 if (genhash || memcmp(hash_location, newhash, 16) != 0)
751 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
753 (TCPOPT_MD5SIG << 8) |
755 /* Update length and the length the header thinks exists */
756 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
757 rep.th.doff = arg.iov[0].iov_len / 4;
759 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
760 key, ip_hdr(skb)->saddr,
761 ip_hdr(skb)->daddr, &rep.th);
764 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
765 ip_hdr(skb)->saddr, /* XXX */
766 arg.iov[0].iov_len, IPPROTO_TCP, 0);
767 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
768 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
770 /* When socket is gone, all binding information is lost.
771 * routing might fail in this case. No choice here: if we choose to force
772 * the input interface, we will misroute in the case of an asymmetric route.
775 arg.bound_dev_if = sk->sk_bound_dev_if;
777 trace_tcp_send_reset(sk, skb);
780 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
781 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
783 arg.tos = ip_hdr(skb)->tos;
784 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
786 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
788 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
789 inet_twsk(sk)->tw_mark : sk->sk_mark;
790 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
791 inet_twsk(sk)->tw_priority : sk->sk_priority;
792 transmit_time = tcp_transmit_time(sk);
794 ip_send_unicast_reply(ctl_sk,
795 skb, &TCP_SKB_CB(skb)->header.h4.opt,
796 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
797 &arg, arg.iov[0].iov_len,
801 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
802 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
805 #ifdef CONFIG_TCP_MD5SIG
811 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
812 outside socket context, is certainly ugly. What can I do?
815 static void tcp_v4_send_ack(const struct sock *sk,
816 struct sk_buff *skb, u32 seq, u32 ack,
817 u32 win, u32 tsval, u32 tsecr, int oif,
818 struct tcp_md5sig_key *key,
819 int reply_flags, u8 tos)
821 const struct tcphdr *th = tcp_hdr(skb);
824 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
825 #ifdef CONFIG_TCP_MD5SIG
826 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
830 struct net *net = sock_net(sk);
831 struct ip_reply_arg arg;
835 memset(&rep.th, 0, sizeof(struct tcphdr));
836 memset(&arg, 0, sizeof(arg));
838 arg.iov[0].iov_base = (unsigned char *)&rep;
839 arg.iov[0].iov_len = sizeof(rep.th);
841 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
842 (TCPOPT_TIMESTAMP << 8) |
844 rep.opt[1] = htonl(tsval);
845 rep.opt[2] = htonl(tsecr);
846 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
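/* rep.opt[0..2] carry the timestamp option: NOP, NOP, kind TIMESTAMP,
 * length 10, followed by TSval and TSecr.
 */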
849 /* Swap the send and the receive. */
850 rep.th.dest = th->source;
851 rep.th.source = th->dest;
852 rep.th.doff = arg.iov[0].iov_len / 4;
853 rep.th.seq = htonl(seq);
854 rep.th.ack_seq = htonl(ack);
856 rep.th.window = htons(win);
858 #ifdef CONFIG_TCP_MD5SIG
860 int offset = (tsecr) ? 3 : 0;
862 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
864 (TCPOPT_MD5SIG << 8) |
866 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
867 rep.th.doff = arg.iov[0].iov_len/4;
869 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
870 key, ip_hdr(skb)->saddr,
871 ip_hdr(skb)->daddr, &rep.th);
874 arg.flags = reply_flags;
875 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
876 ip_hdr(skb)->saddr, /* XXX */
877 arg.iov[0].iov_len, IPPROTO_TCP, 0);
878 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
880 arg.bound_dev_if = oif;
882 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
884 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
885 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
886 inet_twsk(sk)->tw_mark : sk->sk_mark;
887 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
888 inet_twsk(sk)->tw_priority : sk->sk_priority;
889 transmit_time = tcp_transmit_time(sk);
890 ip_send_unicast_reply(ctl_sk,
891 skb, &TCP_SKB_CB(skb)->header.h4.opt,
892 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
893 &arg, arg.iov[0].iov_len,
897 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
901 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
903 struct inet_timewait_sock *tw = inet_twsk(sk);
904 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
906 tcp_v4_send_ack(sk, skb,
907 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
908 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
909 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
912 tcp_twsk_md5_key(tcptw),
913 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
920 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
921 struct request_sock *req)
923 const union tcp_md5_addr *addr;
926 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
927 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
929 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
933 * The window field (SEG.WND) of every outgoing segment, with the
934 * exception of <SYN> segments, MUST be right-shifted by
935 * Rcv.Wind.Shift bits:
937 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
938 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
939 tcp_v4_send_ack(sk, skb, seq,
940 tcp_rsk(req)->rcv_nxt,
941 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
942 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
945 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
946 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
951 * Send a SYN-ACK after having received a SYN.
952 * This still operates on a request_sock only, not on a big socket.
955 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
957 struct request_sock *req,
958 struct tcp_fastopen_cookie *foc,
959 enum tcp_synack_type synack_type)
961 const struct inet_request_sock *ireq = inet_rsk(req);
966 /* First, grab a route. */
967 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
970 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
973 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
976 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
978 rcu_dereference(ireq->ireq_opt));
980 err = net_xmit_eval(err);
987 * IPv4 request_sock destructor.
989 static void tcp_v4_reqsk_destructor(struct request_sock *req)
991 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
994 #ifdef CONFIG_TCP_MD5SIG
996 * RFC2385 MD5 checksumming requires a mapping of
997 * IP address->MD5 Key.
998 * We need to maintain these in the sk structure.
1001 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1002 EXPORT_SYMBOL(tcp_md5_needed);
1004 /* Find the Key structure for an address. */
1005 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1006 const union tcp_md5_addr *addr,
1009 const struct tcp_sock *tp = tcp_sk(sk);
1010 struct tcp_md5sig_key *key;
1011 const struct tcp_md5sig_info *md5sig;
1013 struct tcp_md5sig_key *best_match = NULL;
1016 /* caller either holds rcu_read_lock() or socket lock */
1017 md5sig = rcu_dereference_check(tp->md5sig_info,
1018 lockdep_sock_is_held(sk));
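/* Keys may cover address prefixes; scan them all and prefer the most
 * specific (longest prefix) match for this address and l3index.
 */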
1022 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1023 lockdep_sock_is_held(sk)) {
1024 if (key->family != family)
1026 if (key->l3index && key->l3index != l3index)
1028 if (family == AF_INET) {
1029 mask = inet_make_mask(key->prefixlen);
1030 match = (key->addr.a4.s_addr & mask) ==
1031 (addr->a4.s_addr & mask);
1032 #if IS_ENABLED(CONFIG_IPV6)
1033 } else if (family == AF_INET6) {
1034 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1041 if (match && (!best_match ||
1042 key->prefixlen > best_match->prefixlen))
1047 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1049 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1050 const union tcp_md5_addr *addr,
1051 int family, u8 prefixlen,
1054 const struct tcp_sock *tp = tcp_sk(sk);
1055 struct tcp_md5sig_key *key;
1056 unsigned int size = sizeof(struct in_addr);
1057 const struct tcp_md5sig_info *md5sig;
1059 /* caller either holds rcu_read_lock() or socket lock */
1060 md5sig = rcu_dereference_check(tp->md5sig_info,
1061 lockdep_sock_is_held(sk));
1064 #if IS_ENABLED(CONFIG_IPV6)
1065 if (family == AF_INET6)
1066 size = sizeof(struct in6_addr);
1068 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1069 lockdep_sock_is_held(sk)) {
1070 if (key->family != family)
1072 if (key->l3index && key->l3index != l3index)
1074 if (!memcmp(&key->addr, addr, size) &&
1075 key->prefixlen == prefixlen)
1081 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1082 const struct sock *addr_sk)
1084 const union tcp_md5_addr *addr;
1087 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1088 addr_sk->sk_bound_dev_if);
1089 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1090 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1092 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1094 /* This can be called on a newly created socket, from other files */
1095 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1096 int family, u8 prefixlen, int l3index,
1097 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1099 /* Add Key to the list */
1100 struct tcp_md5sig_key *key;
1101 struct tcp_sock *tp = tcp_sk(sk);
1102 struct tcp_md5sig_info *md5sig;
1104 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1106 /* Pre-existing entry - just update that one. */
1107 memcpy(key->key, newkey, newkeylen);
1108 key->keylen = newkeylen;
1112 md5sig = rcu_dereference_protected(tp->md5sig_info,
1113 lockdep_sock_is_held(sk));
1115 md5sig = kmalloc(sizeof(*md5sig), gfp);
1119 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
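/* TCP-MD5 must sign every segment individually, so segmentation
 * offload (GSO) is disabled on the socket once a key is installed.
 */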
1120 INIT_HLIST_HEAD(&md5sig->head);
1121 rcu_assign_pointer(tp->md5sig_info, md5sig);
1124 key = sock_kmalloc(sk, sizeof(*key), gfp);
1127 if (!tcp_alloc_md5sig_pool()) {
1128 sock_kfree_s(sk, key, sizeof(*key));
1132 memcpy(key->key, newkey, newkeylen);
1133 key->keylen = newkeylen;
1134 key->family = family;
1135 key->prefixlen = prefixlen;
1136 key->l3index = l3index;
1137 memcpy(&key->addr, addr,
1138 (family == AF_INET6) ? sizeof(struct in6_addr) :
1139 sizeof(struct in_addr));
1140 hlist_add_head_rcu(&key->node, &md5sig->head);
1143 EXPORT_SYMBOL(tcp_md5_do_add);
1145 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1146 u8 prefixlen, int l3index)
1148 struct tcp_md5sig_key *key;
1150 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1153 hlist_del_rcu(&key->node);
1154 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1155 kfree_rcu(key, rcu);
1158 EXPORT_SYMBOL(tcp_md5_do_del);
1160 static void tcp_clear_md5_list(struct sock *sk)
1162 struct tcp_sock *tp = tcp_sk(sk);
1163 struct tcp_md5sig_key *key;
1164 struct hlist_node *n;
1165 struct tcp_md5sig_info *md5sig;
1167 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1169 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1170 hlist_del_rcu(&key->node);
1171 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1172 kfree_rcu(key, rcu);
1176 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1177 char __user *optval, int optlen)
1179 struct tcp_md5sig cmd;
1180 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1181 const union tcp_md5_addr *addr;
1185 if (optlen < sizeof(cmd))
1188 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1191 if (sin->sin_family != AF_INET)
1194 if (optname == TCP_MD5SIG_EXT &&
1195 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1196 prefixlen = cmd.tcpm_prefixlen;
1201 if (optname == TCP_MD5SIG_EXT &&
1202 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1203 struct net_device *dev;
1206 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1207 if (dev && netif_is_l3_master(dev))
1208 l3index = dev->ifindex;
1212 /* ok to reference set/not set outside of rcu;
1213 * right now device MUST be an L3 master
1215 if (!dev || !l3index)
1219 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1221 if (!cmd.tcpm_keylen)
1222 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1224 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1227 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1228 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1231 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1232 __be32 daddr, __be32 saddr,
1233 const struct tcphdr *th, int nbytes)
1235 struct tcp4_pseudohdr *bp;
1236 struct scatterlist sg;
1243 bp->protocol = IPPROTO_TCP;
1244 bp->len = cpu_to_be16(nbytes);
1246 _th = (struct tcphdr *)(bp + 1);
1247 memcpy(_th, th, sizeof(*th));
1250 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1251 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1252 sizeof(*bp) + sizeof(*th));
1253 return crypto_ahash_update(hp->md5_req);
1256 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1257 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1259 struct tcp_md5sig_pool *hp;
1260 struct ahash_request *req;
1262 hp = tcp_get_md5sig_pool();
1264 goto clear_hash_noput;
1267 if (crypto_ahash_init(req))
1269 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1271 if (tcp_md5_hash_key(hp, key))
1273 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1274 if (crypto_ahash_final(req))
1277 tcp_put_md5sig_pool();
1281 tcp_put_md5sig_pool();
1283 memset(md5_hash, 0, 16);
1287 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1288 const struct sock *sk,
1289 const struct sk_buff *skb)
1291 struct tcp_md5sig_pool *hp;
1292 struct ahash_request *req;
1293 const struct tcphdr *th = tcp_hdr(skb);
1294 __be32 saddr, daddr;
1296 if (sk) { /* valid for establish/request sockets */
1297 saddr = sk->sk_rcv_saddr;
1298 daddr = sk->sk_daddr;
1300 const struct iphdr *iph = ip_hdr(skb);
1305 hp = tcp_get_md5sig_pool();
1307 goto clear_hash_noput;
1310 if (crypto_ahash_init(req))
1313 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1315 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1317 if (tcp_md5_hash_key(hp, key))
1319 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1320 if (crypto_ahash_final(req))
1323 tcp_put_md5sig_pool();
1327 tcp_put_md5sig_pool();
1329 memset(md5_hash, 0, 16);
1332 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1336 /* Called with rcu_read_lock() */
1337 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1338 const struct sk_buff *skb,
1341 #ifdef CONFIG_TCP_MD5SIG
1343 * This gets called for each TCP segment that arrives
1344 * so we want to be efficient.
1345 * We have 3 drop cases:
1346 * o No MD5 hash and one expected.
1347 * o MD5 hash and we're not expecting one.
1348 * o MD5 hash and it's wrong.
1350 const __u8 *hash_location = NULL;
1351 struct tcp_md5sig_key *hash_expected;
1352 const struct iphdr *iph = ip_hdr(skb);
1353 const struct tcphdr *th = tcp_hdr(skb);
1354 const union tcp_md5_addr *addr;
1355 unsigned char newhash[16];
1356 int genhash, l3index;
1358 /* sdif set, means packet ingressed via a device
1359 * in an L3 domain and dif is set to the l3mdev
1361 l3index = sdif ? dif : 0;
1363 addr = (union tcp_md5_addr *)&iph->saddr;
1364 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1365 hash_location = tcp_parse_md5sig_option(th);
1367 /* We've parsed the options - do we have a hash? */
1368 if (!hash_expected && !hash_location)
1371 if (hash_expected && !hash_location) {
1372 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1376 if (!hash_expected && hash_location) {
1377 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1381 /* Okay, so this is hash_expected and hash_location -
1382 * so we need to calculate the checksum.
1384 genhash = tcp_v4_md5_hash_skb(newhash,
1388 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1389 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1390 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1391 &iph->saddr, ntohs(th->source),
1392 &iph->daddr, ntohs(th->dest),
1393 genhash ? " tcp_v4_calc_md5_hash failed"
1402 static void tcp_v4_init_req(struct request_sock *req,
1403 const struct sock *sk_listener,
1404 struct sk_buff *skb)
1406 struct inet_request_sock *ireq = inet_rsk(req);
1407 struct net *net = sock_net(sk_listener);
1409 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1410 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1411 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1414 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1416 const struct request_sock *req)
1418 return inet_csk_route_req(sk, &fl->u.ip4, req);
1421 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1423 .obj_size = sizeof(struct tcp_request_sock),
1424 .rtx_syn_ack = tcp_rtx_synack,
1425 .send_ack = tcp_v4_reqsk_send_ack,
1426 .destructor = tcp_v4_reqsk_destructor,
1427 .send_reset = tcp_v4_send_reset,
1428 .syn_ack_timeout = tcp_syn_ack_timeout,
1431 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1432 .mss_clamp = TCP_MSS_DEFAULT,
1433 #ifdef CONFIG_TCP_MD5SIG
1434 .req_md5_lookup = tcp_v4_md5_lookup,
1435 .calc_md5_hash = tcp_v4_md5_hash_skb,
1437 .init_req = tcp_v4_init_req,
1438 #ifdef CONFIG_SYN_COOKIES
1439 .cookie_init_seq = cookie_v4_init_sequence,
1441 .route_req = tcp_v4_route_req,
1442 .init_seq = tcp_v4_init_seq,
1443 .init_ts_off = tcp_v4_init_ts_off,
1444 .send_synack = tcp_v4_send_synack,
1447 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1449 /* Never answer SYNs sent to broadcast or multicast addresses */
1450 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1453 return tcp_conn_request(&tcp_request_sock_ops,
1454 &tcp_request_sock_ipv4_ops, sk, skb);
1460 EXPORT_SYMBOL(tcp_v4_conn_request);
1464 * The three way handshake has completed - we got a valid synack -
1465 * now create the new socket.
1467 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1468 struct request_sock *req,
1469 struct dst_entry *dst,
1470 struct request_sock *req_unhash,
1473 struct inet_request_sock *ireq;
1474 struct inet_sock *newinet;
1475 struct tcp_sock *newtp;
1477 #ifdef CONFIG_TCP_MD5SIG
1478 const union tcp_md5_addr *addr;
1479 struct tcp_md5sig_key *key;
1482 struct ip_options_rcu *inet_opt;
1484 if (sk_acceptq_is_full(sk))
1487 newsk = tcp_create_openreq_child(sk, req, skb);
1491 newsk->sk_gso_type = SKB_GSO_TCPV4;
1492 inet_sk_rx_dst_set(newsk, skb);
1494 newtp = tcp_sk(newsk);
1495 newinet = inet_sk(newsk);
1496 ireq = inet_rsk(req);
1497 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1498 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1499 newsk->sk_bound_dev_if = ireq->ir_iif;
1500 newinet->inet_saddr = ireq->ir_loc_addr;
1501 inet_opt = rcu_dereference(ireq->ireq_opt);
1502 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1503 newinet->mc_index = inet_iif(skb);
1504 newinet->mc_ttl = ip_hdr(skb)->ttl;
1505 newinet->rcv_tos = ip_hdr(skb)->tos;
1506 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1508 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1509 newinet->inet_id = prandom_u32();
1512 dst = inet_csk_route_child_sock(sk, newsk, req);
1516 /* syncookie case : see end of cookie_v4_check() */
1518 sk_setup_caps(newsk, dst);
1520 tcp_ca_openreq_child(newsk, dst);
1522 tcp_sync_mss(newsk, dst_mtu(dst));
1523 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1525 tcp_initialize_rcv_mss(newsk);
1527 #ifdef CONFIG_TCP_MD5SIG
1528 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1529 /* Copy over the MD5 key from the original socket */
1530 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1531 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1534 * We're using one, so create a matching key
1535 * on the newsk structure. If we fail to get
1536 * memory, then we end up not copying the key across.
1539 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1540 key->key, key->keylen, GFP_ATOMIC);
1541 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1545 if (__inet_inherit_port(sk, newsk) < 0)
1547 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1548 if (likely(*own_req)) {
1549 tcp_move_syn(newtp, req);
1550 ireq->ireq_opt = NULL;
1552 newinet->inet_opt = NULL;
1557 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1564 newinet->inet_opt = NULL;
1565 inet_csk_prepare_forced_close(newsk);
1569 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1571 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1573 #ifdef CONFIG_SYN_COOKIES
1574 const struct tcphdr *th = tcp_hdr(skb);
1577 sk = cookie_v4_check(sk, skb);
1582 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1583 struct tcphdr *th, u32 *cookie)
1586 #ifdef CONFIG_SYN_COOKIES
1587 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1588 &tcp_request_sock_ipv4_ops, sk, th);
1590 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1591 tcp_synq_overflow(sk);
1597 /* The socket must have its spinlock held when we get
1598 * here, unless it is a TCP_LISTEN socket.
1600 * We have a potential double-lock case here, so even when
1601 * doing backlog processing we use the BH locking scheme.
1602 * This is because we cannot sleep with the original spinlock held.
1605 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1609 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1610 struct dst_entry *dst = sk->sk_rx_dst;
1612 sock_rps_save_rxhash(sk, skb);
1613 sk_mark_napi_id(sk, skb);
1615 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1616 !dst->ops->check(dst, 0)) {
1618 sk->sk_rx_dst = NULL;
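/* The cached input route no longer matches the ingress device or has
 * been invalidated; drop it so it can be re-learned later.
 */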
1621 tcp_rcv_established(sk, skb);
1625 if (tcp_checksum_complete(skb))
1628 if (sk->sk_state == TCP_LISTEN) {
1629 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1634 if (tcp_child_process(sk, nsk, skb)) {
1641 sock_rps_save_rxhash(sk, skb);
1643 if (tcp_rcv_state_process(sk, skb)) {
1650 tcp_v4_send_reset(rsk, skb);
1653 /* Be careful here. If this function gets more complicated and
1654 * gcc suffers from register pressure on the x86, sk (in %ebx)
1655 * might be destroyed here. This current version compiles correctly,
1656 * but you have been warned.
1661 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1662 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1665 EXPORT_SYMBOL(tcp_v4_do_rcv);
1667 int tcp_v4_early_demux(struct sk_buff *skb)
1669 const struct iphdr *iph;
1670 const struct tcphdr *th;
1673 if (skb->pkt_type != PACKET_HOST)
1676 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1682 if (th->doff < sizeof(struct tcphdr) / 4)
1685 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1686 iph->saddr, th->source,
1687 iph->daddr, ntohs(th->dest),
1688 skb->skb_iif, inet_sdif(skb));
1691 skb->destructor = sock_edemux;
1692 if (sk_fullsock(sk)) {
1693 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1696 dst = dst_check(dst, 0);
1698 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1699 skb_dst_set_noref(skb, dst);
1705 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1707 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1708 struct skb_shared_info *shinfo;
1709 const struct tcphdr *th;
1710 struct tcphdr *thtail;
1711 struct sk_buff *tail;
1712 unsigned int hdrlen;
1717 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1718 * we can fix skb->truesize to its real value to avoid future drops.
1719 * This is valid because skb is not yet charged to the socket.
1720 * It has been noticed pure SACK packets were sometimes dropped
1721 * (if cooked by drivers without copybreak feature).
1727 if (unlikely(tcp_checksum_complete(skb))) {
1729 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1730 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1734 /* Attempt coalescing to last skb in backlog, even if we are above the limits.
1736 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1738 th = (const struct tcphdr *)skb->data;
1739 hdrlen = th->doff * 4;
1740 shinfo = skb_shinfo(skb);
1742 if (!shinfo->gso_size)
1743 shinfo->gso_size = skb->len - hdrlen;
1745 if (!shinfo->gso_segs)
1746 shinfo->gso_segs = 1;
1748 tail = sk->sk_backlog.tail;
1751 thtail = (struct tcphdr *)tail->data;
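/* Coalesce only if the new segment directly follows the backlog tail,
 * has the same DSCP/ECN bits and TCP options, carries an ACK, and
 * neither segment sets SYN/RST/URG or flips ECE/CWR.
 */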
1753 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1754 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1755 ((TCP_SKB_CB(tail)->tcp_flags |
1756 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1757 !((TCP_SKB_CB(tail)->tcp_flags &
1758 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1759 ((TCP_SKB_CB(tail)->tcp_flags ^
1760 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1761 #ifdef CONFIG_TLS_DEVICE
1762 tail->decrypted != skb->decrypted ||
1764 thtail->doff != th->doff ||
1765 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1768 __skb_pull(skb, hdrlen);
1769 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1770 thtail->window = th->window;
1772 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1774 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1775 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1777 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1778 * thtail->fin, so that the fast path in tcp_rcv_established()
1779 * is not entered if we append a packet with a FIN.
1780 * SYN, RST, URG are not present.
1781 * ACK is set on both packets.
1782 * PSH : we do not really care in TCP stack,
1783 * at least for 'GRO' packets.
1785 thtail->fin |= th->fin;
1786 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1788 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1789 TCP_SKB_CB(tail)->has_rxtstamp = true;
1790 tail->tstamp = skb->tstamp;
1791 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1794 /* Not as strict as GRO. We only need to carry mss max value */
1795 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1796 skb_shinfo(tail)->gso_size);
1798 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1799 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1801 sk->sk_backlog.len += delta;
1802 __NET_INC_STATS(sock_net(sk),
1803 LINUX_MIB_TCPBACKLOGCOALESCE);
1804 kfree_skb_partial(skb, fragstolen);
1807 __skb_push(skb, hdrlen);
1810 /* Only socket owner can try to collapse/prune rx queues
1811 * to reduce memory overhead, so add a little headroom here.
1812 * Only a few socket backlogs can possibly be non-empty concurrently.
1816 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1818 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1823 EXPORT_SYMBOL(tcp_add_backlog);
1825 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1827 struct tcphdr *th = (struct tcphdr *)skb->data;
1829 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1831 EXPORT_SYMBOL(tcp_filter);
1833 static void tcp_v4_restore_cb(struct sk_buff *skb)
1835 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1836 sizeof(struct inet_skb_parm));
1839 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1840 const struct tcphdr *th)
1842 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1843 * barrier() makes sure the compiler won't play fool^Waliasing games.
1845 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1846 sizeof(struct inet_skb_parm));
1849 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1850 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1851 skb->len - th->doff * 4);
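/* end_seq accounts for the payload plus one unit of sequence space
 * each for SYN and FIN, per TCP sequence numbering rules.
 */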
1852 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1853 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1854 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1855 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1856 TCP_SKB_CB(skb)->sacked = 0;
1857 TCP_SKB_CB(skb)->has_rxtstamp =
1858 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1865 int tcp_v4_rcv(struct sk_buff *skb)
1867 struct net *net = dev_net(skb->dev);
1868 struct sk_buff *skb_to_free;
1869 int sdif = inet_sdif(skb);
1870 int dif = inet_iif(skb);
1871 const struct iphdr *iph;
1872 const struct tcphdr *th;
1877 if (skb->pkt_type != PACKET_HOST)
1880 /* Count it even if it's bad */
1881 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1883 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1886 th = (const struct tcphdr *)skb->data;
1888 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1890 if (!pskb_may_pull(skb, th->doff * 4))
1893 /* An explanation is required here, I think.
1894 * Packet length and doff are validated by header prediction,
1895 * provided the case of th->doff == 0 is eliminated.
1896 * So, we defer the checks. */
1898 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1901 th = (const struct tcphdr *)skb->data;
1904 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1905 th->dest, sdif, &refcounted);
1910 if (sk->sk_state == TCP_TIME_WAIT)
1913 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1914 struct request_sock *req = inet_reqsk(sk);
1915 bool req_stolen = false;
1918 sk = req->rsk_listener;
1919 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1920 sk_drops_add(sk, skb);
1924 if (tcp_checksum_complete(skb)) {
1928 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1929 inet_csk_reqsk_queue_drop_and_put(sk, req);
1932 /* We own a reference on the listener, increase it again
1933 * as we might lose it too soon.
1938 if (!tcp_filter(sk, skb)) {
1939 th = (const struct tcphdr *)skb->data;
1941 tcp_v4_fill_cb(skb, iph, th);
1942 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1947 /* Another cpu got exclusive access to req
1948 * and created a full blown socket.
1949 * Try to feed this packet to this socket
1950 * instead of discarding it.
1952 tcp_v4_restore_cb(skb);
1956 goto discard_and_relse;
1960 tcp_v4_restore_cb(skb);
1961 } else if (tcp_child_process(sk, nsk, skb)) {
1962 tcp_v4_send_reset(nsk, skb);
1963 goto discard_and_relse;
1969 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1970 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1971 goto discard_and_relse;
1974 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1975 goto discard_and_relse;
1977 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1978 goto discard_and_relse;
1982 if (tcp_filter(sk, skb))
1983 goto discard_and_relse;
1984 th = (const struct tcphdr *)skb->data;
1986 tcp_v4_fill_cb(skb, iph, th);
1990 if (sk->sk_state == TCP_LISTEN) {
1991 ret = tcp_v4_do_rcv(sk, skb);
1992 goto put_and_return;
1995 sk_incoming_cpu_update(sk);
1997 bh_lock_sock_nested(sk);
1998 tcp_segs_in(tcp_sk(sk), skb);
2000 if (!sock_owned_by_user(sk)) {
2001 skb_to_free = sk->sk_rx_skb_cache;
2002 sk->sk_rx_skb_cache = NULL;
2003 ret = tcp_v4_do_rcv(sk, skb);
2005 if (tcp_add_backlog(sk, skb))
2006 goto discard_and_relse;
2011 __kfree_skb(skb_to_free);
2020 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2023 tcp_v4_fill_cb(skb, iph, th);
2025 if (tcp_checksum_complete(skb)) {
2027 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2029 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2031 tcp_v4_send_reset(NULL, skb);
2035 /* Discard frame. */
2040 sk_drops_add(sk, skb);
2046 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2047 inet_twsk_put(inet_twsk(sk));
2051 tcp_v4_fill_cb(skb, iph, th);
2053 if (tcp_checksum_complete(skb)) {
2054 inet_twsk_put(inet_twsk(sk));
2057 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2059 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2062 iph->saddr, th->source,
2063 iph->daddr, th->dest,
2067 inet_twsk_deschedule_put(inet_twsk(sk));
2069 tcp_v4_restore_cb(skb);
2077 tcp_v4_timewait_ack(sk, skb);
2080 tcp_v4_send_reset(sk, skb);
2081 inet_twsk_deschedule_put(inet_twsk(sk));
2083 case TCP_TW_SUCCESS:;
2088 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2089 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2090 .twsk_unique = tcp_twsk_unique,
2091 .twsk_destructor= tcp_twsk_destructor,
2094 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2096 struct dst_entry *dst = skb_dst(skb);
2098 if (dst && dst_hold_safe(dst)) {
2099 sk->sk_rx_dst = dst;
2100 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2103 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2105 const struct inet_connection_sock_af_ops ipv4_specific = {
2106 .queue_xmit = ip_queue_xmit,
2107 .send_check = tcp_v4_send_check,
2108 .rebuild_header = inet_sk_rebuild_header,
2109 .sk_rx_dst_set = inet_sk_rx_dst_set,
2110 .conn_request = tcp_v4_conn_request,
2111 .syn_recv_sock = tcp_v4_syn_recv_sock,
2112 .net_header_len = sizeof(struct iphdr),
2113 .setsockopt = ip_setsockopt,
2114 .getsockopt = ip_getsockopt,
2115 .addr2sockaddr = inet_csk_addr2sockaddr,
2116 .sockaddr_len = sizeof(struct sockaddr_in),
2117 #ifdef CONFIG_COMPAT
2118 .compat_setsockopt = compat_ip_setsockopt,
2119 .compat_getsockopt = compat_ip_getsockopt,
2121 .mtu_reduced = tcp_v4_mtu_reduced,
2123 EXPORT_SYMBOL(ipv4_specific);
2125 #ifdef CONFIG_TCP_MD5SIG
2126 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2127 .md5_lookup = tcp_v4_md5_lookup,
2128 .calc_md5_hash = tcp_v4_md5_hash_skb,
2129 .md5_parse = tcp_v4_parse_md5_keys,
2133 /* NOTE: A lot of things are set to zero explicitly by the call to
2134 * sk_alloc(), so they need not be done here.
2136 static int tcp_v4_init_sock(struct sock *sk)
2138 struct inet_connection_sock *icsk = inet_csk(sk);
2142 icsk->icsk_af_ops = &ipv4_specific;
2144 #ifdef CONFIG_TCP_MD5SIG
2145 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2151 void tcp_v4_destroy_sock(struct sock *sk)
2153 struct tcp_sock *tp = tcp_sk(sk);
2155 trace_tcp_destroy_sock(sk);
2157 tcp_clear_xmit_timers(sk);
2159 tcp_cleanup_congestion_control(sk);
2161 tcp_cleanup_ulp(sk);
2163 /* Clean up the write buffer. */
2164 tcp_write_queue_purge(sk);
2166 /* Check if we want to disable active TFO */
2167 tcp_fastopen_active_disable_ofo_check(sk);
2169 /* Cleans up our, hopefully empty, out_of_order_queue. */
2170 skb_rbtree_purge(&tp->out_of_order_queue);
2172 #ifdef CONFIG_TCP_MD5SIG
2173 /* Clean up the MD5 key list, if any */
2174 if (tp->md5sig_info) {
2175 tcp_clear_md5_list(sk);
2176 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2177 tp->md5sig_info = NULL;
2181 /* Clean up a referenced TCP bind bucket. */
2182 if (inet_csk(sk)->icsk_bind_hash)
2185 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2187 /* If socket is aborted during connect operation */
2188 tcp_free_fastopen_req(tp);
2189 tcp_fastopen_destroy_cipher(sk);
2190 tcp_saved_syn_free(tp);
2192 sk_sockets_allocated_dec(sk);
2194 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2196 #ifdef CONFIG_PROC_FS
2197 /* Proc filesystem TCP sock list dumping. */
2200 * Get the next listener socket following cur. If cur is NULL, get the first socket
2201 * starting from bucket given in st->bucket; when st->bucket is zero the
2202 * very first socket in the hash table is returned.
2204 static void *listening_get_next(struct seq_file *seq, void *cur)
2206 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2207 struct tcp_iter_state *st = seq->private;
2208 struct net *net = seq_file_net(seq);
2209 struct inet_listen_hashbucket *ilb;
2210 struct hlist_nulls_node *node;
2211 struct sock *sk = cur;
2215 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2216 spin_lock(&ilb->lock);
2217 sk = sk_nulls_head(&ilb->nulls_head);
2221 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2225 sk = sk_nulls_next(sk);
2227 sk_nulls_for_each_from(sk, node) {
2228 if (!net_eq(sock_net(sk), net))
2230 if (sk->sk_family == afinfo->family)
2233 spin_unlock(&ilb->lock);
2235 if (++st->bucket < INET_LHTABLE_SIZE)
2240 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2242 struct tcp_iter_state *st = seq->private;
2247 rc = listening_get_next(seq, NULL);
2249 while (rc && *pos) {
2250 rc = listening_get_next(seq, rc);
2256 static inline bool empty_bucket(const struct tcp_iter_state *st)
2258 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2262 * Get first established socket starting from bucket given in st->bucket.
2263 * If st->bucket is zero, the very first socket in the hash is returned.
2265 static void *established_get_first(struct seq_file *seq)
2267 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2268 struct tcp_iter_state *st = seq->private;
2269 struct net *net = seq_file_net(seq);
2273 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2275 struct hlist_nulls_node *node;
2276 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2278 /* Lockless fast path for the common case of empty buckets */
2279 if (empty_bucket(st))
2283 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2284 if (sk->sk_family != afinfo->family ||
2285 !net_eq(sock_net(sk), net)) {
2291 spin_unlock_bh(lock);
2297 static void *established_get_next(struct seq_file *seq, void *cur)
2299 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2300 struct sock *sk = cur;
2301 struct hlist_nulls_node *node;
2302 struct tcp_iter_state *st = seq->private;
2303 struct net *net = seq_file_net(seq);
2308 sk = sk_nulls_next(sk);
2310 sk_nulls_for_each_from(sk, node) {
2311 if (sk->sk_family == afinfo->family &&
2312 net_eq(sock_net(sk), net))
2316 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2318 return established_get_first(seq);
2321 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2323 struct tcp_iter_state *st = seq->private;
2327 rc = established_get_first(seq);
2330 rc = established_get_next(seq, rc);
2336 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2339 struct tcp_iter_state *st = seq->private;
2341 st->state = TCP_SEQ_STATE_LISTENING;
2342 rc = listening_get_idx(seq, &pos);
2345 st->state = TCP_SEQ_STATE_ESTABLISHED;
2346 rc = established_get_idx(seq, pos);
2352 static void *tcp_seek_last_pos(struct seq_file *seq)
2354 struct tcp_iter_state *st = seq->private;
2355 int offset = st->offset;
2356 int orig_num = st->num;
2359 switch (st->state) {
2360 case TCP_SEQ_STATE_LISTENING:
2361 if (st->bucket >= INET_LHTABLE_SIZE)
2363 st->state = TCP_SEQ_STATE_LISTENING;
2364 rc = listening_get_next(seq, NULL);
2365 while (offset-- && rc)
2366 rc = listening_get_next(seq, rc);
2370 st->state = TCP_SEQ_STATE_ESTABLISHED;
2372 case TCP_SEQ_STATE_ESTABLISHED:
2373 if (st->bucket > tcp_hashinfo.ehash_mask)
2375 rc = established_get_first(seq);
2376 while (offset-- && rc)
2377 rc = established_get_next(seq, rc);
2385 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2387 struct tcp_iter_state *st = seq->private;
2390 if (*pos && *pos == st->last_pos) {
2391 rc = tcp_seek_last_pos(seq);
2396 st->state = TCP_SEQ_STATE_LISTENING;
2400 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2403 st->last_pos = *pos;
2406 EXPORT_SYMBOL(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
EXPORT_SYMBOL(tcp_seq_stop);
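/*
 * These three hooks are driven by the seq_file core in the usual
 * start/show/next/.../stop pattern.  A rough userspace view of one pass over
 * /proc/net/tcp (illustrative only, not part of this file's API):
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");	// -> tcp_seq_start()
 *	char line[256];
 *	while (fgets(line, sizeof(line), f))	// -> show()/next() per entry
 *		fputs(line, stdout);
 *	fclose(f);				// -> tcp_seq_stop()
 *
 * st->last_pos allows the second and later read() calls of a large dump to
 * resume via tcp_seek_last_pos() rather than walking the tables again.
 */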
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
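/*
 * The "tr" (timer_active) field emitted above follows the long-documented
 * /proc/net/tcp convention: 0 - no timer pending, 1 - retransmit (or
 * loss-probe/RACK reorder) timer, 2 - another timer such as keepalive,
 * 3 - TIME_WAIT (printed by get_timewait4_sock() below), 4 - zero window
 * probe timer.
 */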
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
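/*
 * For reference, the dump produced by tcp4_seq_show() looks roughly like this
 * (values illustrative; a LISTEN socket on 127.0.0.1:8080 owned by uid 1000):
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses are the raw __be32 printed with %08X (hence 0100007F for
 * 127.0.0.1 on a little-endian host), ports are converted with ntohs() first,
 * and "0A" is TCP_LISTEN.
 */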
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}
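/*
 * proc_create_net_data() stores &tcp4_seq_afinfo as the proc entry's data;
 * the shared iterators above fetch it back with
 * PDE_DATA(file_inode(seq->file)) to know which address family to filter on.
 * The IPv6 code registers /proc/net/tcp6 the same way with AF_INET6.
 */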
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
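/*
 * tcp_prot is not registered in this file: af_inet.c's inet_init() calls
 * proto_register(&tcp_prot, 1) and then exposes it to socket(2) as the
 * SOCK_STREAM/IPPROTO_TCP entry of the inetsw[] protosw table.
 */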
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
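/*
 * Each of the sysctl_tcp_* defaults set above is exposed per network
 * namespace under /proc/sys/net/ipv4/.  For example, the tw_reuse default of
 * 2 means "reuse TIME-WAIT sockets for loopback connections only", which is
 * the policy tcp_twsk_unique() implements near the top of this file.
 */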
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}
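/*
 * Batched netns teardown: inet_twsk_purge() sweeps the established hash for
 * TIME-WAIT sockets belonging to the dying namespaces before the per-netns
 * TCP Fast Open contexts are dropped.
 */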
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}