[linux.git] / net / ipv4 / tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
40  *                                      year-long coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82
83 #include <crypto/hash.h>
84 #include <linux/scatterlist.h>
85
86 #include <trace/events/tcp.h>
87
88 #ifdef CONFIG_TCP_MD5SIG
89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
90                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 #endif
92
93 struct inet_hashinfo tcp_hashinfo;
94 EXPORT_SYMBOL(tcp_hashinfo);
95
96 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
97         .bh_lock = INIT_LOCAL_LOCK(bh_lock),
98 };
99
100 static DEFINE_MUTEX(tcp_exit_batch_mutex);
101
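/* Derive the initial sequence number and the timestamp offset for a
 * connection from the packet's address/port 4-tuple, using the keyed
 * secure_tcp_seq() and secure_tcp_ts_off() helpers.
 */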
102 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
103 {
104         return secure_tcp_seq(ip_hdr(skb)->daddr,
105                               ip_hdr(skb)->saddr,
106                               tcp_hdr(skb)->dest,
107                               tcp_hdr(skb)->source);
108 }
109
110 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
111 {
112         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
113 }
114
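/* Decide whether a connecting socket may reuse the port pair currently held
 * by a TIME-WAIT socket (see the tcp_tw_reuse sysctl). Returns 1 when reuse
 * is allowed; in that case a reference on the timewait socket has been taken.
 */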
115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
116 {
117         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
118         const struct inet_timewait_sock *tw = inet_twsk(sktw);
119         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
120         struct tcp_sock *tp = tcp_sk(sk);
121         int ts_recent_stamp;
122
123         if (reuse == 2) {
124                 /* Still does not detect *everything* that goes through
125                  * lo, since we require a loopback src or dst address
126                  * or direct binding to 'lo' interface.
127                  */
128                 bool loopback = false;
129                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
130                         loopback = true;
131 #if IS_ENABLED(CONFIG_IPV6)
132                 if (tw->tw_family == AF_INET6) {
133                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
134                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
135                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
136                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
137                                 loopback = true;
138                 } else
139 #endif
140                 {
141                         if (ipv4_is_loopback(tw->tw_daddr) ||
142                             ipv4_is_loopback(tw->tw_rcv_saddr))
143                                 loopback = true;
144                 }
145                 if (!loopback)
146                         reuse = 0;
147         }
148
149         /* With PAWS, it is safe from the viewpoint
150            of data integrity. Even without PAWS it is safe provided sequence
151            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
152
153            Actually, the idea is close to VJ's: only the timestamp cache is
154            held not per host but per port pair, and the TW bucket is used as
155            the state holder.
156
157            If the TW bucket has already been destroyed we fall back to VJ's
158            scheme and use the initial timestamp retrieved from the peer table.
159          */
160         ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
161         if (ts_recent_stamp &&
162             (!twp || (reuse && time_after32(ktime_get_seconds(),
163                                             ts_recent_stamp)))) {
164                 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
165                  * and releasing the bucket lock.
166                  */
167                 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
168                         return 0;
169
170                 /* In case of repair and re-using TIME-WAIT sockets we still
171                  * want to be sure that it is safe as above but honor the
172                  * sequence numbers and time stamps set as part of the repair
173                  * process.
174                  *
175                  * Without this check re-using a TIME-WAIT socket with TCP
176                  * repair would accumulate a -1 on the repair assigned
177                  * sequence number. The first time it is reused the sequence
178                  * is -1, the second time -2, etc. This fixes that issue
179                  * without appearing to create any others.
180                  */
181                 if (likely(!tp->repair)) {
182                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
183
184                         if (!seq)
185                                 seq = 1;
186                         WRITE_ONCE(tp->write_seq, seq);
187                         tp->rx_opt.ts_recent       = READ_ONCE(tcptw->tw_ts_recent);
188                         tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
189                 }
190
191                 return 1;
192         }
193
194         return 0;
195 }
196 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
197
198 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
199                               int addr_len)
200 {
201         /* This check is replicated from tcp_v4_connect() and intended to
202          * prevent BPF program called below from accessing bytes that are out
203          * of the bound specified by user in addr_len.
204          */
205         if (addr_len < sizeof(struct sockaddr_in))
206                 return -EINVAL;
207
208         sock_owned_by_me(sk);
209
210         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
211 }
212
213 /* This will initiate an outgoing connection. */
214 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
215 {
216         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
217         struct inet_timewait_death_row *tcp_death_row;
218         struct inet_sock *inet = inet_sk(sk);
219         struct tcp_sock *tp = tcp_sk(sk);
220         struct ip_options_rcu *inet_opt;
221         struct net *net = sock_net(sk);
222         __be16 orig_sport, orig_dport;
223         __be32 daddr, nexthop;
224         struct flowi4 *fl4;
225         struct rtable *rt;
226         int err;
227
228         if (addr_len < sizeof(struct sockaddr_in))
229                 return -EINVAL;
230
231         if (usin->sin_family != AF_INET)
232                 return -EAFNOSUPPORT;
233
234         nexthop = daddr = usin->sin_addr.s_addr;
235         inet_opt = rcu_dereference_protected(inet->inet_opt,
236                                              lockdep_sock_is_held(sk));
237         if (inet_opt && inet_opt->opt.srr) {
238                 if (!daddr)
239                         return -EINVAL;
240                 nexthop = inet_opt->opt.faddr;
241         }
242
243         orig_sport = inet->inet_sport;
244         orig_dport = usin->sin_port;
245         fl4 = &inet->cork.fl.u.ip4;
246         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
247                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
248                               orig_dport, sk);
249         if (IS_ERR(rt)) {
250                 err = PTR_ERR(rt);
251                 if (err == -ENETUNREACH)
252                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
253                 return err;
254         }
255
256         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
257                 ip_rt_put(rt);
258                 return -ENETUNREACH;
259         }
260
261         if (!inet_opt || !inet_opt->opt.srr)
262                 daddr = fl4->daddr;
263
264         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
265
266         if (!inet->inet_saddr) {
267                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
268                 if (err) {
269                         ip_rt_put(rt);
270                         return err;
271                 }
272         } else {
273                 sk_rcv_saddr_set(sk, inet->inet_saddr);
274         }
275
276         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
277                 /* Reset inherited state */
278                 tp->rx_opt.ts_recent       = 0;
279                 tp->rx_opt.ts_recent_stamp = 0;
280                 if (likely(!tp->repair))
281                         WRITE_ONCE(tp->write_seq, 0);
282         }
283
284         inet->inet_dport = usin->sin_port;
285         sk_daddr_set(sk, daddr);
286
287         inet_csk(sk)->icsk_ext_hdr_len = 0;
288         if (inet_opt)
289                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
290
291         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
292
293         /* Socket identity is still unknown (sport may be zero).
294          * However we set the state to SYN-SENT and, without releasing the
295          * socket lock, select a source port, enter ourselves into the hash
296          * tables and complete initialization after this.
297          */
298         tcp_set_state(sk, TCP_SYN_SENT);
299         err = inet_hash_connect(tcp_death_row, sk);
300         if (err)
301                 goto failure;
302
303         sk_set_txhash(sk);
304
305         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
306                                inet->inet_sport, inet->inet_dport, sk);
307         if (IS_ERR(rt)) {
308                 err = PTR_ERR(rt);
309                 rt = NULL;
310                 goto failure;
311         }
312         tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
313         /* OK, now commit destination to socket.  */
314         sk->sk_gso_type = SKB_GSO_TCPV4;
315         sk_setup_caps(sk, &rt->dst);
316         rt = NULL;
317
318         if (likely(!tp->repair)) {
319                 if (!tp->write_seq)
320                         WRITE_ONCE(tp->write_seq,
321                                    secure_tcp_seq(inet->inet_saddr,
322                                                   inet->inet_daddr,
323                                                   inet->inet_sport,
324                                                   usin->sin_port));
325                 WRITE_ONCE(tp->tsoffset,
326                            secure_tcp_ts_off(net, inet->inet_saddr,
327                                              inet->inet_daddr));
328         }
329
330         atomic_set(&inet->inet_id, get_random_u16());
331
332         if (tcp_fastopen_defer_connect(sk, &err))
333                 return err;
334         if (err)
335                 goto failure;
336
337         err = tcp_connect(sk);
338
339         if (err)
340                 goto failure;
341
342         return 0;
343
344 failure:
345         /*
346          * This unhashes the socket and releases the local port,
347          * if necessary.
348          */
349         tcp_set_state(sk, TCP_CLOSE);
350         inet_bhash2_reset_saddr(sk);
351         ip_rt_put(rt);
352         sk->sk_route_caps = 0;
353         inet->inet_dport = 0;
354         return err;
355 }
356 EXPORT_SYMBOL(tcp_v4_connect);
357
358 /*
359  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
360  * It can be called through tcp_release_cb() if socket was owned by user
361  * at the time tcp_v4_err() was called to handle ICMP message.
362  */
363 void tcp_v4_mtu_reduced(struct sock *sk)
364 {
365         struct inet_sock *inet = inet_sk(sk);
366         struct dst_entry *dst;
367         u32 mtu;
368
369         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
370                 return;
371         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
372         dst = inet_csk_update_pmtu(sk, mtu);
373         if (!dst)
374                 return;
375
376         /* Something is about to go wrong... Remember the soft error
377          * in case this connection is not able to recover.
378          */
379         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
380                 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
381
382         mtu = dst_mtu(dst);
383
384         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
385             ip_sk_accept_pmtu(sk) &&
386             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
387                 tcp_sync_mss(sk, mtu);
388
389                 /* Resend the TCP packet because it's
390                  * clear that the old packet has been
391                  * dropped. This is the new "fast" path mtu
392                  * discovery.
393                  */
394                 tcp_simple_retransmit(sk);
395         } /* else let the usual retransmit timer handle it */
396 }
397 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
398
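/* Hand an ICMP redirect to the route cached on the socket, if one is set. */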
399 static void do_redirect(struct sk_buff *skb, struct sock *sk)
400 {
401         struct dst_entry *dst = __sk_dst_check(sk, 0);
402
403         if (dst)
404                 dst->ops->redirect(dst, sk, skb);
405 }
406
407
408 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
409 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
410 {
411         struct request_sock *req = inet_reqsk(sk);
412         struct net *net = sock_net(sk);
413
414         /* ICMPs are not backlogged, hence we cannot get
415          * an established socket here.
416          */
417         if (seq != tcp_rsk(req)->snt_isn) {
418                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
419         } else if (abort) {
420                 /*
421                  * Still in SYN_RECV, just remove it silently.
422                  * There is no good way to pass the error to the newly
423                  * created socket, and POSIX does not want network
424                  * errors returned from accept().
425                  */
426                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
427                 tcp_listendrop(req->rsk_listener);
428         }
429         reqsk_put(req);
430 }
431 EXPORT_SYMBOL(tcp_req_err);
432
433 /* TCP-LD (RFC 6069) logic */
434 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
435 {
436         struct inet_connection_sock *icsk = inet_csk(sk);
437         struct tcp_sock *tp = tcp_sk(sk);
438         struct sk_buff *skb;
439         s32 remaining;
440         u32 delta_us;
441
442         if (sock_owned_by_user(sk))
443                 return;
444
445         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
446             !icsk->icsk_backoff)
447                 return;
448
449         skb = tcp_rtx_queue_head(sk);
450         if (WARN_ON_ONCE(!skb))
451                 return;
452
453         icsk->icsk_backoff--;
454         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
455         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
456
457         tcp_mstamp_refresh(tp);
458         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
459         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
460
461         if (remaining > 0) {
462                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
463                                           remaining, TCP_RTO_MAX);
464         } else {
465                 /* RTO revert clocked out retransmission.
466                  * Will retransmit now.
467                  */
468                 tcp_retransmit_timer(sk);
469         }
470 }
471 EXPORT_SYMBOL(tcp_ld_RTO_revert);
472
473 /*
474  * This routine is called by the ICMP module when it gets some
475  * sort of error condition.  If err < 0 then the socket should
476  * be closed and the error returned to the user.  If err > 0
477  * it's just the icmp type << 8 | icmp code.  After adjustment
478  * header points to the first 8 bytes of the tcp header.  We need
479  * to find the appropriate port.
480  *
481  * The locking strategy used here is very "optimistic". When
482  * someone else accesses the socket the ICMP is just dropped
483  * and for some paths there is no check at all.
484  * A more general error queue to queue errors for later handling
485  * is probably better.
486  *
487  */
488
489 int tcp_v4_err(struct sk_buff *skb, u32 info)
490 {
491         const struct iphdr *iph = (const struct iphdr *)skb->data;
492         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
493         struct tcp_sock *tp;
494         const int type = icmp_hdr(skb)->type;
495         const int code = icmp_hdr(skb)->code;
496         struct sock *sk;
497         struct request_sock *fastopen;
498         u32 seq, snd_una;
499         int err;
500         struct net *net = dev_net(skb->dev);
501
502         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
503                                        iph->daddr, th->dest, iph->saddr,
504                                        ntohs(th->source), inet_iif(skb), 0);
505         if (!sk) {
506                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
507                 return -ENOENT;
508         }
509         if (sk->sk_state == TCP_TIME_WAIT) {
510                 /* To increase the counter of ignored icmps for TCP-AO */
511                 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
512                 inet_twsk_put(inet_twsk(sk));
513                 return 0;
514         }
515         seq = ntohl(th->seq);
516         if (sk->sk_state == TCP_NEW_SYN_RECV) {
517                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
518                                      type == ICMP_TIME_EXCEEDED ||
519                                      (type == ICMP_DEST_UNREACH &&
520                                       (code == ICMP_NET_UNREACH ||
521                                        code == ICMP_HOST_UNREACH)));
522                 return 0;
523         }
524
525         if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
526                 sock_put(sk);
527                 return 0;
528         }
529
530         bh_lock_sock(sk);
531         /* If too many ICMPs get dropped on busy
532          * servers this needs to be solved differently.
533          * We do take care of the PMTU discovery (RFC1191) special case:
534          * we can receive locally generated ICMP messages while socket is held.
535          */
536         if (sock_owned_by_user(sk)) {
537                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
538                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
539         }
540         if (sk->sk_state == TCP_CLOSE)
541                 goto out;
542
543         if (static_branch_unlikely(&ip4_min_ttl)) {
544                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
545                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
546                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
547                         goto out;
548                 }
549         }
550
551         tp = tcp_sk(sk);
552         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
553         fastopen = rcu_dereference(tp->fastopen_rsk);
554         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
555         if (sk->sk_state != TCP_LISTEN &&
556             !between(seq, snd_una, tp->snd_nxt)) {
557                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
558                 goto out;
559         }
560
561         switch (type) {
562         case ICMP_REDIRECT:
563                 if (!sock_owned_by_user(sk))
564                         do_redirect(skb, sk);
565                 goto out;
566         case ICMP_SOURCE_QUENCH:
567                 /* Just silently ignore these. */
568                 goto out;
569         case ICMP_PARAMETERPROB:
570                 err = EPROTO;
571                 break;
572         case ICMP_DEST_UNREACH:
573                 if (code > NR_ICMP_UNREACH)
574                         goto out;
575
576                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
577                         /* We are not interested in TCP_LISTEN and open_requests
578                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
579                          * they should go through unfragmented).
580                          */
581                         if (sk->sk_state == TCP_LISTEN)
582                                 goto out;
583
584                         WRITE_ONCE(tp->mtu_info, info);
585                         if (!sock_owned_by_user(sk)) {
586                                 tcp_v4_mtu_reduced(sk);
587                         } else {
588                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
589                                         sock_hold(sk);
590                         }
591                         goto out;
592                 }
593
594                 err = icmp_err_convert[code].errno;
595                 /* check if this ICMP message allows revert of backoff.
596                  * (see RFC 6069)
597                  */
598                 if (!fastopen &&
599                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
600                         tcp_ld_RTO_revert(sk, seq);
601                 break;
602         case ICMP_TIME_EXCEEDED:
603                 err = EHOSTUNREACH;
604                 break;
605         default:
606                 goto out;
607         }
608
609         switch (sk->sk_state) {
610         case TCP_SYN_SENT:
611         case TCP_SYN_RECV:
612                 /* Only in fast or simultaneous open. If a fast open socket is
613                  * already accepted it is treated as a connected one below.
614                  */
615                 if (fastopen && !fastopen->sk)
616                         break;
617
618                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
619
620                 if (!sock_owned_by_user(sk))
621                         tcp_done_with_error(sk, err);
622                 else
623                         WRITE_ONCE(sk->sk_err_soft, err);
624                 goto out;
625         }
626
627         /* If we've already connected we will keep trying
628          * until we time out, or the user gives up.
629          *
630          * RFC 1122 4.2.3.9 allows us to consider as hard errors
631          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
632          * but it is obsoleted by pmtu discovery).
633          *
634          * Note that on the modern internet, where routing is unreliable
635          * and broken firewalls sit in every dark corner sending random
636          * errors ordered by their masters, even these two messages finally
637          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
638          *
639          * Now we are in compliance with RFCs.
640          *                                                      --ANK (980905)
641          */
642
643         if (!sock_owned_by_user(sk) &&
644             inet_test_bit(RECVERR, sk)) {
645                 WRITE_ONCE(sk->sk_err, err);
646                 sk_error_report(sk);
647         } else  { /* Only an error on timeout */
648                 WRITE_ONCE(sk->sk_err_soft, err);
649         }
650
651 out:
652         bh_unlock_sock(sk);
653         sock_put(sk);
654         return 0;
655 }
656
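/* Fill in the pseudo-header checksum and set csum_start/csum_offset so that
 * the device (or the software fallback) can finish the TCP checksum later.
 */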
657 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
658 {
659         struct tcphdr *th = tcp_hdr(skb);
660
661         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
662         skb->csum_start = skb_transport_header(skb) - skb->head;
663         skb->csum_offset = offsetof(struct tcphdr, check);
664 }
665
666 /* This routine computes an IPv4 TCP checksum. */
667 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
668 {
669         const struct inet_sock *inet = inet_sk(sk);
670
671         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
672 }
673 EXPORT_SYMBOL(tcp_v4_send_check);
674
675 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
676
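/* Add a TCP-AO signature option to an RST being built by tcp_v4_send_reset().
 * Returns true when the RST should be dropped instead of sent.
 */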
677 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
678                                  const struct tcp_ao_hdr *aoh,
679                                  struct ip_reply_arg *arg, struct tcphdr *reply,
680                                  __be32 reply_options[REPLY_OPTIONS_LEN])
681 {
682 #ifdef CONFIG_TCP_AO
683         int sdif = tcp_v4_sdif(skb);
684         int dif = inet_iif(skb);
685         int l3index = sdif ? dif : 0;
686         bool allocated_traffic_key;
687         struct tcp_ao_key *key;
688         char *traffic_key;
689         bool drop = true;
690         u32 ao_sne = 0;
691         u8 keyid;
692
693         rcu_read_lock();
694         if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
695                                  &key, &traffic_key, &allocated_traffic_key,
696                                  &keyid, &ao_sne))
697                 goto out;
698
699         reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
700                                  (aoh->rnext_keyid << 8) | keyid);
701         arg->iov[0].iov_len += tcp_ao_len_aligned(key);
702         reply->doff = arg->iov[0].iov_len / 4;
703
704         if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
705                             key, traffic_key,
706                             (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
707                             (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
708                             reply, ao_sne))
709                 goto out;
710         drop = false;
711 out:
712         rcu_read_unlock();
713         if (allocated_traffic_key)
714                 kfree(traffic_key);
715         return drop;
716 #else
717         return true;
718 #endif
719 }
720
721 /*
722  *      This routine will send an RST to the other tcp.
723  *
724  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
725  *                    for the reset?
726  *      Answer: if a packet caused an RST, it is not for a socket
727  *              existing in our system; if it is matched to a socket,
728  *              it is just a duplicate segment or a bug in the other side's TCP.
729  *              So we build the reply based only on the parameters
730  *              that arrived with the segment.
731  *      Exception: precedence violation. We do not implement it in any case.
732  */
733
734 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
735                               enum sk_rst_reason reason)
736 {
737         const struct tcphdr *th = tcp_hdr(skb);
738         struct {
739                 struct tcphdr th;
740                 __be32 opt[REPLY_OPTIONS_LEN];
741         } rep;
742         const __u8 *md5_hash_location = NULL;
743         const struct tcp_ao_hdr *aoh;
744         struct ip_reply_arg arg;
745 #ifdef CONFIG_TCP_MD5SIG
746         struct tcp_md5sig_key *key = NULL;
747         unsigned char newhash[16];
748         struct sock *sk1 = NULL;
749         int genhash;
750 #endif
751         u64 transmit_time = 0;
752         struct sock *ctl_sk;
753         struct net *net;
754         u32 txhash = 0;
755
756         /* Never send a reset in response to a reset. */
757         if (th->rst)
758                 return;
759
760         /* If sk is not NULL, it means we did a successful lookup and the
761          * incoming route had to be correct. prequeue might have dropped our dst.
762          */
763         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
764                 return;
765
766         /* Swap the send and the receive. */
767         memset(&rep, 0, sizeof(rep));
768         rep.th.dest   = th->source;
769         rep.th.source = th->dest;
770         rep.th.doff   = sizeof(struct tcphdr) / 4;
771         rep.th.rst    = 1;
772
773         if (th->ack) {
774                 rep.th.seq = th->ack_seq;
775         } else {
776                 rep.th.ack = 1;
777                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
778                                        skb->len - (th->doff << 2));
779         }
780
781         memset(&arg, 0, sizeof(arg));
782         arg.iov[0].iov_base = (unsigned char *)&rep;
783         arg.iov[0].iov_len  = sizeof(rep.th);
784
785         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
786
787         /* Invalid TCP option size or twice included auth */
788         if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
789                 return;
790
791         if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
792                 return;
793
794 #ifdef CONFIG_TCP_MD5SIG
795         rcu_read_lock();
796         if (sk && sk_fullsock(sk)) {
797                 const union tcp_md5_addr *addr;
798                 int l3index;
799
800                 /* sdif set, means packet ingressed via a device
801                  * in an L3 domain and inet_iif is set to it.
802                  */
803                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
804                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
805                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
806         } else if (md5_hash_location) {
807                 const union tcp_md5_addr *addr;
808                 int sdif = tcp_v4_sdif(skb);
809                 int dif = inet_iif(skb);
810                 int l3index;
811
812                 /*
813                  * The active side is lost. Try to find the listening socket
814                  * through the source port, and then find the md5 key through
815                  * that listening socket. We do not lose any security here:
816                  * the incoming packet is checked against the md5 hash of the
817                  * key we find, and no RST is generated if the hash doesn't match.
818                  */
819                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
820                                              NULL, 0, ip_hdr(skb)->saddr,
821                                              th->source, ip_hdr(skb)->daddr,
822                                              ntohs(th->source), dif, sdif);
823                 /* don't send rst if it can't find key */
824                 if (!sk1)
825                         goto out;
826
827                 /* sdif set, means packet ingressed via a device
828                  * in an L3 domain and dif is set to it.
829                  */
830                 l3index = sdif ? dif : 0;
831                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
832                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
833                 if (!key)
834                         goto out;
835
836
837                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
838                 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
839                         goto out;
840
841         }
842
843         if (key) {
844                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
845                                    (TCPOPT_NOP << 16) |
846                                    (TCPOPT_MD5SIG << 8) |
847                                    TCPOLEN_MD5SIG);
848                 /* Update length and the length the header thinks exists */
849                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
850                 rep.th.doff = arg.iov[0].iov_len / 4;
851
852                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
853                                      key, ip_hdr(skb)->saddr,
854                                      ip_hdr(skb)->daddr, &rep.th);
855         }
856 #endif
857         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
858         if (rep.opt[0] == 0) {
859                 __be32 mrst = mptcp_reset_option(skb);
860
861                 if (mrst) {
862                         rep.opt[0] = mrst;
863                         arg.iov[0].iov_len += sizeof(mrst);
864                         rep.th.doff = arg.iov[0].iov_len / 4;
865                 }
866         }
867
868         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
869                                       ip_hdr(skb)->saddr, /* XXX */
870                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
871         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
872         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
873
874         /* When the socket is gone, all binding information is lost and
875          * routing might fail in this case. No choice here: if we force the
876          * input interface, we will misroute in case of an asymmetric route.
877          */
878         if (sk)
879                 arg.bound_dev_if = sk->sk_bound_dev_if;
880
881         trace_tcp_send_reset(sk, skb, reason);
882
883         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
884                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
885
886         arg.tos = ip_hdr(skb)->tos;
887         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
888         local_bh_disable();
889         local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
890         ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
891
892         sock_net_set(ctl_sk, net);
893         if (sk) {
894                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
896                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897                                    inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
898                 transmit_time = tcp_transmit_time(sk);
899                 xfrm_sk_clone_policy(ctl_sk, sk);
900                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
901                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
902         } else {
903                 ctl_sk->sk_mark = 0;
904                 ctl_sk->sk_priority = 0;
905         }
906         ip_send_unicast_reply(ctl_sk,
907                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
908                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
909                               &arg, arg.iov[0].iov_len,
910                               transmit_time, txhash);
911
912         xfrm_sk_free_policy(ctl_sk);
913         sock_net_set(ctl_sk, &init_net);
914         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
915         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
916         local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
917         local_bh_enable();
918
919 #ifdef CONFIG_TCP_MD5SIG
920 out:
921         rcu_read_unlock();
922 #endif
923 }
924
925 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
926    outside socket context, is certainly ugly. What can I do?
927  */
928
929 static void tcp_v4_send_ack(const struct sock *sk,
930                             struct sk_buff *skb, u32 seq, u32 ack,
931                             u32 win, u32 tsval, u32 tsecr, int oif,
932                             struct tcp_key *key,
933                             int reply_flags, u8 tos, u32 txhash)
934 {
935         const struct tcphdr *th = tcp_hdr(skb);
936         struct {
937                 struct tcphdr th;
938                 __be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
939         } rep;
940         struct net *net = sock_net(sk);
941         struct ip_reply_arg arg;
942         struct sock *ctl_sk;
943         u64 transmit_time;
944
945         memset(&rep.th, 0, sizeof(struct tcphdr));
946         memset(&arg, 0, sizeof(arg));
947
948         arg.iov[0].iov_base = (unsigned char *)&rep;
949         arg.iov[0].iov_len  = sizeof(rep.th);
950         if (tsecr) {
951                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
952                                    (TCPOPT_TIMESTAMP << 8) |
953                                    TCPOLEN_TIMESTAMP);
954                 rep.opt[1] = htonl(tsval);
955                 rep.opt[2] = htonl(tsecr);
956                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
957         }
958
959         /* Swap the send and the receive. */
960         rep.th.dest    = th->source;
961         rep.th.source  = th->dest;
962         rep.th.doff    = arg.iov[0].iov_len / 4;
963         rep.th.seq     = htonl(seq);
964         rep.th.ack_seq = htonl(ack);
965         rep.th.ack     = 1;
966         rep.th.window  = htons(win);
967
968 #ifdef CONFIG_TCP_MD5SIG
969         if (tcp_key_is_md5(key)) {
970                 int offset = (tsecr) ? 3 : 0;
971
972                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
973                                           (TCPOPT_NOP << 16) |
974                                           (TCPOPT_MD5SIG << 8) |
975                                           TCPOLEN_MD5SIG);
976                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
977                 rep.th.doff = arg.iov[0].iov_len/4;
978
979                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
980                                     key->md5_key, ip_hdr(skb)->saddr,
981                                     ip_hdr(skb)->daddr, &rep.th);
982         }
983 #endif
984 #ifdef CONFIG_TCP_AO
985         if (tcp_key_is_ao(key)) {
986                 int offset = (tsecr) ? 3 : 0;
987
988                 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
989                                           (tcp_ao_len(key->ao_key) << 16) |
990                                           (key->ao_key->sndid << 8) |
991                                           key->rcv_next);
992                 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
993                 rep.th.doff = arg.iov[0].iov_len / 4;
994
995                 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
996                                 key->ao_key, key->traffic_key,
997                                 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
998                                 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
999                                 &rep.th, key->sne);
1000         }
1001 #endif
1002         arg.flags = reply_flags;
1003         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1004                                       ip_hdr(skb)->saddr, /* XXX */
1005                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1006         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1007         if (oif)
1008                 arg.bound_dev_if = oif;
1009         arg.tos = tos;
1010         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1011         local_bh_disable();
1012         local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1013         ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1014         sock_net_set(ctl_sk, net);
1015         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1016                            inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1017         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1018                            inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1019         transmit_time = tcp_transmit_time(sk);
1020         ip_send_unicast_reply(ctl_sk,
1021                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
1022                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1023                               &arg, arg.iov[0].iov_len,
1024                               transmit_time, txhash);
1025
1026         sock_net_set(ctl_sk, &init_net);
1027         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1028         local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1029         local_bh_enable();
1030 }
1031
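/* ACK a segment that arrived for a TIME-WAIT socket, signing the ACK with
 * TCP-AO or MD5 when a matching key exists, then release the timewait
 * reference.
 */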
1032 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1033 {
1034         struct inet_timewait_sock *tw = inet_twsk(sk);
1035         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1036         struct tcp_key key = {};
1037 #ifdef CONFIG_TCP_AO
1038         struct tcp_ao_info *ao_info;
1039
1040         if (static_branch_unlikely(&tcp_ao_needed.key)) {
1041                 /* FIXME: the segment to-be-acked is not verified yet */
1042                 ao_info = rcu_dereference(tcptw->ao_info);
1043                 if (ao_info) {
1044                         const struct tcp_ao_hdr *aoh;
1045
1046                         if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1047                                 inet_twsk_put(tw);
1048                                 return;
1049                         }
1050
1051                         if (aoh)
1052                                 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1053                 }
1054         }
1055         if (key.ao_key) {
1056                 struct tcp_ao_key *rnext_key;
1057
1058                 key.traffic_key = snd_other_key(key.ao_key);
1059                 key.sne = READ_ONCE(ao_info->snd_sne);
1060                 rnext_key = READ_ONCE(ao_info->rnext_key);
1061                 key.rcv_next = rnext_key->rcvid;
1062                 key.type = TCP_KEY_AO;
1063 #else
1064         if (0) {
1065 #endif
1066         } else if (static_branch_tcp_md5()) {
1067                 key.md5_key = tcp_twsk_md5_key(tcptw);
1068                 if (key.md5_key)
1069                         key.type = TCP_KEY_MD5;
1070         }
1071
1072         tcp_v4_send_ack(sk, skb,
1073                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1074                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1075                         tcp_tw_tsval(tcptw),
1076                         READ_ONCE(tcptw->tw_ts_recent),
1077                         tw->tw_bound_dev_if, &key,
1078                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1079                         tw->tw_tos,
1080                         tw->tw_txhash);
1081
1082         inet_twsk_put(tw);
1083 }
1084
1085 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1086                                   struct request_sock *req)
1087 {
1088         struct tcp_key key = {};
1089
1090         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1091          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1092          */
1093         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1094                                              tcp_sk(sk)->snd_nxt;
1095
1096 #ifdef CONFIG_TCP_AO
1097         if (static_branch_unlikely(&tcp_ao_needed.key) &&
1098             tcp_rsk_used_ao(req)) {
1099                 const union tcp_md5_addr *addr;
1100                 const struct tcp_ao_hdr *aoh;
1101                 int l3index;
1102
1103                 /* Invalid TCP option size or twice included auth */
1104                 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1105                         return;
1106                 if (!aoh)
1107                         return;
1108
1109                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1110                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1111                 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1112                                               aoh->rnext_keyid, -1);
1113                 if (unlikely(!key.ao_key)) {
1114                         /* Send ACK with any matching MKT for the peer */
1115                         key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1116                         /* Matching key disappeared (user removed the key?);
1117                          * let the handshake time out.
1118                          */
1119                         if (!key.ao_key) {
1120                                 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1121                                                      addr,
1122                                                      ntohs(tcp_hdr(skb)->source),
1123                                                      &ip_hdr(skb)->daddr,
1124                                                      ntohs(tcp_hdr(skb)->dest));
1125                                 return;
1126                         }
1127                 }
1128                 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1129                 if (!key.traffic_key)
1130                         return;
1131
1132                 key.type = TCP_KEY_AO;
1133                 key.rcv_next = aoh->keyid;
1134                 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1135 #else
1136         if (0) {
1137 #endif
1138         } else if (static_branch_tcp_md5()) {
1139                 const union tcp_md5_addr *addr;
1140                 int l3index;
1141
1142                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1143                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1144                 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1145                 if (key.md5_key)
1146                         key.type = TCP_KEY_MD5;
1147         }
1148
1149         tcp_v4_send_ack(sk, skb, seq,
1150                         tcp_rsk(req)->rcv_nxt,
1151                         tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1152                         tcp_rsk_tsval(tcp_rsk(req)),
1153                         READ_ONCE(req->ts_recent),
1154                         0, &key,
1155                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1156                         ip_hdr(skb)->tos,
1157                         READ_ONCE(tcp_rsk(req)->txhash));
1158         if (tcp_key_is_ao(&key))
1159                 kfree(key.traffic_key);
1160 }
1161
1162 /*
1163  *      Send a SYN-ACK after having received a SYN.
1164  *      This still operates on a request_sock only, not on a big
1165  *      socket.
1166  */
1167 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1168                               struct flowi *fl,
1169                               struct request_sock *req,
1170                               struct tcp_fastopen_cookie *foc,
1171                               enum tcp_synack_type synack_type,
1172                               struct sk_buff *syn_skb)
1173 {
1174         const struct inet_request_sock *ireq = inet_rsk(req);
1175         struct flowi4 fl4;
1176         int err = -1;
1177         struct sk_buff *skb;
1178         u8 tos;
1179
1180         /* First, grab a route. */
1181         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1182                 return -1;
1183
1184         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1185
1186         if (skb) {
1187                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1188
1189                 tos = READ_ONCE(inet_sk(sk)->tos);
1190
1191                 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1192                         tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1193                               (tos & INET_ECN_MASK);
1194
1195                 if (!INET_ECN_is_capable(tos) &&
1196                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1197                         tos |= INET_ECN_ECT_0;
1198
1199                 rcu_read_lock();
1200                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1201                                             ireq->ir_rmt_addr,
1202                                             rcu_dereference(ireq->ireq_opt),
1203                                             tos);
1204                 rcu_read_unlock();
1205                 err = net_xmit_eval(err);
1206         }
1207
1208         return err;
1209 }
1210
1211 /*
1212  *      IPv4 request_sock destructor.
1213  */
1214 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1215 {
1216         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1217 }
1218
1219 #ifdef CONFIG_TCP_MD5SIG
1220 /*
1221  * RFC2385 MD5 checksumming requires a mapping of
1222  * IP address->MD5 Key.
1223  * We need to maintain these in the sk structure.
1224  */
1225
1226 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1227 EXPORT_SYMBOL(tcp_md5_needed);
1228
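/* A key bound to an L3 device is preferred over one that is not; otherwise
 * the key with the longer prefix wins.
 */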
1229 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1230 {
1231         if (!old)
1232                 return true;
1233
1234         /* l3index always overrides non-l3index */
1235         if (old->l3index && new->l3index == 0)
1236                 return false;
1237         if (old->l3index == 0 && new->l3index)
1238                 return true;
1239
1240         return old->prefixlen < new->prefixlen;
1241 }
1242
1243 /* Find the Key structure for an address.  */
1244 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1245                                            const union tcp_md5_addr *addr,
1246                                            int family, bool any_l3index)
1247 {
1248         const struct tcp_sock *tp = tcp_sk(sk);
1249         struct tcp_md5sig_key *key;
1250         const struct tcp_md5sig_info *md5sig;
1251         __be32 mask;
1252         struct tcp_md5sig_key *best_match = NULL;
1253         bool match;
1254
1255         /* caller either holds rcu_read_lock() or socket lock */
1256         md5sig = rcu_dereference_check(tp->md5sig_info,
1257                                        lockdep_sock_is_held(sk));
1258         if (!md5sig)
1259                 return NULL;
1260
1261         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1262                                  lockdep_sock_is_held(sk)) {
1263                 if (key->family != family)
1264                         continue;
1265                 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1266                     key->l3index != l3index)
1267                         continue;
1268                 if (family == AF_INET) {
1269                         mask = inet_make_mask(key->prefixlen);
1270                         match = (key->addr.a4.s_addr & mask) ==
1271                                 (addr->a4.s_addr & mask);
1272 #if IS_ENABLED(CONFIG_IPV6)
1273                 } else if (family == AF_INET6) {
1274                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1275                                                   key->prefixlen);
1276 #endif
1277                 } else {
1278                         match = false;
1279                 }
1280
1281                 if (match && better_md5_match(best_match, key))
1282                         best_match = key;
1283         }
1284         return best_match;
1285 }
1286 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1287
1288 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1289                                                       const union tcp_md5_addr *addr,
1290                                                       int family, u8 prefixlen,
1291                                                       int l3index, u8 flags)
1292 {
1293         const struct tcp_sock *tp = tcp_sk(sk);
1294         struct tcp_md5sig_key *key;
1295         unsigned int size = sizeof(struct in_addr);
1296         const struct tcp_md5sig_info *md5sig;
1297
1298         /* caller either holds rcu_read_lock() or socket lock */
1299         md5sig = rcu_dereference_check(tp->md5sig_info,
1300                                        lockdep_sock_is_held(sk));
1301         if (!md5sig)
1302                 return NULL;
1303 #if IS_ENABLED(CONFIG_IPV6)
1304         if (family == AF_INET6)
1305                 size = sizeof(struct in6_addr);
1306 #endif
1307         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1308                                  lockdep_sock_is_held(sk)) {
1309                 if (key->family != family)
1310                         continue;
1311                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1312                         continue;
1313                 if (key->l3index != l3index)
1314                         continue;
1315                 if (!memcmp(&key->addr, addr, size) &&
1316                     key->prefixlen == prefixlen)
1317                         return key;
1318         }
1319         return NULL;
1320 }
1321
1322 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1323                                          const struct sock *addr_sk)
1324 {
1325         const union tcp_md5_addr *addr;
1326         int l3index;
1327
1328         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1329                                                  addr_sk->sk_bound_dev_if);
1330         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1331         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1332 }
1333 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1334
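/* Allocate the per-socket MD5 key list head; GSO is disabled since segments
 * carrying MD5 signatures must be built by the stack itself.
 */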
1335 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1336 {
1337         struct tcp_sock *tp = tcp_sk(sk);
1338         struct tcp_md5sig_info *md5sig;
1339
1340         md5sig = kmalloc(sizeof(*md5sig), gfp);
1341         if (!md5sig)
1342                 return -ENOMEM;
1343
1344         sk_gso_disable(sk);
1345         INIT_HLIST_HEAD(&md5sig->head);
1346         rcu_assign_pointer(tp->md5sig_info, md5sig);
1347         return 0;
1348 }
1349
1350 /* This can be called on a newly created socket, from other files */
1351 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1352                             int family, u8 prefixlen, int l3index, u8 flags,
1353                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1354 {
1355         /* Add Key to the list */
1356         struct tcp_md5sig_key *key;
1357         struct tcp_sock *tp = tcp_sk(sk);
1358         struct tcp_md5sig_info *md5sig;
1359
1360         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1361         if (key) {
1362                 /* Pre-existing entry - just update that one.
1363                  * Note that the key might be used concurrently.
1364                  * data_race() tells KCSAN that we do not care about
1365                  * key mismatches, since changing the MD5 key on live
1366                  * flows can lead to packet drops.
1367                  */
1368                 data_race(memcpy(key->key, newkey, newkeylen));
1369
1370                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1371                  * Also note that a reader could observe the new key->keylen
1372                  * value but the old key->key[]; this is why we use __GFP_ZERO
1373                  * at sock_kmalloc() time below these lines.
1374                  */
1375                 WRITE_ONCE(key->keylen, newkeylen);
1376
1377                 return 0;
1378         }
1379
1380         md5sig = rcu_dereference_protected(tp->md5sig_info,
1381                                            lockdep_sock_is_held(sk));
1382
1383         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1384         if (!key)
1385                 return -ENOMEM;
1386
1387         memcpy(key->key, newkey, newkeylen);
1388         key->keylen = newkeylen;
1389         key->family = family;
1390         key->prefixlen = prefixlen;
1391         key->l3index = l3index;
1392         key->flags = flags;
1393         memcpy(&key->addr, addr,
1394                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1395                                                                  sizeof(struct in_addr));
1396         hlist_add_head_rcu(&key->node, &md5sig->head);
1397         return 0;
1398 }
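/* Editor's sketch (assumption, not part of the original file): the reader that
 * pairs with the WRITE_ONCE()/data_race() above is tcp_md5_hash_key(), roughly:
 *
 *	u8 keylen = READ_ONCE(key->keylen);	// may already observe the new length
 *	sg_init_one(&sg, key->key, keylen);	// ...while key->key[] still holds the old key
 *
 * Because the key was allocated with __GFP_ZERO, a reader racing with an
 * update at worst hashes stale or zeroed trailing bytes, never uninitialized
 * memory.
 */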
1399
1400 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1401                    int family, u8 prefixlen, int l3index, u8 flags,
1402                    const u8 *newkey, u8 newkeylen)
1403 {
1404         struct tcp_sock *tp = tcp_sk(sk);
1405
1406         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1407                 if (tcp_md5_alloc_sigpool())
1408                         return -ENOMEM;
1409
1410                 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1411                         tcp_md5_release_sigpool();
1412                         return -ENOMEM;
1413                 }
1414
1415                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1416                         struct tcp_md5sig_info *md5sig;
1417
1418                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1419                         rcu_assign_pointer(tp->md5sig_info, NULL);
1420                         kfree_rcu(md5sig, rcu);
1421                         tcp_md5_release_sigpool();
1422                         return -EUSERS;
1423                 }
1424         }
1425
1426         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1427                                 newkey, newkeylen, GFP_KERNEL);
1428 }
1429 EXPORT_SYMBOL(tcp_md5_do_add);
1430
1431 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1432                      int family, u8 prefixlen, int l3index,
1433                      struct tcp_md5sig_key *key)
1434 {
1435         struct tcp_sock *tp = tcp_sk(sk);
1436
1437         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1438                 tcp_md5_add_sigpool();
1439
1440                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1441                         tcp_md5_release_sigpool();
1442                         return -ENOMEM;
1443                 }
1444
1445                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1446                         struct tcp_md5sig_info *md5sig;
1447
1448                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1449                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1450                         rcu_assign_pointer(tp->md5sig_info, NULL);
1451                         kfree_rcu(md5sig, rcu);
1452                         tcp_md5_release_sigpool();
1453                         return -EUSERS;
1454                 }
1455         }
1456
1457         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1458                                 key->flags, key->key, key->keylen,
1459                                 sk_gfp_mask(sk, GFP_ATOMIC));
1460 }
1461 EXPORT_SYMBOL(tcp_md5_key_copy);
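/* Editor's note (sketch): unlike tcp_md5_do_add() above, which runs from
 * setsockopt() in process context and may sleep, tcp_md5_key_copy() is called
 * while a child socket is being created in softirq context (see
 * tcp_v4_syn_recv_sock() below), hence the GFP_ATOMIC allocation and the
 * non-sleeping static_key_fast_inc_not_disabled() instead of
 * static_branch_inc().
 */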
1462
1463 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1464                    u8 prefixlen, int l3index, u8 flags)
1465 {
1466         struct tcp_md5sig_key *key;
1467
1468         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1469         if (!key)
1470                 return -ENOENT;
1471         hlist_del_rcu(&key->node);
1472         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1473         kfree_rcu(key, rcu);
1474         return 0;
1475 }
1476 EXPORT_SYMBOL(tcp_md5_do_del);
1477
1478 void tcp_clear_md5_list(struct sock *sk)
1479 {
1480         struct tcp_sock *tp = tcp_sk(sk);
1481         struct tcp_md5sig_key *key;
1482         struct hlist_node *n;
1483         struct tcp_md5sig_info *md5sig;
1484
1485         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1486
1487         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1488                 hlist_del_rcu(&key->node);
1489                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1490                 kfree_rcu(key, rcu);
1491         }
1492 }
1493
1494 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1495                                  sockptr_t optval, int optlen)
1496 {
1497         struct tcp_md5sig cmd;
1498         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1499         const union tcp_md5_addr *addr;
1500         u8 prefixlen = 32;
1501         int l3index = 0;
1502         bool l3flag;
1503         u8 flags;
1504
1505         if (optlen < sizeof(cmd))
1506                 return -EINVAL;
1507
1508         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1509                 return -EFAULT;
1510
1511         if (sin->sin_family != AF_INET)
1512                 return -EINVAL;
1513
1514         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1515         l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1516
1517         if (optname == TCP_MD5SIG_EXT &&
1518             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1519                 prefixlen = cmd.tcpm_prefixlen;
1520                 if (prefixlen > 32)
1521                         return -EINVAL;
1522         }
1523
1524         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1525             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1526                 struct net_device *dev;
1527
1528                 rcu_read_lock();
1529                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1530                 if (dev && netif_is_l3_master(dev))
1531                         l3index = dev->ifindex;
1532
1533                 rcu_read_unlock();
1534
1535                 /* ok to check dev/l3index outside of the RCU section;
1536                  * right now the device MUST be an L3 master
1537                  */
1538                 if (!dev || !l3index)
1539                         return -EINVAL;
1540         }
1541
1542         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1543
1544         if (!cmd.tcpm_keylen)
1545                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1546
1547         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1548                 return -EINVAL;
1549
1550         /* Don't allow keys for peers that have a matching TCP-AO key.
1551          * See the comment in tcp_ao_add_cmd()
1552          */
1553         if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1554                 return -EKEYREJECTED;
1555
1556         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1557                               cmd.tcpm_key, cmd.tcpm_keylen);
1558 }
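/* Editor's usage sketch (hypothetical userspace code, not part of this file):
 * this parser is reached from userspace roughly as follows:
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	a->sin_addr.s_addr = peer_ip;		// hypothetical peer address
 *	md5.tcpm_keylen = key_len;		// <= TCP_MD5SIG_MAXKEYLEN
 *	memcpy(md5.tcpm_key, key_bytes, key_len);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT with TCP_MD5SIG_FLAG_PREFIX additionally honours
 * tcpm_prefixlen, and a zero tcpm_keylen deletes the matching key via
 * tcp_md5_do_del() above.
 */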
1559
1560 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1561                                    __be32 daddr, __be32 saddr,
1562                                    const struct tcphdr *th, int nbytes)
1563 {
1564         struct tcp4_pseudohdr *bp;
1565         struct scatterlist sg;
1566         struct tcphdr *_th;
1567
1568         bp = hp->scratch;
1569         bp->saddr = saddr;
1570         bp->daddr = daddr;
1571         bp->pad = 0;
1572         bp->protocol = IPPROTO_TCP;
1573         bp->len = cpu_to_be16(nbytes);
1574
1575         _th = (struct tcphdr *)(bp + 1);
1576         memcpy(_th, th, sizeof(*th));
1577         _th->check = 0;
1578
1579         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1580         ahash_request_set_crypt(hp->req, &sg, NULL,
1581                                 sizeof(*bp) + sizeof(*th));
1582         return crypto_ahash_update(hp->req);
1583 }
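/* Editor's note: per RFC 2385 the digest covers, in order, the IPv4
 * pseudo-header built above (saddr, daddr, zero pad, protocol, TCP length),
 * the fixed TCP header with its checksum zeroed, the segment payload (when
 * present) and finally the key itself; the payload and key are fed in by the
 * callers below via tcp_sigpool_hash_skb_data() and tcp_md5_hash_key().
 */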
1584
1585 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1586                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1587 {
1588         struct tcp_sigpool hp;
1589
1590         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1591                 goto clear_hash_nostart;
1592
1593         if (crypto_ahash_init(hp.req))
1594                 goto clear_hash;
1595         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1596                 goto clear_hash;
1597         if (tcp_md5_hash_key(&hp, key))
1598                 goto clear_hash;
1599         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1600         if (crypto_ahash_final(hp.req))
1601                 goto clear_hash;
1602
1603         tcp_sigpool_end(&hp);
1604         return 0;
1605
1606 clear_hash:
1607         tcp_sigpool_end(&hp);
1608 clear_hash_nostart:
1609         memset(md5_hash, 0, 16);
1610         return 1;
1611 }
1612
1613 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1614                         const struct sock *sk,
1615                         const struct sk_buff *skb)
1616 {
1617         const struct tcphdr *th = tcp_hdr(skb);
1618         struct tcp_sigpool hp;
1619         __be32 saddr, daddr;
1620
1621         if (sk) { /* valid for establish/request sockets */
1622                 saddr = sk->sk_rcv_saddr;
1623                 daddr = sk->sk_daddr;
1624         } else {
1625                 const struct iphdr *iph = ip_hdr(skb);
1626                 saddr = iph->saddr;
1627                 daddr = iph->daddr;
1628         }
1629
1630         if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1631                 goto clear_hash_nostart;
1632
1633         if (crypto_ahash_init(hp.req))
1634                 goto clear_hash;
1635
1636         if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1637                 goto clear_hash;
1638         if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1639                 goto clear_hash;
1640         if (tcp_md5_hash_key(&hp, key))
1641                 goto clear_hash;
1642         ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1643         if (crypto_ahash_final(hp.req))
1644                 goto clear_hash;
1645
1646         tcp_sigpool_end(&hp);
1647         return 0;
1648
1649 clear_hash:
1650         tcp_sigpool_end(&hp);
1651 clear_hash_nostart:
1652         memset(md5_hash, 0, 16);
1653         return 1;
1654 }
1655 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1656
1657 #endif
1658
1659 static void tcp_v4_init_req(struct request_sock *req,
1660                             const struct sock *sk_listener,
1661                             struct sk_buff *skb)
1662 {
1663         struct inet_request_sock *ireq = inet_rsk(req);
1664         struct net *net = sock_net(sk_listener);
1665
1666         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1667         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1668         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1669 }
1670
1671 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1672                                           struct sk_buff *skb,
1673                                           struct flowi *fl,
1674                                           struct request_sock *req,
1675                                           u32 tw_isn)
1676 {
1677         tcp_v4_init_req(req, sk, skb);
1678
1679         if (security_inet_conn_request(sk, skb, req))
1680                 return NULL;
1681
1682         return inet_csk_route_req(sk, &fl->u.ip4, req);
1683 }
1684
1685 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1686         .family         =       PF_INET,
1687         .obj_size       =       sizeof(struct tcp_request_sock),
1688         .rtx_syn_ack    =       tcp_rtx_synack,
1689         .send_ack       =       tcp_v4_reqsk_send_ack,
1690         .destructor     =       tcp_v4_reqsk_destructor,
1691         .send_reset     =       tcp_v4_send_reset,
1692         .syn_ack_timeout =      tcp_syn_ack_timeout,
1693 };
1694
1695 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1696         .mss_clamp      =       TCP_MSS_DEFAULT,
1697 #ifdef CONFIG_TCP_MD5SIG
1698         .req_md5_lookup =       tcp_v4_md5_lookup,
1699         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1700 #endif
1701 #ifdef CONFIG_TCP_AO
1702         .ao_lookup      =       tcp_v4_ao_lookup_rsk,
1703         .ao_calc_key    =       tcp_v4_ao_calc_key_rsk,
1704         .ao_synack_hash =       tcp_v4_ao_synack_hash,
1705 #endif
1706 #ifdef CONFIG_SYN_COOKIES
1707         .cookie_init_seq =      cookie_v4_init_sequence,
1708 #endif
1709         .route_req      =       tcp_v4_route_req,
1710         .init_seq       =       tcp_v4_init_seq,
1711         .init_ts_off    =       tcp_v4_init_ts_off,
1712         .send_synack    =       tcp_v4_send_synack,
1713 };
1714
1715 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1716 {
1717         /* Never answer SYNs sent to broadcast or multicast */
1718         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1719                 goto drop;
1720
1721         return tcp_conn_request(&tcp_request_sock_ops,
1722                                 &tcp_request_sock_ipv4_ops, sk, skb);
1723
1724 drop:
1725         tcp_listendrop(sk);
1726         return 0;
1727 }
1728 EXPORT_SYMBOL(tcp_v4_conn_request);
1729
1730
1731 /*
1732  * The three-way handshake has completed - we got a valid synack -
1733  * now create the new socket.
1734  */
1735 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1736                                   struct request_sock *req,
1737                                   struct dst_entry *dst,
1738                                   struct request_sock *req_unhash,
1739                                   bool *own_req)
1740 {
1741         struct inet_request_sock *ireq;
1742         bool found_dup_sk = false;
1743         struct inet_sock *newinet;
1744         struct tcp_sock *newtp;
1745         struct sock *newsk;
1746 #ifdef CONFIG_TCP_MD5SIG
1747         const union tcp_md5_addr *addr;
1748         struct tcp_md5sig_key *key;
1749         int l3index;
1750 #endif
1751         struct ip_options_rcu *inet_opt;
1752
1753         if (sk_acceptq_is_full(sk))
1754                 goto exit_overflow;
1755
1756         newsk = tcp_create_openreq_child(sk, req, skb);
1757         if (!newsk)
1758                 goto exit_nonewsk;
1759
1760         newsk->sk_gso_type = SKB_GSO_TCPV4;
1761         inet_sk_rx_dst_set(newsk, skb);
1762
1763         newtp                 = tcp_sk(newsk);
1764         newinet               = inet_sk(newsk);
1765         ireq                  = inet_rsk(req);
1766         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1767         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1768         newsk->sk_bound_dev_if = ireq->ir_iif;
1769         newinet->inet_saddr   = ireq->ir_loc_addr;
1770         inet_opt              = rcu_dereference(ireq->ireq_opt);
1771         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1772         newinet->mc_index     = inet_iif(skb);
1773         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1774         newinet->rcv_tos      = ip_hdr(skb)->tos;
1775         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1776         if (inet_opt)
1777                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1778         atomic_set(&newinet->inet_id, get_random_u16());
1779
1780         /* Set ToS of the new socket based upon the value of incoming SYN.
1781          * ECT bits are set later in tcp_init_transfer().
1782          */
1783         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1784                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1785
1786         if (!dst) {
1787                 dst = inet_csk_route_child_sock(sk, newsk, req);
1788                 if (!dst)
1789                         goto put_and_exit;
1790         } else {
1791                 /* syncookie case: see end of cookie_v4_check() */
1792         }
1793         sk_setup_caps(newsk, dst);
1794
1795         tcp_ca_openreq_child(newsk, dst);
1796
1797         tcp_sync_mss(newsk, dst_mtu(dst));
1798         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1799
1800         tcp_initialize_rcv_mss(newsk);
1801
1802 #ifdef CONFIG_TCP_MD5SIG
1803         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1804         /* Copy over the MD5 key from the original socket */
1805         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1806         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1807         if (key && !tcp_rsk_used_ao(req)) {
1808                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1809                         goto put_and_exit;
1810                 sk_gso_disable(newsk);
1811         }
1812 #endif
1813 #ifdef CONFIG_TCP_AO
1814         if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1815                 goto put_and_exit; /* OOM, release back memory */
1816 #endif
1817
1818         if (__inet_inherit_port(sk, newsk) < 0)
1819                 goto put_and_exit;
1820         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1821                                        &found_dup_sk);
1822         if (likely(*own_req)) {
1823                 tcp_move_syn(newtp, req);
1824                 ireq->ireq_opt = NULL;
1825         } else {
1826                 newinet->inet_opt = NULL;
1827
1828                 if (!req_unhash && found_dup_sk) {
1829                         /* This code path should only be executed in the
1830                          * syncookie case
1831                          */
1832                         bh_unlock_sock(newsk);
1833                         sock_put(newsk);
1834                         newsk = NULL;
1835                 }
1836         }
1837         return newsk;
1838
1839 exit_overflow:
1840         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1841 exit_nonewsk:
1842         dst_release(dst);
1843 exit:
1844         tcp_listendrop(sk);
1845         return NULL;
1846 put_and_exit:
1847         newinet->inet_opt = NULL;
1848         inet_csk_prepare_forced_close(newsk);
1849         tcp_done(newsk);
1850         goto exit;
1851 }
1852 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1853
1854 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1855 {
1856 #ifdef CONFIG_SYN_COOKIES
1857         const struct tcphdr *th = tcp_hdr(skb);
1858
1859         if (!th->syn)
1860                 sk = cookie_v4_check(sk, skb);
1861 #endif
1862         return sk;
1863 }
1864
1865 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1866                          struct tcphdr *th, u32 *cookie)
1867 {
1868         u16 mss = 0;
1869 #ifdef CONFIG_SYN_COOKIES
1870         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1871                                     &tcp_request_sock_ipv4_ops, sk, th);
1872         if (mss) {
1873                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1874                 tcp_synq_overflow(sk);
1875         }
1876 #endif
1877         return mss;
1878 }
1879
1880 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1881                                                            u32));
1882 /* The socket must have its spinlock held when we get
1883  * here, unless it is a TCP_LISTEN socket.
1884  *
1885  * We have a potential double-lock case here, so even when
1886  * doing backlog processing we use the BH locking scheme.
1887  * This is because we cannot sleep with the original spinlock
1888  * held.
1889  */
1890 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1891 {
1892         enum skb_drop_reason reason;
1893         struct sock *rsk;
1894
1895         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1896                 struct dst_entry *dst;
1897
1898                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1899                                                 lockdep_sock_is_held(sk));
1900
1901                 sock_rps_save_rxhash(sk, skb);
1902                 sk_mark_napi_id(sk, skb);
1903                 if (dst) {
1904                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1905                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1906                                              dst, 0)) {
1907                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1908                                 dst_release(dst);
1909                         }
1910                 }
1911                 tcp_rcv_established(sk, skb);
1912                 return 0;
1913         }
1914
1915         if (tcp_checksum_complete(skb))
1916                 goto csum_err;
1917
1918         if (sk->sk_state == TCP_LISTEN) {
1919                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1920
1921                 if (!nsk)
1922                         return 0;
1923                 if (nsk != sk) {
1924                         reason = tcp_child_process(sk, nsk, skb);
1925                         if (reason) {
1926                                 rsk = nsk;
1927                                 goto reset;
1928                         }
1929                         return 0;
1930                 }
1931         } else
1932                 sock_rps_save_rxhash(sk, skb);
1933
1934         reason = tcp_rcv_state_process(sk, skb);
1935         if (reason) {
1936                 rsk = sk;
1937                 goto reset;
1938         }
1939         return 0;
1940
1941 reset:
1942         tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1943 discard:
1944         sk_skb_reason_drop(sk, skb, reason);
1945         /* Be careful here. If this function gets more complicated and
1946          * gcc suffers from register pressure on the x86, sk (in %ebx)
1947          * might be destroyed here. This current version compiles correctly,
1948          * but you have been warned.
1949          */
1950         return 0;
1951
1952 csum_err:
1953         reason = SKB_DROP_REASON_TCP_CSUM;
1954         trace_tcp_bad_csum(skb);
1955         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1956         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1957         goto discard;
1958 }
1959 EXPORT_SYMBOL(tcp_v4_do_rcv);
1960
1961 int tcp_v4_early_demux(struct sk_buff *skb)
1962 {
1963         struct net *net = dev_net(skb->dev);
1964         const struct iphdr *iph;
1965         const struct tcphdr *th;
1966         struct sock *sk;
1967
1968         if (skb->pkt_type != PACKET_HOST)
1969                 return 0;
1970
1971         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1972                 return 0;
1973
1974         iph = ip_hdr(skb);
1975         th = tcp_hdr(skb);
1976
1977         if (th->doff < sizeof(struct tcphdr) / 4)
1978                 return 0;
1979
1980         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1981                                        iph->saddr, th->source,
1982                                        iph->daddr, ntohs(th->dest),
1983                                        skb->skb_iif, inet_sdif(skb));
1984         if (sk) {
1985                 skb->sk = sk;
1986                 skb->destructor = sock_edemux;
1987                 if (sk_fullsock(sk)) {
1988                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1989
1990                         if (dst)
1991                                 dst = dst_check(dst, 0);
1992                         if (dst &&
1993                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1994                                 skb_dst_set_noref(skb, dst);
1995                 }
1996         }
1997         return 0;
1998 }
1999
2000 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2001                      enum skb_drop_reason *reason)
2002 {
2003         u32 tail_gso_size, tail_gso_segs;
2004         struct skb_shared_info *shinfo;
2005         const struct tcphdr *th;
2006         struct tcphdr *thtail;
2007         struct sk_buff *tail;
2008         unsigned int hdrlen;
2009         bool fragstolen;
2010         u32 gso_segs;
2011         u32 gso_size;
2012         u64 limit;
2013         int delta;
2014
2015         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2016          * we can fix skb->truesize to its real value to avoid future drops.
2017          * This is valid because skb is not yet charged to the socket.
2018          * It has been noticed that pure SACK packets were sometimes dropped
2019          * (if cooked by drivers without the copybreak feature).
2020          */
2021         skb_condense(skb);
2022
2023         skb_dst_drop(skb);
2024
2025         if (unlikely(tcp_checksum_complete(skb))) {
2026                 bh_unlock_sock(sk);
2027                 trace_tcp_bad_csum(skb);
2028                 *reason = SKB_DROP_REASON_TCP_CSUM;
2029                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2030                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2031                 return true;
2032         }
2033
2034         /* Attempt coalescing to last skb in backlog, even if we are
2035          * above the limits.
2036          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2037          */
2038         th = (const struct tcphdr *)skb->data;
2039         hdrlen = th->doff * 4;
2040
2041         tail = sk->sk_backlog.tail;
2042         if (!tail)
2043                 goto no_coalesce;
2044         thtail = (struct tcphdr *)tail->data;
2045
2046         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2047             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2048             ((TCP_SKB_CB(tail)->tcp_flags |
2049               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2050             !((TCP_SKB_CB(tail)->tcp_flags &
2051               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2052             ((TCP_SKB_CB(tail)->tcp_flags ^
2053               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2054             !tcp_skb_can_collapse_rx(tail, skb) ||
2055             thtail->doff != th->doff ||
2056             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2057                 goto no_coalesce;
2058
2059         __skb_pull(skb, hdrlen);
2060
2061         shinfo = skb_shinfo(skb);
2062         gso_size = shinfo->gso_size ?: skb->len;
2063         gso_segs = shinfo->gso_segs ?: 1;
2064
2065         shinfo = skb_shinfo(tail);
2066         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2067         tail_gso_segs = shinfo->gso_segs ?: 1;
2068
2069         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2070                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2071
2072                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2073                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2074                         thtail->window = th->window;
2075                 }
2076
2077                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2078                  * thtail->fin, so that the fast path in tcp_rcv_established()
2079                  * is not entered if we append a packet with a FIN.
2080                  * SYN, RST, URG are not present.
2081                  * ACK is set on both packets.
2082                  * PSH: we do not really care in the TCP stack,
2083                  *       at least for 'GRO' packets.
2084                  */
2085                 thtail->fin |= th->fin;
2086                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2087
2088                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2089                         TCP_SKB_CB(tail)->has_rxtstamp = true;
2090                         tail->tstamp = skb->tstamp;
2091                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2092                 }
2093
2094                 /* Not as strict as GRO. We only need to carry the max mss value */
2095                 shinfo->gso_size = max(gso_size, tail_gso_size);
2096                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2097
2098                 sk->sk_backlog.len += delta;
2099                 __NET_INC_STATS(sock_net(sk),
2100                                 LINUX_MIB_TCPBACKLOGCOALESCE);
2101                 kfree_skb_partial(skb, fragstolen);
2102                 return false;
2103         }
2104         __skb_push(skb, hdrlen);
2105
2106 no_coalesce:
2107         /* sk->sk_backlog.len is reset only at the end of __release_sock().
2108          * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2109          * sk_rcvbuf in normal conditions.
2110          */
2111         limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2112
2113         limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2114
2115         /* Only the socket owner can try to collapse/prune rx queues
2116          * to reduce memory overhead, so add a little headroom here.
2117          * Few socket backlogs are likely to be non-empty concurrently.
2118          */
2119         limit += 64 * 1024;
2120
2121         limit = min_t(u64, limit, UINT_MAX);
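        /* Editor's worked example (hypothetical numbers): with
         * sk_rcvbuf = 131072 and sk_sndbuf = 16384, the limit is
         * 2 * 131072 + 16384 / 2 + 65536 = 335872 bytes; the UINT_MAX
         * clamp only matters for very large buffer sizes.
         */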
2122
2123         if (unlikely(sk_add_backlog(sk, skb, limit))) {
2124                 bh_unlock_sock(sk);
2125                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2126                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2127                 return true;
2128         }
2129         return false;
2130 }
2131 EXPORT_SYMBOL(tcp_add_backlog);
2132
2133 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2134 {
2135         struct tcphdr *th = (struct tcphdr *)skb->data;
2136
2137         return sk_filter_trim_cap(sk, skb, th->doff * 4);
2138 }
2139 EXPORT_SYMBOL(tcp_filter);
2140
2141 static void tcp_v4_restore_cb(struct sk_buff *skb)
2142 {
2143         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2144                 sizeof(struct inet_skb_parm));
2145 }
2146
2147 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2148                            const struct tcphdr *th)
2149 {
2150         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2151          * barrier() makes sure the compiler won't play aliasing games.
2152          */
2153         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2154                 sizeof(struct inet_skb_parm));
2155         barrier();
2156
2157         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2158         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2159                                     skb->len - th->doff * 4);
2160         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2161         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2162         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2163         TCP_SKB_CB(skb)->sacked  = 0;
2164         TCP_SKB_CB(skb)->has_rxtstamp =
2165                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2166 }
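/* Editor's worked example: end_seq counts SYN and FIN as one sequence number
 * each.  For a hypothetical segment with seq = 1000, 500 bytes of payload and
 * the FIN flag set, the computation above gives
 * end_seq = 1000 + 0 + 1 + 500 = 1501.
 */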
2167
2168 /*
2169  *      From tcp_input.c
2170  */
2171
2172 int tcp_v4_rcv(struct sk_buff *skb)
2173 {
2174         struct net *net = dev_net(skb->dev);
2175         enum skb_drop_reason drop_reason;
2176         int sdif = inet_sdif(skb);
2177         int dif = inet_iif(skb);
2178         const struct iphdr *iph;
2179         const struct tcphdr *th;
2180         struct sock *sk = NULL;
2181         bool refcounted;
2182         int ret;
2183         u32 isn;
2184
2185         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2186         if (skb->pkt_type != PACKET_HOST)
2187                 goto discard_it;
2188
2189         /* Count it even if it's bad */
2190         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2191
2192         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2193                 goto discard_it;
2194
2195         th = (const struct tcphdr *)skb->data;
2196
2197         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2198                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2199                 goto bad_packet;
2200         }
2201         if (!pskb_may_pull(skb, th->doff * 4))
2202                 goto discard_it;
2203
2204         /* An explanation is required here, I think.
2205          * Packet length and doff are validated by header prediction,
2206          * provided the case of th->doff == 0 is eliminated.
2207          * So, we defer the checks. */
2208
2209         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2210                 goto csum_error;
2211
2212         th = (const struct tcphdr *)skb->data;
2213         iph = ip_hdr(skb);
2214 lookup:
2215         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2216                                skb, __tcp_hdrlen(th), th->source,
2217                                th->dest, sdif, &refcounted);
2218         if (!sk)
2219                 goto no_tcp_socket;
2220
2221         if (sk->sk_state == TCP_TIME_WAIT)
2222                 goto do_time_wait;
2223
2224         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2225                 struct request_sock *req = inet_reqsk(sk);
2226                 bool req_stolen = false;
2227                 struct sock *nsk;
2228
2229                 sk = req->rsk_listener;
2230                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2231                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2232                 else
2233                         drop_reason = tcp_inbound_hash(sk, req, skb,
2234                                                        &iph->saddr, &iph->daddr,
2235                                                        AF_INET, dif, sdif);
2236                 if (unlikely(drop_reason)) {
2237                         sk_drops_add(sk, skb);
2238                         reqsk_put(req);
2239                         goto discard_it;
2240                 }
2241                 if (tcp_checksum_complete(skb)) {
2242                         reqsk_put(req);
2243                         goto csum_error;
2244                 }
2245                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2246                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2247                         if (!nsk) {
2248                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2249                                 goto lookup;
2250                         }
2251                         sk = nsk;
2252                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2253                          * before returning.
2254                          */
2255                 } else {
2256                         /* We own a reference on the listener, increase it again
2257                          * as we might lose it too soon.
2258                          */
2259                         sock_hold(sk);
2260                 }
2261                 refcounted = true;
2262                 nsk = NULL;
2263                 if (!tcp_filter(sk, skb)) {
2264                         th = (const struct tcphdr *)skb->data;
2265                         iph = ip_hdr(skb);
2266                         tcp_v4_fill_cb(skb, iph, th);
2267                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2268                 } else {
2269                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2270                 }
2271                 if (!nsk) {
2272                         reqsk_put(req);
2273                         if (req_stolen) {
2274                                 /* Another cpu got exclusive access to req
2275                                  * and created a full-blown socket.
2276                                  * Try to feed this packet to that socket
2277                                  * instead of discarding it.
2278                                  */
2279                                 tcp_v4_restore_cb(skb);
2280                                 sock_put(sk);
2281                                 goto lookup;
2282                         }
2283                         goto discard_and_relse;
2284                 }
2285                 nf_reset_ct(skb);
2286                 if (nsk == sk) {
2287                         reqsk_put(req);
2288                         tcp_v4_restore_cb(skb);
2289                 } else {
2290                         drop_reason = tcp_child_process(sk, nsk, skb);
2291                         if (drop_reason) {
2292                                 enum sk_rst_reason rst_reason;
2293
2294                                 rst_reason = sk_rst_convert_drop_reason(drop_reason);
2295                                 tcp_v4_send_reset(nsk, skb, rst_reason);
2296                                 goto discard_and_relse;
2297                         }
2298                         sock_put(sk);
2299                         return 0;
2300                 }
2301         }
2302
2303 process:
2304         if (static_branch_unlikely(&ip4_min_ttl)) {
2305                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2306                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2307                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2308                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2309                         goto discard_and_relse;
2310                 }
2311         }
2312
2313         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2314                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2315                 goto discard_and_relse;
2316         }
2317
2318         drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2319                                        AF_INET, dif, sdif);
2320         if (drop_reason)
2321                 goto discard_and_relse;
2322
2323         nf_reset_ct(skb);
2324
2325         if (tcp_filter(sk, skb)) {
2326                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2327                 goto discard_and_relse;
2328         }
2329         th = (const struct tcphdr *)skb->data;
2330         iph = ip_hdr(skb);
2331         tcp_v4_fill_cb(skb, iph, th);
2332
2333         skb->dev = NULL;
2334
2335         if (sk->sk_state == TCP_LISTEN) {
2336                 ret = tcp_v4_do_rcv(sk, skb);
2337                 goto put_and_return;
2338         }
2339
2340         sk_incoming_cpu_update(sk);
2341
2342         bh_lock_sock_nested(sk);
2343         tcp_segs_in(tcp_sk(sk), skb);
2344         ret = 0;
2345         if (!sock_owned_by_user(sk)) {
2346                 ret = tcp_v4_do_rcv(sk, skb);
2347         } else {
2348                 if (tcp_add_backlog(sk, skb, &drop_reason))
2349                         goto discard_and_relse;
2350         }
2351         bh_unlock_sock(sk);
2352
2353 put_and_return:
2354         if (refcounted)
2355                 sock_put(sk);
2356
2357         return ret;
2358
2359 no_tcp_socket:
2360         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2361         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2362                 goto discard_it;
2363
2364         tcp_v4_fill_cb(skb, iph, th);
2365
2366         if (tcp_checksum_complete(skb)) {
2367 csum_error:
2368                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2369                 trace_tcp_bad_csum(skb);
2370                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2371 bad_packet:
2372                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2373         } else {
2374                 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2375         }
2376
2377 discard_it:
2378         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2379         /* Discard frame. */
2380         sk_skb_reason_drop(sk, skb, drop_reason);
2381         return 0;
2382
2383 discard_and_relse:
2384         sk_drops_add(sk, skb);
2385         if (refcounted)
2386                 sock_put(sk);
2387         goto discard_it;
2388
2389 do_time_wait:
2390         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2391                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2392                 inet_twsk_put(inet_twsk(sk));
2393                 goto discard_it;
2394         }
2395
2396         tcp_v4_fill_cb(skb, iph, th);
2397
2398         if (tcp_checksum_complete(skb)) {
2399                 inet_twsk_put(inet_twsk(sk));
2400                 goto csum_error;
2401         }
2402         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2403         case TCP_TW_SYN: {
2404                 struct sock *sk2 = inet_lookup_listener(net,
2405                                                         net->ipv4.tcp_death_row.hashinfo,
2406                                                         skb, __tcp_hdrlen(th),
2407                                                         iph->saddr, th->source,
2408                                                         iph->daddr, th->dest,
2409                                                         inet_iif(skb),
2410                                                         sdif);
2411                 if (sk2) {
2412                         inet_twsk_deschedule_put(inet_twsk(sk));
2413                         sk = sk2;
2414                         tcp_v4_restore_cb(skb);
2415                         refcounted = false;
2416                         __this_cpu_write(tcp_tw_isn, isn);
2417                         goto process;
2418                 }
2419         }
2420                 /* to ACK */
2421                 fallthrough;
2422         case TCP_TW_ACK:
2423                 tcp_v4_timewait_ack(sk, skb);
2424                 break;
2425         case TCP_TW_RST:
2426                 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2427                 inet_twsk_deschedule_put(inet_twsk(sk));
2428                 goto discard_it;
2429         case TCP_TW_SUCCESS:;
2430         }
2431         goto discard_it;
2432 }
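/* Editor's summary note: tcp_v4_rcv() above is the IPv4 entry point from the
 * IP layer.  After header and checksum validation it looks the segment up in
 * the established/listener hash tables and dispatches it: TCP_NEW_SYN_RECV
 * request sockets go through tcp_check_req(), TIME_WAIT sockets through
 * tcp_timewait_state_process(), and everything else is handed to
 * tcp_v4_do_rcv() directly when the socket is not owned by user space, or
 * queued via tcp_add_backlog() when it is.
 */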
2433
2434 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2435         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2436         .twsk_destructor= tcp_twsk_destructor,
2437 };
2438
2439 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2440 {
2441         struct dst_entry *dst = skb_dst(skb);
2442
2443         if (dst && dst_hold_safe(dst)) {
2444                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2445                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2446         }
2447 }
2448 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2449
2450 const struct inet_connection_sock_af_ops ipv4_specific = {
2451         .queue_xmit        = ip_queue_xmit,
2452         .send_check        = tcp_v4_send_check,
2453         .rebuild_header    = inet_sk_rebuild_header,
2454         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2455         .conn_request      = tcp_v4_conn_request,
2456         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2457         .net_header_len    = sizeof(struct iphdr),
2458         .setsockopt        = ip_setsockopt,
2459         .getsockopt        = ip_getsockopt,
2460         .addr2sockaddr     = inet_csk_addr2sockaddr,
2461         .sockaddr_len      = sizeof(struct sockaddr_in),
2462         .mtu_reduced       = tcp_v4_mtu_reduced,
2463 };
2464 EXPORT_SYMBOL(ipv4_specific);
2465
2466 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2467 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2468 #ifdef CONFIG_TCP_MD5SIG
2469         .md5_lookup             = tcp_v4_md5_lookup,
2470         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2471         .md5_parse              = tcp_v4_parse_md5_keys,
2472 #endif
2473 #ifdef CONFIG_TCP_AO
2474         .ao_lookup              = tcp_v4_ao_lookup,
2475         .calc_ao_hash           = tcp_v4_ao_hash_skb,
2476         .ao_parse               = tcp_v4_parse_ao,
2477         .ao_calc_key_sk         = tcp_v4_ao_calc_key_sk,
2478 #endif
2479 };
2480 #endif
2481
2482 /* NOTE: A lot of fields are explicitly zeroed by the call to
2483  *       sk_alloc(), so they need not be initialized here.
2484  */
2485 static int tcp_v4_init_sock(struct sock *sk)
2486 {
2487         struct inet_connection_sock *icsk = inet_csk(sk);
2488
2489         tcp_init_sock(sk);
2490
2491         icsk->icsk_af_ops = &ipv4_specific;
2492
2493 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2494         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2495 #endif
2496
2497         return 0;
2498 }
2499
2500 #ifdef CONFIG_TCP_MD5SIG
2501 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2502 {
2503         struct tcp_md5sig_info *md5sig;
2504
2505         md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2506         kfree(md5sig);
2507         static_branch_slow_dec_deferred(&tcp_md5_needed);
2508         tcp_md5_release_sigpool();
2509 }
2510 #endif
2511
2512 void tcp_v4_destroy_sock(struct sock *sk)
2513 {
2514         struct tcp_sock *tp = tcp_sk(sk);
2515
2516         trace_tcp_destroy_sock(sk);
2517
2518         tcp_clear_xmit_timers(sk);
2519
2520         tcp_cleanup_congestion_control(sk);
2521
2522         tcp_cleanup_ulp(sk);
2523
2524         /* Clean up the write buffer. */
2525         tcp_write_queue_purge(sk);
2526
2527         /* Check if we want to disable active TFO */
2528         tcp_fastopen_active_disable_ofo_check(sk);
2529
2530         /* Cleans up our, hopefully empty, out_of_order_queue. */
2531         skb_rbtree_purge(&tp->out_of_order_queue);
2532
2533 #ifdef CONFIG_TCP_MD5SIG
2534         /* Clean up the MD5 key list, if any */
2535         if (tp->md5sig_info) {
2536                 struct tcp_md5sig_info *md5sig;
2537
2538                 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2539                 tcp_clear_md5_list(sk);
2540                 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2541                 rcu_assign_pointer(tp->md5sig_info, NULL);
2542         }
2543 #endif
2544         tcp_ao_destroy_sock(sk, false);
2545
2546         /* Clean up a referenced TCP bind bucket. */
2547         if (inet_csk(sk)->icsk_bind_hash)
2548                 inet_put_port(sk);
2549
2550         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2551
2552         /* If the socket is aborted during a connect operation */
2553         tcp_free_fastopen_req(tp);
2554         tcp_fastopen_destroy_cipher(sk);
2555         tcp_saved_syn_free(tp);
2556
2557         sk_sockets_allocated_dec(sk);
2558 }
2559 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2560
2561 #ifdef CONFIG_PROC_FS
2562 /* Proc filesystem TCP sock list dumping. */
2563
2564 static unsigned short seq_file_family(const struct seq_file *seq);
2565
2566 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2567 {
2568         unsigned short family = seq_file_family(seq);
2569
2570         /* AF_UNSPEC is used as a match all */
2571         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2572                 net_eq(sock_net(sk), seq_file_net(seq)));
2573 }
2574
2575 /* Find a non-empty bucket (starting from st->bucket)
2576  * and return the first sk from it.
2577  */
2578 static void *listening_get_first(struct seq_file *seq)
2579 {
2580         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2581         struct tcp_iter_state *st = seq->private;
2582
2583         st->offset = 0;
2584         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2585                 struct inet_listen_hashbucket *ilb2;
2586                 struct hlist_nulls_node *node;
2587                 struct sock *sk;
2588
2589                 ilb2 = &hinfo->lhash2[st->bucket];
2590                 if (hlist_nulls_empty(&ilb2->nulls_head))
2591                         continue;
2592
2593                 spin_lock(&ilb2->lock);
2594                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2595                         if (seq_sk_match(seq, sk))
2596                                 return sk;
2597                 }
2598                 spin_unlock(&ilb2->lock);
2599         }
2600
2601         return NULL;
2602 }
2603
2604 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2605  * If "cur" is the last one in the st->bucket,
2606  * call listening_get_first() to return the first sk of the next
2607  * non empty bucket.
2608  * non-empty bucket.
2609 static void *listening_get_next(struct seq_file *seq, void *cur)
2610 {
2611         struct tcp_iter_state *st = seq->private;
2612         struct inet_listen_hashbucket *ilb2;
2613         struct hlist_nulls_node *node;
2614         struct inet_hashinfo *hinfo;
2615         struct sock *sk = cur;
2616
2617         ++st->num;
2618         ++st->offset;
2619
2620         sk = sk_nulls_next(sk);
2621         sk_nulls_for_each_from(sk, node) {
2622                 if (seq_sk_match(seq, sk))
2623                         return sk;
2624         }
2625
2626         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2627         ilb2 = &hinfo->lhash2[st->bucket];
2628         spin_unlock(&ilb2->lock);
2629         ++st->bucket;
2630         return listening_get_first(seq);
2631 }
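/* Editor's note: the lhash2 bucket lock taken in listening_get_first() is
 * deliberately kept held while entries from that bucket are being shown; it
 * is only dropped here when iteration moves on to the next bucket, or in
 * tcp_seq_stop() below if the dump stops mid-bucket.
 */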
2632
2633 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2634 {
2635         struct tcp_iter_state *st = seq->private;
2636         void *rc;
2637
2638         st->bucket = 0;
2639         st->offset = 0;
2640         rc = listening_get_first(seq);
2641
2642         while (rc && *pos) {
2643                 rc = listening_get_next(seq, rc);
2644                 --*pos;
2645         }
2646         return rc;
2647 }
2648
2649 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2650                                 const struct tcp_iter_state *st)
2651 {
2652         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2653 }
2654
2655 /*
2656  * Get first established socket starting from bucket given in st->bucket.
2657  * If st->bucket is zero, the very first socket in the hash is returned.
2658  */
2659 static void *established_get_first(struct seq_file *seq)
2660 {
2661         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2662         struct tcp_iter_state *st = seq->private;
2663
2664         st->offset = 0;
2665         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2666                 struct sock *sk;
2667                 struct hlist_nulls_node *node;
2668                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2669
2670                 cond_resched();
2671
2672                 /* Lockless fast path for the common case of empty buckets */
2673                 if (empty_bucket(hinfo, st))
2674                         continue;
2675
2676                 spin_lock_bh(lock);
2677                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2678                         if (seq_sk_match(seq, sk))
2679                                 return sk;
2680                 }
2681                 spin_unlock_bh(lock);
2682         }
2683
2684         return NULL;
2685 }
2686
2687 static void *established_get_next(struct seq_file *seq, void *cur)
2688 {
2689         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2690         struct tcp_iter_state *st = seq->private;
2691         struct hlist_nulls_node *node;
2692         struct sock *sk = cur;
2693
2694         ++st->num;
2695         ++st->offset;
2696
2697         sk = sk_nulls_next(sk);
2698
2699         sk_nulls_for_each_from(sk, node) {
2700                 if (seq_sk_match(seq, sk))
2701                         return sk;
2702         }
2703
2704         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2705         ++st->bucket;
2706         return established_get_first(seq);
2707 }
2708
2709 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2710 {
2711         struct tcp_iter_state *st = seq->private;
2712         void *rc;
2713
2714         st->bucket = 0;
2715         rc = established_get_first(seq);
2716
2717         while (rc && pos) {
2718                 rc = established_get_next(seq, rc);
2719                 --pos;
2720         }
2721         return rc;
2722 }
2723
2724 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2725 {
2726         void *rc;
2727         struct tcp_iter_state *st = seq->private;
2728
2729         st->state = TCP_SEQ_STATE_LISTENING;
2730         rc        = listening_get_idx(seq, &pos);
2731
2732         if (!rc) {
2733                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2734                 rc        = established_get_idx(seq, pos);
2735         }
2736
2737         return rc;
2738 }
2739
2740 static void *tcp_seek_last_pos(struct seq_file *seq)
2741 {
2742         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2743         struct tcp_iter_state *st = seq->private;
2744         int bucket = st->bucket;
2745         int offset = st->offset;
2746         int orig_num = st->num;
2747         void *rc = NULL;
2748
2749         switch (st->state) {
2750         case TCP_SEQ_STATE_LISTENING:
2751                 if (st->bucket > hinfo->lhash2_mask)
2752                         break;
2753                 rc = listening_get_first(seq);
2754                 while (offset-- && rc && bucket == st->bucket)
2755                         rc = listening_get_next(seq, rc);
2756                 if (rc)
2757                         break;
2758                 st->bucket = 0;
2759                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2760                 fallthrough;
2761         case TCP_SEQ_STATE_ESTABLISHED:
2762                 if (st->bucket > hinfo->ehash_mask)
2763                         break;
2764                 rc = established_get_first(seq);
2765                 while (offset-- && rc && bucket == st->bucket)
2766                         rc = established_get_next(seq, rc);
2767         }
2768
2769         st->num = orig_num;
2770
2771         return rc;
2772 }
2773
2774 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2775 {
2776         struct tcp_iter_state *st = seq->private;
2777         void *rc;
2778
2779         if (*pos && *pos == st->last_pos) {
2780                 rc = tcp_seek_last_pos(seq);
2781                 if (rc)
2782                         goto out;
2783         }
2784
2785         st->state = TCP_SEQ_STATE_LISTENING;
2786         st->num = 0;
2787         st->bucket = 0;
2788         st->offset = 0;
2789         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2790
2791 out:
2792         st->last_pos = *pos;
2793         return rc;
2794 }
2795 EXPORT_SYMBOL(tcp_seq_start);
2796
2797 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2798 {
2799         struct tcp_iter_state *st = seq->private;
2800         void *rc = NULL;
2801
2802         if (v == SEQ_START_TOKEN) {
2803                 rc = tcp_get_idx(seq, 0);
2804                 goto out;
2805         }
2806
2807         switch (st->state) {
2808         case TCP_SEQ_STATE_LISTENING:
2809                 rc = listening_get_next(seq, v);
2810                 if (!rc) {
2811                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2812                         st->bucket = 0;
2813                         st->offset = 0;
2814                         rc        = established_get_first(seq);
2815                 }
2816                 break;
2817         case TCP_SEQ_STATE_ESTABLISHED:
2818                 rc = established_get_next(seq, v);
2819                 break;
2820         }
2821 out:
2822         ++*pos;
2823         st->last_pos = *pos;
2824         return rc;
2825 }
2826 EXPORT_SYMBOL(tcp_seq_next);
2827
2828 void tcp_seq_stop(struct seq_file *seq, void *v)
2829 {
2830         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2831         struct tcp_iter_state *st = seq->private;
2832
2833         switch (st->state) {
2834         case TCP_SEQ_STATE_LISTENING:
2835                 if (v != SEQ_START_TOKEN)
2836                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2837                 break;
2838         case TCP_SEQ_STATE_ESTABLISHED:
2839                 if (v)
2840                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2841                 break;
2842         }
2843 }
2844 EXPORT_SYMBOL(tcp_seq_stop);
2845
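/* Format one /proc/net/tcp line for a request socket still in
 * TCP_SYN_RECV.  Only the fields that exist at this stage carry data:
 * the queue sizes, inode and the non-standard timer column are printed
 * as zeroes, and the uid comes from the listener socket.
 */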
2846 static void get_openreq4(const struct request_sock *req,
2847                          struct seq_file *f, int i)
2848 {
2849         const struct inet_request_sock *ireq = inet_rsk(req);
2850         long delta = req->rsk_timer.expires - jiffies;
2851
2852         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2853                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2854                 i,
2855                 ireq->ir_loc_addr,
2856                 ireq->ir_num,
2857                 ireq->ir_rmt_addr,
2858                 ntohs(ireq->ir_rmt_port),
2859                 TCP_SYN_RECV,
2860                 0, 0, /* could print option size, but that is af dependent. */
2861                 1,    /* timers active (only the expire timer) */
2862                 jiffies_delta_to_clock_t(delta),
2863                 req->num_timeout,
2864                 from_kuid_munged(seq_user_ns(f),
2865                                  sock_i_uid(req->rsk_listener)),
2866                 0,  /* non standard timer */
2867                 0, /* open_requests have no inode */
2868                 0,
2869                 req);
2870 }
2871
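/* Format one /proc/net/tcp line for a full socket.  The "tr" column
 * encodes which timer is pending: 1 for retransmit/loss-probe/REO
 * timeout, 4 for the zero-window probe timer, 2 for the keepalive
 * timer and 0 when nothing is armed; "tm->when" is the remaining time.
 */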
2872 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2873 {
2874         int timer_active;
2875         unsigned long timer_expires;
2876         const struct tcp_sock *tp = tcp_sk(sk);
2877         const struct inet_connection_sock *icsk = inet_csk(sk);
2878         const struct inet_sock *inet = inet_sk(sk);
2879         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2880         __be32 dest = inet->inet_daddr;
2881         __be32 src = inet->inet_rcv_saddr;
2882         __u16 destp = ntohs(inet->inet_dport);
2883         __u16 srcp = ntohs(inet->inet_sport);
2884         int rx_queue;
2885         int state;
2886
2887         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2888             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2889             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2890                 timer_active    = 1;
2891                 timer_expires   = icsk->icsk_timeout;
2892         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2893                 timer_active    = 4;
2894                 timer_expires   = icsk->icsk_timeout;
2895         } else if (timer_pending(&sk->sk_timer)) {
2896                 timer_active    = 2;
2897                 timer_expires   = sk->sk_timer.expires;
2898         } else {
2899                 timer_active    = 0;
2900                 timer_expires = jiffies;
2901         }
2902
2903         state = inet_sk_state_load(sk);
2904         if (state == TCP_LISTEN)
2905                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2906         else
2907                 /* Because we don't lock the socket,
2908                  * we might find a transient negative value.
2909                  */
2910                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2911                                       READ_ONCE(tp->copied_seq), 0);
2912
2913         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2914                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2915                 i, src, srcp, dest, destp, state,
2916                 READ_ONCE(tp->write_seq) - tp->snd_una,
2917                 rx_queue,
2918                 timer_active,
2919                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2920                 icsk->icsk_retransmits,
2921                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2922                 icsk->icsk_probes_out,
2923                 sock_i_ino(sk),
2924                 refcount_read(&sk->sk_refcnt), sk,
2925                 jiffies_to_clock_t(icsk->icsk_rto),
2926                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2927                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2928                 tcp_snd_cwnd(tp),
2929                 state == TCP_LISTEN ?
2930                     fastopenq->max_qlen :
2931                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2932 }
2933
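/* Format one /proc/net/tcp line for a TIME_WAIT socket.  Most of the
 * TCP state is gone by now, so the queue, retransmit, uid, timeout and
 * inode columns are fixed zeroes; only the substate and the remaining
 * timewait timer are meaningful.
 */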
2934 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2935                                struct seq_file *f, int i)
2936 {
2937         long delta = tw->tw_timer.expires - jiffies;
2938         __be32 dest, src;
2939         __u16 destp, srcp;
2940
2941         dest  = tw->tw_daddr;
2942         src   = tw->tw_rcv_saddr;
2943         destp = ntohs(tw->tw_dport);
2944         srcp  = ntohs(tw->tw_sport);
2945
2946         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2947                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2948                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2949                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2950                 refcount_read(&tw->tw_refcnt), tw);
2951 }
2952
2953 #define TMPSZ 150
2954
2955 static int tcp4_seq_show(struct seq_file *seq, void *v)
2956 {
2957         struct tcp_iter_state *st;
2958         struct sock *sk = v;
2959
2960         seq_setwidth(seq, TMPSZ - 1);
2961         if (v == SEQ_START_TOKEN) {
2962                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2963                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2964                            "inode");
2965                 goto out;
2966         }
2967         st = seq->private;
2968
2969         if (sk->sk_state == TCP_TIME_WAIT)
2970                 get_timewait4_sock(v, seq, st->num);
2971         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2972                 get_openreq4(v, seq, st->num);
2973         else
2974                 get_tcp4_sock(v, seq, st->num);
2975 out:
2976         seq_pad(seq, '\n');
2977         return 0;
2978 }
2979
2980 #ifdef CONFIG_BPF_SYSCALL
2981 struct bpf_tcp_iter_state {
2982         struct tcp_iter_state state;
2983         unsigned int cur_sk;
2984         unsigned int end_sk;
2985         unsigned int max_sk;
2986         struct sock **batch;
2987         bool st_bucket_done;
2988 };
2989
2990 struct bpf_iter__tcp {
2991         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2992         __bpf_md_ptr(struct sock_common *, sk_common);
2993         uid_t uid __aligned(8);
2994 };
2995
2996 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2997                              struct sock_common *sk_common, uid_t uid)
2998 {
2999         struct bpf_iter__tcp ctx;
3000
3001         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3002         ctx.meta = meta;
3003         ctx.sk_common = sk_common;
3004         ctx.uid = uid;
3005         return bpf_iter_run_prog(prog, &ctx);
3006 }
3007
3008 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3009 {
3010         while (iter->cur_sk < iter->end_sk)
3011                 sock_gen_put(iter->batch[iter->cur_sk++]);
3012 }
3013
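/* Grow the socket batch to @new_batch_sz entries.  References still held
 * in the old batch are dropped first, so the caller re-reads the current
 * bucket after a resize.
 */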
3014 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3015                                       unsigned int new_batch_sz)
3016 {
3017         struct sock **new_batch;
3018
3019         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3020                              GFP_USER | __GFP_NOWARN);
3021         if (!new_batch)
3022                 return -ENOMEM;
3023
3024         bpf_iter_tcp_put_batch(iter);
3025         kvfree(iter->batch);
3026         iter->batch = new_batch;
3027         iter->max_sk = new_batch_sz;
3028
3029         return 0;
3030 }
3031
3032 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3033                                                  struct sock *start_sk)
3034 {
3035         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3036         struct bpf_tcp_iter_state *iter = seq->private;
3037         struct tcp_iter_state *st = &iter->state;
3038         struct hlist_nulls_node *node;
3039         unsigned int expected = 1;
3040         struct sock *sk;
3041
3042         sock_hold(start_sk);
3043         iter->batch[iter->end_sk++] = start_sk;
3044
3045         sk = sk_nulls_next(start_sk);
3046         sk_nulls_for_each_from(sk, node) {
3047                 if (seq_sk_match(seq, sk)) {
3048                         if (iter->end_sk < iter->max_sk) {
3049                                 sock_hold(sk);
3050                                 iter->batch[iter->end_sk++] = sk;
3051                         }
3052                         expected++;
3053                 }
3054         }
3055         spin_unlock(&hinfo->lhash2[st->bucket].lock);
3056
3057         return expected;
3058 }
3059
3060 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3061                                                    struct sock *start_sk)
3062 {
3063         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3064         struct bpf_tcp_iter_state *iter = seq->private;
3065         struct tcp_iter_state *st = &iter->state;
3066         struct hlist_nulls_node *node;
3067         unsigned int expected = 1;
3068         struct sock *sk;
3069
3070         sock_hold(start_sk);
3071         iter->batch[iter->end_sk++] = start_sk;
3072
3073         sk = sk_nulls_next(start_sk);
3074         sk_nulls_for_each_from(sk, node) {
3075                 if (seq_sk_match(seq, sk)) {
3076                         if (iter->end_sk < iter->max_sk) {
3077                                 sock_hold(sk);
3078                                 iter->batch[iter->end_sk++] = sk;
3079                         }
3080                         expected++;
3081                 }
3082         }
3083         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3084
3085         return expected;
3086 }
3087
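/* Collect every matching socket of the current bucket into iter->batch,
 * holding a reference on each so that the bucket lock can be released
 * before the bpf prog runs on them.  If the bucket contains more sockets
 * than the batch can hold, the batch is grown to 3/2 of the required
 * size and the bucket is re-read once.
 */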
3088 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3089 {
3090         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3091         struct bpf_tcp_iter_state *iter = seq->private;
3092         struct tcp_iter_state *st = &iter->state;
3093         unsigned int expected;
3094         bool resized = false;
3095         struct sock *sk;
3096
3097         /* The st->bucket is done.  Directly advance to the next
3098          * bucket instead of having tcp_seek_last_pos() skip the
3099          * sockets one by one in the current bucket, only to find
3100          * out it has to advance to the next bucket anyway.
3101          */
3102         if (iter->st_bucket_done) {
3103                 st->offset = 0;
3104                 st->bucket++;
3105                 if (st->state == TCP_SEQ_STATE_LISTENING &&
3106                     st->bucket > hinfo->lhash2_mask) {
3107                         st->state = TCP_SEQ_STATE_ESTABLISHED;
3108                         st->bucket = 0;
3109                 }
3110         }
3111
3112 again:
3113         /* Get a new batch */
3114         iter->cur_sk = 0;
3115         iter->end_sk = 0;
3116         iter->st_bucket_done = false;
3117
3118         sk = tcp_seek_last_pos(seq);
3119         if (!sk)
3120                 return NULL; /* Done */
3121
3122         if (st->state == TCP_SEQ_STATE_LISTENING)
3123                 expected = bpf_iter_tcp_listening_batch(seq, sk);
3124         else
3125                 expected = bpf_iter_tcp_established_batch(seq, sk);
3126
3127         if (iter->end_sk == expected) {
3128                 iter->st_bucket_done = true;
3129                 return sk;
3130         }
3131
3132         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3133                 resized = true;
3134                 goto again;
3135         }
3136
3137         return sk;
3138 }
3139
3140 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3141 {
3142         /* bpf iter does not support lseek, so it always
3143          * continues from where it was stop()-ped.
3144          */
3145         if (*pos)
3146                 return bpf_iter_tcp_batch(seq);
3147
3148         return SEQ_START_TOKEN;
3149 }
3150
3151 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3152 {
3153         struct bpf_tcp_iter_state *iter = seq->private;
3154         struct tcp_iter_state *st = &iter->state;
3155         struct sock *sk;
3156
3157         /* Whenever seq_next() is called, the sk at iter->cur_sk has
3158          * already been through seq_show(), so advance to the next sk
3159          * in the batch.
3160          */
3161         if (iter->cur_sk < iter->end_sk) {
3162                 /* Keep st->num consistent in tcp_iter_state.
3163                  * bpf_iter_tcp does not use st->num;
3164                  * meta.seq_num is used instead.
3165                  */
3166                 st->num++;
3167                 /* Move st->offset to the next sk in the bucket such that
3168                  * the future start() will resume at st->offset in
3169                  * st->bucket.  See tcp_seek_last_pos().
3170                  */
3171                 st->offset++;
3172                 sock_gen_put(iter->batch[iter->cur_sk++]);
3173         }
3174
3175         if (iter->cur_sk < iter->end_sk)
3176                 sk = iter->batch[iter->cur_sk];
3177         else
3178                 sk = bpf_iter_tcp_batch(seq);
3179
3180         ++*pos;
3181         /* Keep st->last_pos consistent in tcp_iter_state.
3182          * bpf iter does not do lseek, so st->last_pos always equals *pos.
3183          */
3184         st->last_pos = *pos;
3185         return sk;
3186 }
3187
3188 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3189 {
3190         struct bpf_iter_meta meta;
3191         struct bpf_prog *prog;
3192         struct sock *sk = v;
3193         uid_t uid;
3194         int ret;
3195
3196         if (v == SEQ_START_TOKEN)
3197                 return 0;
3198
3199         if (sk_fullsock(sk))
3200                 lock_sock(sk);
3201
3202         if (unlikely(sk_unhashed(sk))) {
3203                 ret = SEQ_SKIP;
3204                 goto unlock;
3205         }
3206
3207         if (sk->sk_state == TCP_TIME_WAIT) {
3208                 uid = 0;
3209         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3210                 const struct request_sock *req = v;
3211
3212                 uid = from_kuid_munged(seq_user_ns(seq),
3213                                        sock_i_uid(req->rsk_listener));
3214         } else {
3215                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3216         }
3217
3218         meta.seq = seq;
3219         prog = bpf_iter_get_info(&meta, false);
3220         ret = tcp_prog_seq_show(prog, &meta, v, uid);
3221
3222 unlock:
3223         if (sk_fullsock(sk))
3224                 release_sock(sk);
3225         return ret;
3226
3227 }
3228
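/* End of a read() pass.  When the iteration ran to completion (!v), the
 * bpf prog gets one final call with a NULL socket to signal that the
 * dump is complete.  Any sockets still held in the batch are released
 * in either case.
 */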
3229 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3230 {
3231         struct bpf_tcp_iter_state *iter = seq->private;
3232         struct bpf_iter_meta meta;
3233         struct bpf_prog *prog;
3234
3235         if (!v) {
3236                 meta.seq = seq;
3237                 prog = bpf_iter_get_info(&meta, true);
3238                 if (prog)
3239                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3240         }
3241
3242         if (iter->cur_sk < iter->end_sk) {
3243                 bpf_iter_tcp_put_batch(iter);
3244                 iter->st_bucket_done = false;
3245         }
3246 }
3247
3248 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3249         .show           = bpf_iter_tcp_seq_show,
3250         .start          = bpf_iter_tcp_seq_start,
3251         .next           = bpf_iter_tcp_seq_next,
3252         .stop           = bpf_iter_tcp_seq_stop,
3253 };
3254 #endif
3255 static unsigned short seq_file_family(const struct seq_file *seq)
3256 {
3257         const struct tcp_seq_afinfo *afinfo;
3258
3259 #ifdef CONFIG_BPF_SYSCALL
3260         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3261         if (seq->op == &bpf_iter_tcp_seq_ops)
3262                 return AF_UNSPEC;
3263 #endif
3264
3265         /* Iterated from proc fs */
3266         afinfo = pde_data(file_inode(seq->file));
3267         return afinfo->family;
3268 }
3269
3270 static const struct seq_operations tcp4_seq_ops = {
3271         .show           = tcp4_seq_show,
3272         .start          = tcp_seq_start,
3273         .next           = tcp_seq_next,
3274         .stop           = tcp_seq_stop,
3275 };
3276
3277 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3278         .family         = AF_INET,
3279 };
3280
3281 static int __net_init tcp4_proc_init_net(struct net *net)
3282 {
3283         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3284                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3285                 return -ENOMEM;
3286         return 0;
3287 }
3288
3289 static void __net_exit tcp4_proc_exit_net(struct net *net)
3290 {
3291         remove_proc_entry("tcp", net->proc_net);
3292 }
3293
3294 static struct pernet_operations tcp4_net_ops = {
3295         .init = tcp4_proc_init_net,
3296         .exit = tcp4_proc_exit_net,
3297 };
3298
3299 int __init tcp4_proc_init(void)
3300 {
3301         return register_pernet_subsys(&tcp4_net_ops);
3302 }
3303
3304 void tcp4_proc_exit(void)
3305 {
3306         unregister_pernet_subsys(&tcp4_net_ops);
3307 }
3308 #endif /* CONFIG_PROC_FS */
3309
3310 /* @wake is one when sk_stream_write_space() calls us.
3311  * In that case, report EPOLLOUT only if notsent_bytes is below half
3312  * the limit.  This mimics the strategy used in sock_def_write_space().
3313  */
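/* For example, with tcp_notsent_lowat() returning 128 kB: the plain
 * check (wake == 0) reports the stream writable while less than 128 kB
 * is queued but unsent, whereas the wake-up path (wake == 1) doubles
 * notsent_bytes and therefore signals EPOLLOUT only once the unsent
 * backlog drops below 64 kB.
 */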
3314 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3315 {
3316         const struct tcp_sock *tp = tcp_sk(sk);
3317         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3318                             READ_ONCE(tp->snd_nxt);
3319
3320         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3321 }
3322 EXPORT_SYMBOL(tcp_stream_memory_free);
3323
3324 struct proto tcp_prot = {
3325         .name                   = "TCP",
3326         .owner                  = THIS_MODULE,
3327         .close                  = tcp_close,
3328         .pre_connect            = tcp_v4_pre_connect,
3329         .connect                = tcp_v4_connect,
3330         .disconnect             = tcp_disconnect,
3331         .accept                 = inet_csk_accept,
3332         .ioctl                  = tcp_ioctl,
3333         .init                   = tcp_v4_init_sock,
3334         .destroy                = tcp_v4_destroy_sock,
3335         .shutdown               = tcp_shutdown,
3336         .setsockopt             = tcp_setsockopt,
3337         .getsockopt             = tcp_getsockopt,
3338         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3339         .keepalive              = tcp_set_keepalive,
3340         .recvmsg                = tcp_recvmsg,
3341         .sendmsg                = tcp_sendmsg,
3342         .splice_eof             = tcp_splice_eof,
3343         .backlog_rcv            = tcp_v4_do_rcv,
3344         .release_cb             = tcp_release_cb,
3345         .hash                   = inet_hash,
3346         .unhash                 = inet_unhash,
3347         .get_port               = inet_csk_get_port,
3348         .put_port               = inet_put_port,
3349 #ifdef CONFIG_BPF_SYSCALL
3350         .psock_update_sk_prot   = tcp_bpf_update_proto,
3351 #endif
3352         .enter_memory_pressure  = tcp_enter_memory_pressure,
3353         .leave_memory_pressure  = tcp_leave_memory_pressure,
3354         .stream_memory_free     = tcp_stream_memory_free,
3355         .sockets_allocated      = &tcp_sockets_allocated,
3356         .orphan_count           = &tcp_orphan_count,
3357
3358         .memory_allocated       = &tcp_memory_allocated,
3359         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3360
3361         .memory_pressure        = &tcp_memory_pressure,
3362         .sysctl_mem             = sysctl_tcp_mem,
3363         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3364         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3365         .max_header             = MAX_TCP_HEADER,
3366         .obj_size               = sizeof(struct tcp_sock),
3367         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3368         .twsk_prot              = &tcp_timewait_sock_ops,
3369         .rsk_prot               = &tcp_request_sock_ops,
3370         .h.hashinfo             = NULL,
3371         .no_autobind            = true,
3372         .diag_destroy           = tcp_abort,
3373 };
3374 EXPORT_SYMBOL(tcp_prot);
3375
3376 static void __net_exit tcp_sk_exit(struct net *net)
3377 {
3378         if (net->ipv4.tcp_congestion_control)
3379                 bpf_module_put(net->ipv4.tcp_congestion_control,
3380                                net->ipv4.tcp_congestion_control->owner);
3381 }
3382
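/* Choose the established-hash table for a new netns.  A child netns gets
 * a private ehash when the creating netns has
 * sysctl_tcp_child_ehash_entries set (rounded up to a power of two);
 * otherwise, and always for init_net, the global tcp_hashinfo is shared.
 * max_tw_buckets and max_syn_backlog are then scaled from the chosen
 * table size.
 */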
3383 static void __net_init tcp_set_hashinfo(struct net *net)
3384 {
3385         struct inet_hashinfo *hinfo;
3386         unsigned int ehash_entries;
3387         struct net *old_net;
3388
3389         if (net_eq(net, &init_net))
3390                 goto fallback;
3391
3392         old_net = current->nsproxy->net_ns;
3393         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3394         if (!ehash_entries)
3395                 goto fallback;
3396
3397         ehash_entries = roundup_pow_of_two(ehash_entries);
3398         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3399         if (!hinfo) {
3400                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3401                         "for a netns, falling back to the global one\n",
3402                         ehash_entries);
3403 fallback:
3404                 hinfo = &tcp_hashinfo;
3405                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3406         }
3407
3408         net->ipv4.tcp_death_row.hashinfo = hinfo;
3409         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3410         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3411 }
3412
3413 static int __net_init tcp_sk_init(struct net *net)
3414 {
3415         net->ipv4.sysctl_tcp_ecn = 2;
3416         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3417
3418         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3419         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3420         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3421         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3422         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3423
3424         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3425         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3426         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3427
3428         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3429         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3430         net->ipv4.sysctl_tcp_syncookies = 1;
3431         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3432         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3433         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3434         net->ipv4.sysctl_tcp_orphan_retries = 0;
3435         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3436         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3437         net->ipv4.sysctl_tcp_tw_reuse = 2;
3438         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3439
3440         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3441         tcp_set_hashinfo(net);
3442
3443         net->ipv4.sysctl_tcp_sack = 1;
3444         net->ipv4.sysctl_tcp_window_scaling = 1;
3445         net->ipv4.sysctl_tcp_timestamps = 1;
3446         net->ipv4.sysctl_tcp_early_retrans = 3;
3447         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3448         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3449         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3450         net->ipv4.sysctl_tcp_max_reordering = 300;
3451         net->ipv4.sysctl_tcp_dsack = 1;
3452         net->ipv4.sysctl_tcp_app_win = 31;
3453         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3454         net->ipv4.sysctl_tcp_frto = 2;
3455         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3456         /* This limits the percentage of the congestion window which we
3457          * will allow a single TSO frame to consume.  Building TSO frames
3458          * which are too large can cause TCP streams to be bursty.
3459          */
3460         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3461         /* Default TSQ limit of 16 TSO segments */
3462         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3463
3464         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3465         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3466
3467         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3468         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3469         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3470         net->ipv4.sysctl_tcp_autocorking = 1;
3471         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3472         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3473         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3474         if (net != &init_net) {
3475                 memcpy(net->ipv4.sysctl_tcp_rmem,
3476                        init_net.ipv4.sysctl_tcp_rmem,
3477                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3478                 memcpy(net->ipv4.sysctl_tcp_wmem,
3479                        init_net.ipv4.sysctl_tcp_wmem,
3480                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3481         }
3482         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3483         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3484         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3485         net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3486         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3487         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3488         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3489
3490         /* Set default values for PLB */
3491         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3492         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3493         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3494         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3495         /* Default congestion threshold for PLB to mark a round is 50% */
3496         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3497
3498         /* Reno is always built in */
3499         if (!net_eq(net, &init_net) &&
3500             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3501                                init_net.ipv4.tcp_congestion_control->owner))
3502                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3503         else
3504                 net->ipv4.tcp_congestion_control = &tcp_reno;
3505
3506         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3507         net->ipv4.sysctl_tcp_shrink_window = 0;
3508
3509         net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3510         net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3511
3512         return 0;
3513 }
3514
3515 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3516 {
3517         struct net *net;
3518
3519         /* Make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3520          * and the failed setup_net error unwinding path are serialized.
3521          *
3522          * Because tcp_twsk_purge() handles twsk in any dead netns, not just
3523          * those in net_exit_list, the thread that dismantles a particular twsk
3524          * must do so without another thread progressing to
3525          * refcount_dec_and_test() of tcp_death_row.tw_refcount.
3526          */
3527         mutex_lock(&tcp_exit_batch_mutex);
3528
3529         tcp_twsk_purge(net_exit_list);
3530
3531         list_for_each_entry(net, net_exit_list, exit_list) {
3532                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3533                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3534                 tcp_fastopen_ctx_destroy(net);
3535         }
3536
3537         mutex_unlock(&tcp_exit_batch_mutex);
3538 }
3539
3540 static struct pernet_operations __net_initdata tcp_sk_ops = {
3541        .init       = tcp_sk_init,
3542        .exit       = tcp_sk_exit,
3543        .exit_batch = tcp_sk_exit_batch,
3544 };
3545
3546 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3547 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3548                      struct sock_common *sk_common, uid_t uid)
3549
3550 #define INIT_BATCH_SZ 16
3551
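/* Per-iterator setup: initialise the netns-aware seq_file private data
 * and pre-allocate a batch of INIT_BATCH_SZ socket pointers; the batch
 * grows on demand in bpf_iter_tcp_batch().
 */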
3552 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3553 {
3554         struct bpf_tcp_iter_state *iter = priv_data;
3555         int err;
3556
3557         err = bpf_iter_init_seq_net(priv_data, aux);
3558         if (err)
3559                 return err;
3560
3561         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3562         if (err) {
3563                 bpf_iter_fini_seq_net(priv_data);
3564                 return err;
3565         }
3566
3567         return 0;
3568 }
3569
3570 static void bpf_iter_fini_tcp(void *priv_data)
3571 {
3572         struct bpf_tcp_iter_state *iter = priv_data;
3573
3574         bpf_iter_fini_seq_net(priv_data);
3575         kvfree(iter->batch);
3576 }
3577
3578 static const struct bpf_iter_seq_info tcp_seq_info = {
3579         .seq_ops                = &bpf_iter_tcp_seq_ops,
3580         .init_seq_private       = bpf_iter_init_tcp,
3581         .fini_seq_private       = bpf_iter_fini_tcp,
3582         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3583 };
3584
3585 static const struct bpf_func_proto *
3586 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3587                             const struct bpf_prog *prog)
3588 {
3589         switch (func_id) {
3590         case BPF_FUNC_setsockopt:
3591                 return &bpf_sk_setsockopt_proto;
3592         case BPF_FUNC_getsockopt:
3593                 return &bpf_sk_getsockopt_proto;
3594         default:
3595                 return NULL;
3596         }
3597 }
3598
3599 static struct bpf_iter_reg tcp_reg_info = {
3600         .target                 = "tcp",
3601         .ctx_arg_info_size      = 1,
3602         .ctx_arg_info           = {
3603                 { offsetof(struct bpf_iter__tcp, sk_common),
3604                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3605         },
3606         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3607         .seq_info               = &tcp_seq_info,
3608 };
3609
3610 static void __init bpf_iter_register(void)
3611 {
3612         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3613         if (bpf_iter_reg_target(&tcp_reg_info))
3614                 pr_warn("Warning: could not register bpf iterator tcp\n");
3615 }
3616
3617 #endif
3618
3619 void __init tcp_v4_init(void)
3620 {
3621         int cpu, res;
3622
3623         for_each_possible_cpu(cpu) {
3624                 struct sock *sk;
3625
3626                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3627                                            IPPROTO_TCP, &init_net);
3628                 if (res)
3629                         panic("Failed to create the TCP control socket.\n");
3630                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3631
3632                 /* Please enforce IP_DF and IPID==0 for RST and
3633                  * ACK sent in SYN-RECV and TIME-WAIT state.
3634                  */
3635                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3636
3637                 sk->sk_clockid = CLOCK_MONOTONIC;
3638
3639                 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3640         }
3641         if (register_pernet_subsys(&tcp_sk_ops))
3642                 panic("Failed to create the TCP control socket.\n");
3643
3644 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3645         bpf_iter_register();
3646 #endif
3647 }