net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
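/* Pick the initial sequence number and the timestamp offset for a connection
 * from the addresses and ports of the incoming segment, using the keyed
 * hashes in secure_tcp_seq()/secure_tcp_ts_off() so the values are hard to
 * predict off-path.
 */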
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96         return secure_tcp_seq(ip_hdr(skb)->daddr,
97                               ip_hdr(skb)->saddr,
98                               tcp_hdr(skb)->dest,
99                               tcp_hdr(skb)->source);
100 }
101
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106
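/* Decide whether an existing TIME-WAIT socket occupying the desired 4-tuple
 * may be reused for a new outgoing connection (tcp_tw_reuse). Returns 1 and
 * takes a reference on the TIME-WAIT socket if reuse is safe, 0 otherwise.
 */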
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109         const struct inet_timewait_sock *tw = inet_twsk(sktw);
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114         if (reuse == 2) {
115                 /* Still does not detect *everything* that goes through
116                  * lo, since we require a loopback src or dst address
117                  * or direct binding to 'lo' interface.
118                  */
119                 bool loopback = false;
120                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121                         loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123                 if (tw->tw_family == AF_INET6) {
124                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128                                 loopback = true;
129                 } else
130 #endif
131                 {
132                         if (ipv4_is_loopback(tw->tw_daddr) ||
133                             ipv4_is_loopback(tw->tw_rcv_saddr))
134                                 loopback = true;
135                 }
136                 if (!loopback)
137                         reuse = 0;
138         }
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's one, only timestamp cache is
145            held not per host, but per port pair and TW bucket is used as state
146            holder.
147
148            If TW bucket has been already destroyed we fall back to VJ's scheme
149            and use initial timestamp retrieved from peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (!twp || (reuse && time_after32(ktime_get_seconds(),
153                                             tcptw->tw_ts_recent_stamp)))) {
154                 /* In case of repair and re-using TIME-WAIT sockets we still
155                  * want to be sure that it is safe as above but honor the
156                  * sequence numbers and time stamps set as part of the repair
157                  * process.
158                  *
159                  * Without this check re-using a TIME-WAIT socket with TCP
160                  * repair would accumulate a -1 on the repair assigned
161                  * sequence number. The first time it is reused the sequence
162                  * is -1, the second time -2, etc. This fixes that issue
163                  * without appearing to create any others.
164                  */
165                 if (likely(!tp->repair)) {
166                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168                         if (!seq)
169                                 seq = 1;
170                         WRITE_ONCE(tp->write_seq, seq);
171                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
172                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173                 }
174                 sock_hold(sktw);
175                 return 1;
176         }
177
178         return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183                               int addr_len)
184 {
185         /* This check is replicated from tcp_v4_connect() and intended to
186          * prevent the BPF program called below from accessing bytes that are
187          * out of the bounds specified by the user in addr_len.
188          */
189         if (addr_len < sizeof(struct sockaddr_in))
190                 return -EINVAL;
191
192         sock_owned_by_me(sk);
193
194         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201         struct inet_sock *inet = inet_sk(sk);
202         struct tcp_sock *tp = tcp_sk(sk);
203         __be16 orig_sport, orig_dport;
204         __be32 daddr, nexthop;
205         struct flowi4 *fl4;
206         struct rtable *rt;
207         int err;
208         struct ip_options_rcu *inet_opt;
209         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211         if (addr_len < sizeof(struct sockaddr_in))
212                 return -EINVAL;
213
214         if (usin->sin_family != AF_INET)
215                 return -EAFNOSUPPORT;
216
217         nexthop = daddr = usin->sin_addr.s_addr;
218         inet_opt = rcu_dereference_protected(inet->inet_opt,
219                                              lockdep_sock_is_held(sk));
220         if (inet_opt && inet_opt->opt.srr) {
221                 if (!daddr)
222                         return -EINVAL;
223                 nexthop = inet_opt->opt.faddr;
224         }
225
226         orig_sport = inet->inet_sport;
227         orig_dport = usin->sin_port;
228         fl4 = &inet->cork.fl.u.ip4;
229         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231                               IPPROTO_TCP,
232                               orig_sport, orig_dport, sk);
233         if (IS_ERR(rt)) {
234                 err = PTR_ERR(rt);
235                 if (err == -ENETUNREACH)
236                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237                 return err;
238         }
239
240         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241                 ip_rt_put(rt);
242                 return -ENETUNREACH;
243         }
244
245         if (!inet_opt || !inet_opt->opt.srr)
246                 daddr = fl4->daddr;
247
248         if (!inet->inet_saddr)
249                 inet->inet_saddr = fl4->saddr;
250         sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253                 /* Reset inherited state */
254                 tp->rx_opt.ts_recent       = 0;
255                 tp->rx_opt.ts_recent_stamp = 0;
256                 if (likely(!tp->repair))
257                         WRITE_ONCE(tp->write_seq, 0);
258         }
259
260         inet->inet_dport = usin->sin_port;
261         sk_daddr_set(sk, daddr);
262
263         inet_csk(sk)->icsk_ext_hdr_len = 0;
264         if (inet_opt)
265                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269         /* Socket identity is still unknown (sport may be zero).
270          * However we set state to SYN-SENT and, without releasing the socket
271          * lock, select a source port, enter ourselves into the hash tables and
272          * complete initialization after this.
273          */
274         tcp_set_state(sk, TCP_SYN_SENT);
275         err = inet_hash_connect(tcp_death_row, sk);
276         if (err)
277                 goto failure;
278
279         sk_set_txhash(sk);
280
281         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282                                inet->inet_sport, inet->inet_dport, sk);
283         if (IS_ERR(rt)) {
284                 err = PTR_ERR(rt);
285                 rt = NULL;
286                 goto failure;
287         }
288         /* OK, now commit destination to socket.  */
289         sk->sk_gso_type = SKB_GSO_TCPV4;
290         sk_setup_caps(sk, &rt->dst);
291         rt = NULL;
292
293         if (likely(!tp->repair)) {
294                 if (!tp->write_seq)
295                         WRITE_ONCE(tp->write_seq,
296                                    secure_tcp_seq(inet->inet_saddr,
297                                                   inet->inet_daddr,
298                                                   inet->inet_sport,
299                                                   usin->sin_port));
300                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301                                                  inet->inet_saddr,
302                                                  inet->inet_daddr);
303         }
304
305         inet->inet_id = prandom_u32();
306
307         if (tcp_fastopen_defer_connect(sk, &err))
308                 return err;
309         if (err)
310                 goto failure;
311
312         err = tcp_connect(sk);
313
314         if (err)
315                 goto failure;
316
317         return 0;
318
319 failure:
320         /*
321          * This unhashes the socket and releases the local port,
322          * if necessary.
323          */
324         tcp_set_state(sk, TCP_CLOSE);
325         ip_rt_put(rt);
326         sk->sk_route_caps = 0;
327         inet->inet_dport = 0;
328         return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if socket was owned by user
335  * at the time tcp_v4_err() was called to handle ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339         struct inet_sock *inet = inet_sk(sk);
340         struct dst_entry *dst;
341         u32 mtu;
342
343         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344                 return;
345         mtu = tcp_sk(sk)->mtu_info;
346         dst = inet_csk_update_pmtu(sk, mtu);
347         if (!dst)
348                 return;
349
350         /* Something is about to go wrong... Remember the soft error
351          * in case this connection is not able to recover.
352          */
353         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354                 sk->sk_err_soft = EMSGSIZE;
355
356         mtu = dst_mtu(dst);
357
358         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359             ip_sk_accept_pmtu(sk) &&
360             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361                 tcp_sync_mss(sk, mtu);
362
363                 /* Resend the TCP packet because it's
364                  * clear that the old packet has been
365                  * dropped. This is the new "fast" path mtu
366                  * discovery.
367                  */
368                 tcp_simple_retransmit(sk);
369         } /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
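/* Apply an ICMP redirect to the socket's cached route, provided the cached
 * dst entry is still valid.
 */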
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375         struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377         if (dst)
378                 dst->ops->redirect(dst, sk, skb);
379 }
380
381
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385         struct request_sock *req = inet_reqsk(sk);
386         struct net *net = sock_net(sk);
387
388         /* ICMPs are not backlogged, hence we cannot get
389          * an established socket here.
390          */
391         if (seq != tcp_rsk(req)->snt_isn) {
392                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393         } else if (abort) {
394                 /*
395                  * Still in SYN_RECV, just remove it silently.
396                  * There is no good way to pass the error to the newly
397                  * created socket, and POSIX does not want network
398                  * errors returned from accept().
399                  */
400                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401                 tcp_listendrop(req->rsk_listener);
402         }
403         reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410         struct inet_connection_sock *icsk = inet_csk(sk);
411         struct tcp_sock *tp = tcp_sk(sk);
412         struct sk_buff *skb;
413         s32 remaining;
414         u32 delta_us;
415
416         if (sock_owned_by_user(sk))
417                 return;
418
419         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420             !icsk->icsk_backoff)
421                 return;
422
423         skb = tcp_rtx_queue_head(sk);
424         if (WARN_ON_ONCE(!skb))
425                 return;
426
427         icsk->icsk_backoff--;
428         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431         tcp_mstamp_refresh(tp);
432         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435         if (remaining > 0) {
436                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437                                           remaining, TCP_RTO_MAX);
438         } else {
439                 /* RTO revert clocked out retransmission.
440                  * Will retransmit now.
441                  */
442                 tcp_retransmit_timer(sk);
443         }
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465         const struct iphdr *iph = (const struct iphdr *)skb->data;
466         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467         struct tcp_sock *tp;
468         struct inet_sock *inet;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct sock *sk;
472         struct request_sock *fastopen;
473         u32 seq, snd_una;
474         int err;
475         struct net *net = dev_net(skb->dev);
476
477         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478                                        th->dest, iph->saddr, ntohs(th->source),
479                                        inet_iif(skb), 0);
480         if (!sk) {
481                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482                 return -ENOENT;
483         }
484         if (sk->sk_state == TCP_TIME_WAIT) {
485                 inet_twsk_put(inet_twsk(sk));
486                 return 0;
487         }
488         seq = ntohl(th->seq);
489         if (sk->sk_state == TCP_NEW_SYN_RECV) {
490                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491                                      type == ICMP_TIME_EXCEEDED ||
492                                      (type == ICMP_DEST_UNREACH &&
493                                       (code == ICMP_NET_UNREACH ||
494                                        code == ICMP_HOST_UNREACH)));
495                 return 0;
496         }
497
498         bh_lock_sock(sk);
499         /* If too many ICMPs get dropped on busy
500          * servers this needs to be solved differently.
501          * We do take care of the PMTU discovery (RFC1191) special case:
502          * we can receive locally generated ICMP messages while socket is held.
503          */
504         if (sock_owned_by_user(sk)) {
505                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507         }
508         if (sk->sk_state == TCP_CLOSE)
509                 goto out;
510
511         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513                 goto out;
514         }
515
516         tp = tcp_sk(sk);
517         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518         fastopen = rcu_dereference(tp->fastopen_rsk);
519         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520         if (sk->sk_state != TCP_LISTEN &&
521             !between(seq, snd_una, tp->snd_nxt)) {
522                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523                 goto out;
524         }
525
526         switch (type) {
527         case ICMP_REDIRECT:
528                 if (!sock_owned_by_user(sk))
529                         do_redirect(skb, sk);
530                 goto out;
531         case ICMP_SOURCE_QUENCH:
532                 /* Just silently ignore these. */
533                 goto out;
534         case ICMP_PARAMETERPROB:
535                 err = EPROTO;
536                 break;
537         case ICMP_DEST_UNREACH:
538                 if (code > NR_ICMP_UNREACH)
539                         goto out;
540
541                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542                         /* We are not interested in TCP_LISTEN and open_requests
543                          * (SYN-ACKs sent out by Linux are always <576 bytes so
544                          * they should go through unfragmented).
545                          */
546                         if (sk->sk_state == TCP_LISTEN)
547                                 goto out;
548
549                         tp->mtu_info = info;
550                         if (!sock_owned_by_user(sk)) {
551                                 tcp_v4_mtu_reduced(sk);
552                         } else {
553                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554                                         sock_hold(sk);
555                         }
556                         goto out;
557                 }
558
559                 err = icmp_err_convert[code].errno;
560                 /* check if this ICMP message allows revert of backoff.
561                  * (see RFC 6069)
562                  */
563                 if (!fastopen &&
564                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565                         tcp_ld_RTO_revert(sk, seq);
566                 break;
567         case ICMP_TIME_EXCEEDED:
568                 err = EHOSTUNREACH;
569                 break;
570         default:
571                 goto out;
572         }
573
574         switch (sk->sk_state) {
575         case TCP_SYN_SENT:
576         case TCP_SYN_RECV:
577                 /* Only in fast or simultaneous open. If a fast open socket is
578                  * already accepted it is treated as a connected one below.
579                  */
580                 if (fastopen && !fastopen->sk)
581                         break;
582
583                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585                 if (!sock_owned_by_user(sk)) {
586                         sk->sk_err = err;
587
588                         sk->sk_error_report(sk);
589
590                         tcp_done(sk);
591                 } else {
592                         sk->sk_err_soft = err;
593                 }
594                 goto out;
595         }
596
597         /* If we've already connected we will keep trying
598          * until we time out, or the user gives up.
599          *
600          * rfc1122 4.2.3.9 allows considering as hard errors
601          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602          * but it is obsoleted by pmtu discovery).
603          *
604          * Note that in the modern internet, where routing is unreliable
605          * and broken firewalls sit in every dark corner sending random
606          * errors ordered by their masters, even these two messages finally
607          * lose their original sense (even Linux sends invalid PORT_UNREACHs)
608          *
609          * Now we are in compliance with RFCs.
610          *                                                      --ANK (980905)
611          */
612
613         inet = inet_sk(sk);
614         if (!sock_owned_by_user(sk) && inet->recverr) {
615                 sk->sk_err = err;
616                 sk->sk_error_report(sk);
617         } else  { /* Only an error on timeout */
618                 sk->sk_err_soft = err;
619         }
620
621 out:
622         bh_unlock_sock(sk);
623         sock_put(sk);
624         return 0;
625 }
626
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629         struct tcphdr *th = tcp_hdr(skb);
630
631         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632         skb->csum_start = skb_transport_header(skb) - skb->head;
633         skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639         const struct inet_sock *inet = inet_sk(sk);
640
641         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644
645 /*
646  *      This routine will send an RST to the other tcp.
647  *
648  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
649  *                    for the reset?
650  *      Answer: if a packet caused an RST, it is not for a socket
651  *              existing in our system; if it matched a socket,
652  *              it is just a duplicate segment or a bug in the other side's TCP.
653  *              So we build the reply based only on the parameters
654  *              that arrived with the segment.
655  *      Exception: precedence violation. We do not implement it in any case.
656  */
657
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660         const struct tcphdr *th = tcp_hdr(skb);
661         struct {
662                 struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666         } rep;
667         struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669         struct tcp_md5sig_key *key = NULL;
670         const __u8 *hash_location = NULL;
671         unsigned char newhash[16];
672         int genhash;
673         struct sock *sk1 = NULL;
674 #endif
675         u64 transmit_time = 0;
676         struct sock *ctl_sk;
677         struct net *net;
678
679         /* Never send a reset in response to a reset. */
680         if (th->rst)
681                 return;
682
683         /* If sk not NULL, it means we did a successful lookup and incoming
684          * route had to be correct. prequeue might have dropped our dst.
685          */
686         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687                 return;
688
689         /* Swap the send and the receive. */
690         memset(&rep, 0, sizeof(rep));
691         rep.th.dest   = th->source;
692         rep.th.source = th->dest;
693         rep.th.doff   = sizeof(struct tcphdr) / 4;
694         rep.th.rst    = 1;
695
696         if (th->ack) {
697                 rep.th.seq = th->ack_seq;
698         } else {
699                 rep.th.ack = 1;
700                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701                                        skb->len - (th->doff << 2));
702         }
703
704         memset(&arg, 0, sizeof(arg));
705         arg.iov[0].iov_base = (unsigned char *)&rep;
706         arg.iov[0].iov_len  = sizeof(rep.th);
707
708         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710         rcu_read_lock();
711         hash_location = tcp_parse_md5sig_option(th);
712         if (sk && sk_fullsock(sk)) {
713                 const union tcp_md5_addr *addr;
714                 int l3index;
715
716                 /* sdif set, means packet ingressed via a device
717                  * in an L3 domain and inet_iif is set to it.
718                  */
719                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722         } else if (hash_location) {
723                 const union tcp_md5_addr *addr;
724                 int sdif = tcp_v4_sdif(skb);
725                 int dif = inet_iif(skb);
726                 int l3index;
727
728                 /*
729                  * The active side is lost. Try to find the listening socket
730                  * through the source port, and then find the md5 key through
731                  * the listening socket. We do not lose security here:
732                  * the incoming packet is checked against the md5 hash of the
733                  * found key, and no RST is generated if the hash doesn't match.
734                  */
735                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736                                              ip_hdr(skb)->saddr,
737                                              th->source, ip_hdr(skb)->daddr,
738                                              ntohs(th->source), dif, sdif);
739                 /* don't send rst if it can't find key */
740                 if (!sk1)
741                         goto out;
742
743                 /* sdif set, means packet ingressed via a device
744                  * in an L3 domain and dif is set to it.
745                  */
746                 l3index = sdif ? dif : 0;
747                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749                 if (!key)
750                         goto out;
751
752
753                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
755                         goto out;
756
757         }
758
759         if (key) {
760                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761                                    (TCPOPT_NOP << 16) |
762                                    (TCPOPT_MD5SIG << 8) |
763                                    TCPOLEN_MD5SIG);
764                 /* Update length and the length the header thinks exists */
765                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766                 rep.th.doff = arg.iov[0].iov_len / 4;
767
768                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769                                      key, ip_hdr(skb)->saddr,
770                                      ip_hdr(skb)->daddr, &rep.th);
771         }
772 #endif
773         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774                                       ip_hdr(skb)->saddr, /* XXX */
775                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
776         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778
779         /* When socket is gone, all binding information is lost.
780          * Routing might fail in this case. No choice here: if we choose to force
781          * the input interface, we will misroute in case of an asymmetric route.
782          */
783         if (sk) {
784                 arg.bound_dev_if = sk->sk_bound_dev_if;
785                 if (sk_fullsock(sk))
786                         trace_tcp_send_reset(sk, skb);
787         }
788
789         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791
792         arg.tos = ip_hdr(skb)->tos;
793         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794         local_bh_disable();
795         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796         if (sk) {
797                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
799                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
801                 transmit_time = tcp_transmit_time(sk);
802         }
803         ip_send_unicast_reply(ctl_sk,
804                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
805                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806                               &arg, arg.iov[0].iov_len,
807                               transmit_time);
808
809         ctl_sk->sk_mark = 0;
810         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812         local_bh_enable();
813
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816         rcu_read_unlock();
817 #endif
818 }
819
820 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
821    outside socket context, is certainly ugly. What can I do?
822  */
823
824 static void tcp_v4_send_ack(const struct sock *sk,
825                             struct sk_buff *skb, u32 seq, u32 ack,
826                             u32 win, u32 tsval, u32 tsecr, int oif,
827                             struct tcp_md5sig_key *key,
828                             int reply_flags, u8 tos)
829 {
830         const struct tcphdr *th = tcp_hdr(skb);
831         struct {
832                 struct tcphdr th;
833                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837                         ];
838         } rep;
839         struct net *net = sock_net(sk);
840         struct ip_reply_arg arg;
841         struct sock *ctl_sk;
842         u64 transmit_time;
843
844         memset(&rep.th, 0, sizeof(struct tcphdr));
845         memset(&arg, 0, sizeof(arg));
846
847         arg.iov[0].iov_base = (unsigned char *)&rep;
848         arg.iov[0].iov_len  = sizeof(rep.th);
849         if (tsecr) {
850                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851                                    (TCPOPT_TIMESTAMP << 8) |
852                                    TCPOLEN_TIMESTAMP);
853                 rep.opt[1] = htonl(tsval);
854                 rep.opt[2] = htonl(tsecr);
855                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856         }
857
858         /* Swap the send and the receive. */
859         rep.th.dest    = th->source;
860         rep.th.source  = th->dest;
861         rep.th.doff    = arg.iov[0].iov_len / 4;
862         rep.th.seq     = htonl(seq);
863         rep.th.ack_seq = htonl(ack);
864         rep.th.ack     = 1;
865         rep.th.window  = htons(win);
866
867 #ifdef CONFIG_TCP_MD5SIG
868         if (key) {
869                 int offset = (tsecr) ? 3 : 0;
870
871                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872                                           (TCPOPT_NOP << 16) |
873                                           (TCPOPT_MD5SIG << 8) |
874                                           TCPOLEN_MD5SIG);
875                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876                 rep.th.doff = arg.iov[0].iov_len/4;
877
878                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879                                     key, ip_hdr(skb)->saddr,
880                                     ip_hdr(skb)->daddr, &rep.th);
881         }
882 #endif
883         arg.flags = reply_flags;
884         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885                                       ip_hdr(skb)->saddr, /* XXX */
886                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
887         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888         if (oif)
889                 arg.bound_dev_if = oif;
890         arg.tos = tos;
891         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892         local_bh_disable();
893         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895                            inet_twsk(sk)->tw_mark : sk->sk_mark;
896         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897                            inet_twsk(sk)->tw_priority : sk->sk_priority;
898         transmit_time = tcp_transmit_time(sk);
899         ip_send_unicast_reply(ctl_sk,
900                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
901                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902                               &arg, arg.iov[0].iov_len,
903                               transmit_time);
904
905         ctl_sk->sk_mark = 0;
906         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907         local_bh_enable();
908 }
909
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912         struct inet_timewait_sock *tw = inet_twsk(sk);
913         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914
915         tcp_v4_send_ack(sk, skb,
916                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919                         tcptw->tw_ts_recent,
920                         tw->tw_bound_dev_if,
921                         tcp_twsk_md5_key(tcptw),
922                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923                         tw->tw_tos
924                         );
925
926         inet_twsk_put(tw);
927 }
928
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930                                   struct request_sock *req)
931 {
932         const union tcp_md5_addr *addr;
933         int l3index;
934
935         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937          */
938         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939                                              tcp_sk(sk)->snd_nxt;
940
941         /* RFC 7323 2.3
942          * The window field (SEG.WND) of every outgoing segment, with the
943          * exception of <SYN> segments, MUST be right-shifted by
944          * Rcv.Wind.Shift bits:
945          */
946         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948         tcp_v4_send_ack(sk, skb, seq,
949                         tcp_rsk(req)->rcv_nxt,
950                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952                         req->ts_recent,
953                         0,
954                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956                         ip_hdr(skb)->tos);
957 }
958
959 /*
960  *      Send a SYN-ACK after having received a SYN.
961  *      This still operates on a request_sock only, not on a big
962  *      socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965                               struct flowi *fl,
966                               struct request_sock *req,
967                               struct tcp_fastopen_cookie *foc,
968                               enum tcp_synack_type synack_type,
969                               struct sk_buff *syn_skb)
970 {
971         const struct inet_request_sock *ireq = inet_rsk(req);
972         struct flowi4 fl4;
973         int err = -1;
974         struct sk_buff *skb;
975         u8 tos;
976
977         /* First, grab a route. */
978         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979                 return -1;
980
981         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982
983         tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
984                         tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
985
986         if (skb) {
987                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
988
989                 rcu_read_lock();
990                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
991                                             ireq->ir_rmt_addr,
992                                             rcu_dereference(ireq->ireq_opt),
993                                             tos & ~INET_ECN_MASK);
994                 rcu_read_unlock();
995                 err = net_xmit_eval(err);
996         }
997
998         return err;
999 }
1000
1001 /*
1002  *      IPv4 request_sock destructor.
1003  */
1004 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1005 {
1006         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1007 }
1008
1009 #ifdef CONFIG_TCP_MD5SIG
1010 /*
1011  * RFC2385 MD5 checksumming requires a mapping of
1012  * IP address->MD5 Key.
1013  * We need to maintain these in the sk structure.
1014  */
1015
1016 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1017 EXPORT_SYMBOL(tcp_md5_needed);
1018
1019 /* Find the Key structure for an address.  */
1020 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1021                                            const union tcp_md5_addr *addr,
1022                                            int family)
1023 {
1024         const struct tcp_sock *tp = tcp_sk(sk);
1025         struct tcp_md5sig_key *key;
1026         const struct tcp_md5sig_info *md5sig;
1027         __be32 mask;
1028         struct tcp_md5sig_key *best_match = NULL;
1029         bool match;
1030
1031         /* caller either holds rcu_read_lock() or socket lock */
1032         md5sig = rcu_dereference_check(tp->md5sig_info,
1033                                        lockdep_sock_is_held(sk));
1034         if (!md5sig)
1035                 return NULL;
1036
1037         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1038                                  lockdep_sock_is_held(sk)) {
1039                 if (key->family != family)
1040                         continue;
1041                 if (key->l3index && key->l3index != l3index)
1042                         continue;
1043                 if (family == AF_INET) {
1044                         mask = inet_make_mask(key->prefixlen);
1045                         match = (key->addr.a4.s_addr & mask) ==
1046                                 (addr->a4.s_addr & mask);
1047 #if IS_ENABLED(CONFIG_IPV6)
1048                 } else if (family == AF_INET6) {
1049                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1050                                                   key->prefixlen);
1051 #endif
1052                 } else {
1053                         match = false;
1054                 }
1055
1056                 if (match && (!best_match ||
1057                               key->prefixlen > best_match->prefixlen))
1058                         best_match = key;
1059         }
1060         return best_match;
1061 }
1062 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1063
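/* Unlike __tcp_md5_do_lookup(), which returns the longest-prefix match,
 * this helper looks for a key whose address and prefix length match exactly;
 * it is used by tcp_md5_do_add() and tcp_md5_do_del().
 */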
1064 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1065                                                       const union tcp_md5_addr *addr,
1066                                                       int family, u8 prefixlen,
1067                                                       int l3index)
1068 {
1069         const struct tcp_sock *tp = tcp_sk(sk);
1070         struct tcp_md5sig_key *key;
1071         unsigned int size = sizeof(struct in_addr);
1072         const struct tcp_md5sig_info *md5sig;
1073
1074         /* caller either holds rcu_read_lock() or socket lock */
1075         md5sig = rcu_dereference_check(tp->md5sig_info,
1076                                        lockdep_sock_is_held(sk));
1077         if (!md5sig)
1078                 return NULL;
1079 #if IS_ENABLED(CONFIG_IPV6)
1080         if (family == AF_INET6)
1081                 size = sizeof(struct in6_addr);
1082 #endif
1083         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1084                                  lockdep_sock_is_held(sk)) {
1085                 if (key->family != family)
1086                         continue;
1087                 if (key->l3index && key->l3index != l3index)
1088                         continue;
1089                 if (!memcmp(&key->addr, addr, size) &&
1090                     key->prefixlen == prefixlen)
1091                         return key;
1092         }
1093         return NULL;
1094 }
1095
1096 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1097                                          const struct sock *addr_sk)
1098 {
1099         const union tcp_md5_addr *addr;
1100         int l3index;
1101
1102         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1103                                                  addr_sk->sk_bound_dev_if);
1104         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1105         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1106 }
1107 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1108
1109 /* This can be called on a newly created socket, from other files */
1110 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1111                    int family, u8 prefixlen, int l3index,
1112                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1113 {
1114         /* Add Key to the list */
1115         struct tcp_md5sig_key *key;
1116         struct tcp_sock *tp = tcp_sk(sk);
1117         struct tcp_md5sig_info *md5sig;
1118
1119         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1120         if (key) {
1121                 /* Pre-existing entry - just update that one.
1122                  * Note that the key might be used concurrently.
1123                  * data_race() is telling kcsan that we do not care about
1124                  * key mismatches, since changing the MD5 key on live flows
1125                  * can lead to packet drops.
1126                  */
1127                 data_race(memcpy(key->key, newkey, newkeylen));
1128
1129                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1130                  * Also note that a reader could catch new key->keylen value
1131                  * but old key->key[], this is the reason we use __GFP_ZERO
1132                  * at sock_kmalloc() time below these lines.
1133                  */
1134                 WRITE_ONCE(key->keylen, newkeylen);
1135
1136                 return 0;
1137         }
1138
1139         md5sig = rcu_dereference_protected(tp->md5sig_info,
1140                                            lockdep_sock_is_held(sk));
1141         if (!md5sig) {
1142                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1143                 if (!md5sig)
1144                         return -ENOMEM;
1145
1146                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1147                 INIT_HLIST_HEAD(&md5sig->head);
1148                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1149         }
1150
1151         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1152         if (!key)
1153                 return -ENOMEM;
1154         if (!tcp_alloc_md5sig_pool()) {
1155                 sock_kfree_s(sk, key, sizeof(*key));
1156                 return -ENOMEM;
1157         }
1158
1159         memcpy(key->key, newkey, newkeylen);
1160         key->keylen = newkeylen;
1161         key->family = family;
1162         key->prefixlen = prefixlen;
1163         key->l3index = l3index;
1164         memcpy(&key->addr, addr,
1165                (family == AF_INET6) ? sizeof(struct in6_addr) :
1166                                       sizeof(struct in_addr));
1167         hlist_add_head_rcu(&key->node, &md5sig->head);
1168         return 0;
1169 }
1170 EXPORT_SYMBOL(tcp_md5_do_add);
1171
1172 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1173                    u8 prefixlen, int l3index)
1174 {
1175         struct tcp_md5sig_key *key;
1176
1177         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1178         if (!key)
1179                 return -ENOENT;
1180         hlist_del_rcu(&key->node);
1181         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1182         kfree_rcu(key, rcu);
1183         return 0;
1184 }
1185 EXPORT_SYMBOL(tcp_md5_do_del);
1186
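/* Free every MD5 key attached to the socket; the unconditional
 * rcu_dereference_protected(..., 1) implies no concurrent users remain.
 */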
1187 static void tcp_clear_md5_list(struct sock *sk)
1188 {
1189         struct tcp_sock *tp = tcp_sk(sk);
1190         struct tcp_md5sig_key *key;
1191         struct hlist_node *n;
1192         struct tcp_md5sig_info *md5sig;
1193
1194         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1195
1196         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1197                 hlist_del_rcu(&key->node);
1198                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1199                 kfree_rcu(key, rcu);
1200         }
1201 }
1202
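/* Handle the TCP_MD5SIG / TCP_MD5SIG_EXT setsockopt: copy the tcp_md5sig
 * request from userspace, validate it, and add, update or delete the
 * matching key.
 */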
1203 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1204                                  sockptr_t optval, int optlen)
1205 {
1206         struct tcp_md5sig cmd;
1207         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1208         const union tcp_md5_addr *addr;
1209         u8 prefixlen = 32;
1210         int l3index = 0;
1211
1212         if (optlen < sizeof(cmd))
1213                 return -EINVAL;
1214
1215         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1216                 return -EFAULT;
1217
1218         if (sin->sin_family != AF_INET)
1219                 return -EINVAL;
1220
1221         if (optname == TCP_MD5SIG_EXT &&
1222             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1223                 prefixlen = cmd.tcpm_prefixlen;
1224                 if (prefixlen > 32)
1225                         return -EINVAL;
1226         }
1227
1228         if (optname == TCP_MD5SIG_EXT &&
1229             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1230                 struct net_device *dev;
1231
1232                 rcu_read_lock();
1233                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1234                 if (dev && netif_is_l3_master(dev))
1235                         l3index = dev->ifindex;
1236
1237                 rcu_read_unlock();
1238
1239                 /* ok to reference set/not set outside of rcu;
1240                  * right now device MUST be an L3 master
1241                  */
1242                 if (!dev || !l3index)
1243                         return -EINVAL;
1244         }
1245
1246         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1247
1248         if (!cmd.tcpm_keylen)
1249                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1250
1251         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1252                 return -EINVAL;
1253
1254         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1255                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1256 }
1257
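/* Hash the IPv4 pseudo-header followed by the TCP header (with its checksum
 * field zeroed), as required by RFC 2385.
 */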
1258 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1259                                    __be32 daddr, __be32 saddr,
1260                                    const struct tcphdr *th, int nbytes)
1261 {
1262         struct tcp4_pseudohdr *bp;
1263         struct scatterlist sg;
1264         struct tcphdr *_th;
1265
1266         bp = hp->scratch;
1267         bp->saddr = saddr;
1268         bp->daddr = daddr;
1269         bp->pad = 0;
1270         bp->protocol = IPPROTO_TCP;
1271         bp->len = cpu_to_be16(nbytes);
1272
1273         _th = (struct tcphdr *)(bp + 1);
1274         memcpy(_th, th, sizeof(*th));
1275         _th->check = 0;
1276
1277         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1278         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1279                                 sizeof(*bp) + sizeof(*th));
1280         return crypto_ahash_update(hp->md5_req);
1281 }
1282
1283 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1284                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1285 {
1286         struct tcp_md5sig_pool *hp;
1287         struct ahash_request *req;
1288
1289         hp = tcp_get_md5sig_pool();
1290         if (!hp)
1291                 goto clear_hash_noput;
1292         req = hp->md5_req;
1293
1294         if (crypto_ahash_init(req))
1295                 goto clear_hash;
1296         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1297                 goto clear_hash;
1298         if (tcp_md5_hash_key(hp, key))
1299                 goto clear_hash;
1300         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1301         if (crypto_ahash_final(req))
1302                 goto clear_hash;
1303
1304         tcp_put_md5sig_pool();
1305         return 0;
1306
1307 clear_hash:
1308         tcp_put_md5sig_pool();
1309 clear_hash_noput:
1310         memset(md5_hash, 0, 16);
1311         return 1;
1312 }
1313
1314 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1315                         const struct sock *sk,
1316                         const struct sk_buff *skb)
1317 {
1318         struct tcp_md5sig_pool *hp;
1319         struct ahash_request *req;
1320         const struct tcphdr *th = tcp_hdr(skb);
1321         __be32 saddr, daddr;
1322
1323         if (sk) { /* valid for establish/request sockets */
1324                 saddr = sk->sk_rcv_saddr;
1325                 daddr = sk->sk_daddr;
1326         } else {
1327                 const struct iphdr *iph = ip_hdr(skb);
1328                 saddr = iph->saddr;
1329                 daddr = iph->daddr;
1330         }
1331
1332         hp = tcp_get_md5sig_pool();
1333         if (!hp)
1334                 goto clear_hash_noput;
1335         req = hp->md5_req;
1336
1337         if (crypto_ahash_init(req))
1338                 goto clear_hash;
1339
1340         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1341                 goto clear_hash;
1342         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1343                 goto clear_hash;
1344         if (tcp_md5_hash_key(hp, key))
1345                 goto clear_hash;
1346         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1347         if (crypto_ahash_final(req))
1348                 goto clear_hash;
1349
1350         tcp_put_md5sig_pool();
1351         return 0;
1352
1353 clear_hash:
1354         tcp_put_md5sig_pool();
1355 clear_hash_noput:
1356         memset(md5_hash, 0, 16);
1357         return 1;
1358 }
1359 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1360
1361 #endif
1362
1363 /* Called with rcu_read_lock() */
1364 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1365                                     const struct sk_buff *skb,
1366                                     int dif, int sdif)
1367 {
1368 #ifdef CONFIG_TCP_MD5SIG
1369         /*
1370          * This gets called for each TCP segment that arrives
1371          * so we want to be efficient.
1372          * We have 3 drop cases:
1373          * o No MD5 hash and one expected.
1374          * o MD5 hash and we're not expecting one.
1375          * o MD5 hash and it's wrong.
1376          */
1377         const __u8 *hash_location = NULL;
1378         struct tcp_md5sig_key *hash_expected;
1379         const struct iphdr *iph = ip_hdr(skb);
1380         const struct tcphdr *th = tcp_hdr(skb);
1381         const union tcp_md5_addr *addr;
1382         unsigned char newhash[16];
1383         int genhash, l3index;
1384
1385         /* sdif set, means packet ingressed via a device
1386          * in an L3 domain and dif is set to the l3mdev
1387          */
1388         l3index = sdif ? dif : 0;
1389
1390         addr = (union tcp_md5_addr *)&iph->saddr;
1391         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1392         hash_location = tcp_parse_md5sig_option(th);
1393
1394         /* We've parsed the options - do we have a hash? */
1395         if (!hash_expected && !hash_location)
1396                 return false;
1397
1398         if (hash_expected && !hash_location) {
1399                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1400                 return true;
1401         }
1402
1403         if (!hash_expected && hash_location) {
1404                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1405                 return true;
1406         }
1407
1408         /* Both hash_expected and hash_location are present,
1409          * so we need to compute the MD5 hash and compare.
1410          */
1411         genhash = tcp_v4_md5_hash_skb(newhash,
1412                                       hash_expected,
1413                                       NULL, skb);
1414
1415         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1416                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1417                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1418                                      &iph->saddr, ntohs(th->source),
1419                                      &iph->daddr, ntohs(th->dest),
1420                                      genhash ? " tcp_v4_calc_md5_hash failed"
1421                                      : "", l3index);
1422                 return true;
1423         }
1424         return false;
1425 #endif
1426         return false;
1427 }
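
The drop logic above only has a key to compare against when userspace has configured one for the peer; keys are installed per remote address with the TCP_MD5SIG socket option. A minimal userspace sketch, assuming the uapi struct tcp_md5sig layout from <linux/tcp.h> (tcpm_addr, tcpm_keylen, tcpm_key); illustrative only:

#include <string.h>
#include <arpa/inet.h>
#include <linux/tcp.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Install an RFC 2385 MD5 key for segments exchanged with 'peer' (IPv4).
 * Keys must not exceed TCP_MD5SIG_MAXKEYLEN bytes. */
static int install_tcp_md5_key(int fd, const char *peer, const char *key)
{
        struct tcp_md5sig sig;
        struct sockaddr_in *addr = (struct sockaddr_in *)&sig.tcpm_addr;

        memset(&sig, 0, sizeof(sig));
        addr->sin_family = AF_INET;
        inet_pton(AF_INET, peer, &addr->sin_addr);
        sig.tcpm_keylen = strlen(key);
        memcpy(sig.tcpm_key, key, sig.tcpm_keylen);

        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &sig, sizeof(sig));
}

Once both ends have a matching key configured, every segment must carry a valid MD5 option or it falls into one of the drop cases listed above.
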
1428
1429 static void tcp_v4_init_req(struct request_sock *req,
1430                             const struct sock *sk_listener,
1431                             struct sk_buff *skb)
1432 {
1433         struct inet_request_sock *ireq = inet_rsk(req);
1434         struct net *net = sock_net(sk_listener);
1435
1436         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1437         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1438         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1439 }
1440
1441 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1442                                           struct flowi *fl,
1443                                           const struct request_sock *req)
1444 {
1445         return inet_csk_route_req(sk, &fl->u.ip4, req);
1446 }
1447
1448 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1449         .family         =       PF_INET,
1450         .obj_size       =       sizeof(struct tcp_request_sock),
1451         .rtx_syn_ack    =       tcp_rtx_synack,
1452         .send_ack       =       tcp_v4_reqsk_send_ack,
1453         .destructor     =       tcp_v4_reqsk_destructor,
1454         .send_reset     =       tcp_v4_send_reset,
1455         .syn_ack_timeout =      tcp_syn_ack_timeout,
1456 };
1457
1458 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1459         .mss_clamp      =       TCP_MSS_DEFAULT,
1460 #ifdef CONFIG_TCP_MD5SIG
1461         .req_md5_lookup =       tcp_v4_md5_lookup,
1462         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1463 #endif
1464         .init_req       =       tcp_v4_init_req,
1465 #ifdef CONFIG_SYN_COOKIES
1466         .cookie_init_seq =      cookie_v4_init_sequence,
1467 #endif
1468         .route_req      =       tcp_v4_route_req,
1469         .init_seq       =       tcp_v4_init_seq,
1470         .init_ts_off    =       tcp_v4_init_ts_off,
1471         .send_synack    =       tcp_v4_send_synack,
1472 };
1473
1474 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1475 {
1476         /* Never answer SYNs sent to broadcast or multicast addresses */
1477         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1478                 goto drop;
1479
1480         return tcp_conn_request(&tcp_request_sock_ops,
1481                                 &tcp_request_sock_ipv4_ops, sk, skb);
1482
1483 drop:
1484         tcp_listendrop(sk);
1485         return 0;
1486 }
1487 EXPORT_SYMBOL(tcp_v4_conn_request);
1488
1489
1490 /*
1491  * The three way handshake has completed - we got a valid synack -
1492  * now create the new socket.
1493  */
1494 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1495                                   struct request_sock *req,
1496                                   struct dst_entry *dst,
1497                                   struct request_sock *req_unhash,
1498                                   bool *own_req)
1499 {
1500         struct inet_request_sock *ireq;
1501         struct inet_sock *newinet;
1502         struct tcp_sock *newtp;
1503         struct sock *newsk;
1504 #ifdef CONFIG_TCP_MD5SIG
1505         const union tcp_md5_addr *addr;
1506         struct tcp_md5sig_key *key;
1507         int l3index;
1508 #endif
1509         struct ip_options_rcu *inet_opt;
1510
1511         if (sk_acceptq_is_full(sk))
1512                 goto exit_overflow;
1513
1514         newsk = tcp_create_openreq_child(sk, req, skb);
1515         if (!newsk)
1516                 goto exit_nonewsk;
1517
1518         newsk->sk_gso_type = SKB_GSO_TCPV4;
1519         inet_sk_rx_dst_set(newsk, skb);
1520
1521         newtp                 = tcp_sk(newsk);
1522         newinet               = inet_sk(newsk);
1523         ireq                  = inet_rsk(req);
1524         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1525         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1526         newsk->sk_bound_dev_if = ireq->ir_iif;
1527         newinet->inet_saddr   = ireq->ir_loc_addr;
1528         inet_opt              = rcu_dereference(ireq->ireq_opt);
1529         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1530         newinet->mc_index     = inet_iif(skb);
1531         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1532         newinet->rcv_tos      = ip_hdr(skb)->tos;
1533         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1534         if (inet_opt)
1535                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1536         newinet->inet_id = prandom_u32();
1537
1538         /* Set ToS of the new socket based upon the value of incoming SYN. */
1539         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1540                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1541
1542         if (!dst) {
1543                 dst = inet_csk_route_child_sock(sk, newsk, req);
1544                 if (!dst)
1545                         goto put_and_exit;
1546         } else {
1547                 /* syncookie case : see end of cookie_v4_check() */
1548         }
1549         sk_setup_caps(newsk, dst);
1550
1551         tcp_ca_openreq_child(newsk, dst);
1552
1553         tcp_sync_mss(newsk, dst_mtu(dst));
1554         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1555
1556         tcp_initialize_rcv_mss(newsk);
1557
1558 #ifdef CONFIG_TCP_MD5SIG
1559         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1560         /* Copy over the MD5 key from the original socket */
1561         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1562         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1563         if (key) {
1564                 /*
1565                  * We're using one, so create a matching key
1566                  * on the newsk structure. If we fail to get
1567                  * memory, then we end up not copying the key
1568                  * across. Shucks.
1569                  */
1570                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1571                                key->key, key->keylen, GFP_ATOMIC);
1572                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1573         }
1574 #endif
1575
1576         if (__inet_inherit_port(sk, newsk) < 0)
1577                 goto put_and_exit;
1578         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1579         if (likely(*own_req)) {
1580                 tcp_move_syn(newtp, req);
1581                 ireq->ireq_opt = NULL;
1582         } else {
1583                 newinet->inet_opt = NULL;
1584         }
1585         return newsk;
1586
1587 exit_overflow:
1588         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589 exit_nonewsk:
1590         dst_release(dst);
1591 exit:
1592         tcp_listendrop(sk);
1593         return NULL;
1594 put_and_exit:
1595         newinet->inet_opt = NULL;
1596         inet_csk_prepare_forced_close(newsk);
1597         tcp_done(newsk);
1598         goto exit;
1599 }
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
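
The child socket built here is placed on the listener's accept queue (hence the sk_acceptq_is_full() check at the top) and is what accept() ultimately hands to userspace. A minimal listener sketch, illustrative only (port 8080 is an arbitrary choice):

#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
        struct sockaddr_in addr = {
                .sin_family = AF_INET,
                .sin_port   = htons(8080),
                .sin_addr   = { .s_addr = htonl(INADDR_ANY) },
        };
        int lfd = socket(AF_INET, SOCK_STREAM, 0);
        int cfd;

        if (lfd < 0 || bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
            listen(lfd, 128) < 0) {
                perror("listener setup");
                return 1;
        }

        /* Blocks until a handshake completes; the fd returned is the child
         * socket the kernel created on the listener's accept queue. */
        cfd = accept(lfd, NULL, NULL);
        if (cfd >= 0) {
                printf("accepted connection on fd %d\n", cfd);
                close(cfd);
        }
        close(lfd);
        return 0;
}
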
1601
1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1603 {
1604 #ifdef CONFIG_SYN_COOKIES
1605         const struct tcphdr *th = tcp_hdr(skb);
1606
1607         if (!th->syn)
1608                 sk = cookie_v4_check(sk, skb);
1609 #endif
1610         return sk;
1611 }
1612
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614                          struct tcphdr *th, u32 *cookie)
1615 {
1616         u16 mss = 0;
1617 #ifdef CONFIG_SYN_COOKIES
1618         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619                                     &tcp_request_sock_ipv4_ops, sk, th);
1620         if (mss) {
1621                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622                 tcp_synq_overflow(sk);
1623         }
1624 #endif
1625         return mss;
1626 }
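
Whether cookies are generated at all is governed by the per-netns sysctl net.ipv4.tcp_syncookies (0 disables them, 1 sends them only when the SYN queue overflows, 2 sends them unconditionally); tcp_sk_init() below defaults it to 1. A small sketch toggling it from userspace, illustrative only:

#include <stdio.h>

/* Enable syncookies for the current network namespace. */
static int enable_syncookies(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");

        if (!f)
                return -1;
        fputs("1\n", f);
        return fclose(f);
}
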
1627
1628 /* The socket must have its spinlock held when we get
1629  * here, unless it is a TCP_LISTEN socket.
1630  *
1631  * We have a potential double-lock case here, so even when
1632  * doing backlog processing we use the BH locking scheme.
1633  * This is because we cannot sleep with the original spinlock
1634  * held.
1635  */
1636 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1637 {
1638         struct sock *rsk;
1639
1640         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1641                 struct dst_entry *dst = sk->sk_rx_dst;
1642
1643                 sock_rps_save_rxhash(sk, skb);
1644                 sk_mark_napi_id(sk, skb);
1645                 if (dst) {
1646                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1647                             !dst->ops->check(dst, 0)) {
1648                                 dst_release(dst);
1649                                 sk->sk_rx_dst = NULL;
1650                         }
1651                 }
1652                 tcp_rcv_established(sk, skb);
1653                 return 0;
1654         }
1655
1656         if (tcp_checksum_complete(skb))
1657                 goto csum_err;
1658
1659         if (sk->sk_state == TCP_LISTEN) {
1660                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1661
1662                 if (!nsk)
1663                         goto discard;
1664                 if (nsk != sk) {
1665                         if (tcp_child_process(sk, nsk, skb)) {
1666                                 rsk = nsk;
1667                                 goto reset;
1668                         }
1669                         return 0;
1670                 }
1671         } else
1672                 sock_rps_save_rxhash(sk, skb);
1673
1674         if (tcp_rcv_state_process(sk, skb)) {
1675                 rsk = sk;
1676                 goto reset;
1677         }
1678         return 0;
1679
1680 reset:
1681         tcp_v4_send_reset(rsk, skb);
1682 discard:
1683         kfree_skb(skb);
1684         /* Be careful here. If this function gets more complicated and
1685          * gcc suffers from register pressure on the x86, sk (in %ebx)
1686          * might be destroyed here. This current version compiles correctly,
1687          * but you have been warned.
1688          */
1689         return 0;
1690
1691 csum_err:
1692         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1693         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1694         goto discard;
1695 }
1696 EXPORT_SYMBOL(tcp_v4_do_rcv);
1697
1698 int tcp_v4_early_demux(struct sk_buff *skb)
1699 {
1700         const struct iphdr *iph;
1701         const struct tcphdr *th;
1702         struct sock *sk;
1703
1704         if (skb->pkt_type != PACKET_HOST)
1705                 return 0;
1706
1707         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1708                 return 0;
1709
1710         iph = ip_hdr(skb);
1711         th = tcp_hdr(skb);
1712
1713         if (th->doff < sizeof(struct tcphdr) / 4)
1714                 return 0;
1715
1716         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1717                                        iph->saddr, th->source,
1718                                        iph->daddr, ntohs(th->dest),
1719                                        skb->skb_iif, inet_sdif(skb));
1720         if (sk) {
1721                 skb->sk = sk;
1722                 skb->destructor = sock_edemux;
1723                 if (sk_fullsock(sk)) {
1724                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1725
1726                         if (dst)
1727                                 dst = dst_check(dst, 0);
1728                         if (dst &&
1729                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1730                                 skb_dst_set_noref(skb, dst);
1731                 }
1732         }
1733         return 0;
1734 }
1735
1736 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1737 {
1738         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1739         struct skb_shared_info *shinfo;
1740         const struct tcphdr *th;
1741         struct tcphdr *thtail;
1742         struct sk_buff *tail;
1743         unsigned int hdrlen;
1744         bool fragstolen;
1745         u32 gso_segs;
1746         int delta;
1747
1748         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1749          * we can fix skb->truesize to its real value to avoid future drops.
1750          * This is valid because skb is not yet charged to the socket.
1751          * It has been noticed pure SACK packets were sometimes dropped
1752          * (if cooked by drivers without copybreak feature).
1753          */
1754         skb_condense(skb);
1755
1756         skb_dst_drop(skb);
1757
1758         if (unlikely(tcp_checksum_complete(skb))) {
1759                 bh_unlock_sock(sk);
1760                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1761                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1762                 return true;
1763         }
1764
1765         /* Attempt coalescing to last skb in backlog, even if we are
1766          * above the limits.
1767          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1768          */
1769         th = (const struct tcphdr *)skb->data;
1770         hdrlen = th->doff * 4;
1771         shinfo = skb_shinfo(skb);
1772
1773         if (!shinfo->gso_size)
1774                 shinfo->gso_size = skb->len - hdrlen;
1775
1776         if (!shinfo->gso_segs)
1777                 shinfo->gso_segs = 1;
1778
1779         tail = sk->sk_backlog.tail;
1780         if (!tail)
1781                 goto no_coalesce;
1782         thtail = (struct tcphdr *)tail->data;
1783
1784         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1785             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1786             ((TCP_SKB_CB(tail)->tcp_flags |
1787               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1788             !((TCP_SKB_CB(tail)->tcp_flags &
1789               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1790             ((TCP_SKB_CB(tail)->tcp_flags ^
1791               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1792 #ifdef CONFIG_TLS_DEVICE
1793             tail->decrypted != skb->decrypted ||
1794 #endif
1795             thtail->doff != th->doff ||
1796             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1797                 goto no_coalesce;
1798
1799         __skb_pull(skb, hdrlen);
1800         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1801                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1802
1803                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1804                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1805                         thtail->window = th->window;
1806                 }
1807
1808                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1809                  * thtail->fin, so that the fast path in tcp_rcv_established()
1810                  * is not entered if we append a packet with a FIN.
1811                  * SYN, RST, URG are not present.
1812                  * ACK is set on both packets.
1813                  * PSH : we do not really care in TCP stack,
1814                  *       at least for 'GRO' packets.
1815                  */
1816                 thtail->fin |= th->fin;
1817                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1818
1819                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1820                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1821                         tail->tstamp = skb->tstamp;
1822                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1823                 }
1824
1825                 /* Not as strict as GRO. We only need to carry mss max value */
1826                 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1827                                                  skb_shinfo(tail)->gso_size);
1828
1829                 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1830                 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1831
1832                 sk->sk_backlog.len += delta;
1833                 __NET_INC_STATS(sock_net(sk),
1834                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1835                 kfree_skb_partial(skb, fragstolen);
1836                 return false;
1837         }
1838         __skb_push(skb, hdrlen);
1839
1840 no_coalesce:
1841         /* Only the socket owner can try to collapse/prune the rx queues
1842          * to reduce memory overhead, so add a little headroom here.
1843          * Only a few socket backlogs are likely to be non-empty at the same time.
1844          */
1845         limit += 64*1024;
1846
1847         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1848                 bh_unlock_sock(sk);
1849                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1850                 return true;
1851         }
1852         return false;
1853 }
1854 EXPORT_SYMBOL(tcp_add_backlog);
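
The limit computed above starts from sk_rcvbuf + sk_sndbuf, both of which userspace can inspect (and, within sysctl-imposed bounds, influence) through the usual socket buffer options. A small sketch, illustrative only:

#include <stdio.h>
#include <sys/socket.h>

/* Print the buffer sizes that bound this socket's backlog budget.
 * getsockopt() returns the values the kernel actually stores in
 * sk_rcvbuf/sk_sndbuf, which is what tcp_add_backlog() sums. */
static void print_backlog_budget(int fd)
{
        int rcv = 0, snd = 0;
        socklen_t len = sizeof(int);

        getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len);
        len = sizeof(int);
        getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len);
        printf("rcvbuf=%d sndbuf=%d approx backlog limit=%d\n",
               rcv, snd, rcv + snd + 64 * 1024);
}
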
1855
1856 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1857 {
1858         struct tcphdr *th = (struct tcphdr *)skb->data;
1859
1860         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1861 }
1862 EXPORT_SYMBOL(tcp_filter);
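
sk_filter_trim_cap() runs whatever socket filter userspace attached, with the TCP header length as the floor below which the segment will not be trimmed. A hedged sketch attaching a trivial accept-everything classic BPF filter via SO_ATTACH_FILTER, illustrative only:

#include <linux/filter.h>
#include <sys/socket.h>

/* Attach a one-instruction classic BPF program that accepts the full
 * packet; a real filter would return how many bytes to keep. */
static int attach_accept_all_filter(int fd)
{
        static struct sock_filter code[] = {
                BPF_STMT(BPF_RET | BPF_K, 0xFFFFFFFF),  /* keep everything */
        };
        struct sock_fprog prog = {
                .len    = sizeof(code) / sizeof(code[0]),
                .filter = code,
        };

        return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
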
1863
1864 static void tcp_v4_restore_cb(struct sk_buff *skb)
1865 {
1866         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1867                 sizeof(struct inet_skb_parm));
1868 }
1869
1870 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1871                            const struct tcphdr *th)
1872 {
1873         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1874          * barrier() makes sure the compiler won't play aliasing games.
1875          */
1876         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1877                 sizeof(struct inet_skb_parm));
1878         barrier();
1879
1880         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1881         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1882                                     skb->len - th->doff * 4);
1883         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1884         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1885         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1886         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1887         TCP_SKB_CB(skb)->sacked  = 0;
1888         TCP_SKB_CB(skb)->has_rxtstamp =
1889                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1890 }
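
The end_seq computed above reflects that SYN and FIN each occupy one sequence number in addition to the payload; a tiny standalone helper restating that arithmetic, illustrative only:

#include <stdint.h>

/* Sequence-space accounting as in tcp_v4_fill_cb(): a bare SYN yields
 * end_seq == seq + 1, a 100-byte data segment yields seq + 100, etc. */
static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin, uint32_t payload_len)
{
        return seq + (syn ? 1 : 0) + (fin ? 1 : 0) + payload_len;
}
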
1891
1892 /*
1893  *      From tcp_input.c
1894  */
1895
1896 int tcp_v4_rcv(struct sk_buff *skb)
1897 {
1898         struct net *net = dev_net(skb->dev);
1899         struct sk_buff *skb_to_free;
1900         int sdif = inet_sdif(skb);
1901         int dif = inet_iif(skb);
1902         const struct iphdr *iph;
1903         const struct tcphdr *th;
1904         bool refcounted;
1905         struct sock *sk;
1906         int ret;
1907
1908         if (skb->pkt_type != PACKET_HOST)
1909                 goto discard_it;
1910
1911         /* Count it even if it's bad */
1912         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1913
1914         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1915                 goto discard_it;
1916
1917         th = (const struct tcphdr *)skb->data;
1918
1919         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1920                 goto bad_packet;
1921         if (!pskb_may_pull(skb, th->doff * 4))
1922                 goto discard_it;
1923
1924         /* An explanation is required here, I think.
1925          * Packet length and doff are validated by header prediction,
1926          * provided the case of th->doff==0 is eliminated.
1927          * So, we defer the checks. */
1928
1929         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1930                 goto csum_error;
1931
1932         th = (const struct tcphdr *)skb->data;
1933         iph = ip_hdr(skb);
1934 lookup:
1935         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1936                                th->dest, sdif, &refcounted);
1937         if (!sk)
1938                 goto no_tcp_socket;
1939
1940 process:
1941         if (sk->sk_state == TCP_TIME_WAIT)
1942                 goto do_time_wait;
1943
1944         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1945                 struct request_sock *req = inet_reqsk(sk);
1946                 bool req_stolen = false;
1947                 struct sock *nsk;
1948
1949                 sk = req->rsk_listener;
1950                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1951                         sk_drops_add(sk, skb);
1952                         reqsk_put(req);
1953                         goto discard_it;
1954                 }
1955                 if (tcp_checksum_complete(skb)) {
1956                         reqsk_put(req);
1957                         goto csum_error;
1958                 }
1959                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1960                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1961                         goto lookup;
1962                 }
1963                 /* We own a reference on the listener, increase it again
1964                  * as we might lose it too soon.
1965                  */
1966                 sock_hold(sk);
1967                 refcounted = true;
1968                 nsk = NULL;
1969                 if (!tcp_filter(sk, skb)) {
1970                         th = (const struct tcphdr *)skb->data;
1971                         iph = ip_hdr(skb);
1972                         tcp_v4_fill_cb(skb, iph, th);
1973                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1974                 }
1975                 if (!nsk) {
1976                         reqsk_put(req);
1977                         if (req_stolen) {
1978                                 /* Another cpu got exclusive access to req
1979                                  * and created a full blown socket.
1980                                  * Try to feed this packet to this socket
1981                                  * instead of discarding it.
1982                                  */
1983                                 tcp_v4_restore_cb(skb);
1984                                 sock_put(sk);
1985                                 goto lookup;
1986                         }
1987                         goto discard_and_relse;
1988                 }
1989                 if (nsk == sk) {
1990                         reqsk_put(req);
1991                         tcp_v4_restore_cb(skb);
1992                 } else if (tcp_child_process(sk, nsk, skb)) {
1993                         tcp_v4_send_reset(nsk, skb);
1994                         goto discard_and_relse;
1995                 } else {
1996                         sock_put(sk);
1997                         return 0;
1998                 }
1999         }
2000         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2001                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2002                 goto discard_and_relse;
2003         }
2004
2005         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2006                 goto discard_and_relse;
2007
2008         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2009                 goto discard_and_relse;
2010
2011         nf_reset_ct(skb);
2012
2013         if (tcp_filter(sk, skb))
2014                 goto discard_and_relse;
2015         th = (const struct tcphdr *)skb->data;
2016         iph = ip_hdr(skb);
2017         tcp_v4_fill_cb(skb, iph, th);
2018
2019         skb->dev = NULL;
2020
2021         if (sk->sk_state == TCP_LISTEN) {
2022                 ret = tcp_v4_do_rcv(sk, skb);
2023                 goto put_and_return;
2024         }
2025
2026         sk_incoming_cpu_update(sk);
2027
2028         bh_lock_sock_nested(sk);
2029         tcp_segs_in(tcp_sk(sk), skb);
2030         ret = 0;
2031         if (!sock_owned_by_user(sk)) {
2032                 skb_to_free = sk->sk_rx_skb_cache;
2033                 sk->sk_rx_skb_cache = NULL;
2034                 ret = tcp_v4_do_rcv(sk, skb);
2035         } else {
2036                 if (tcp_add_backlog(sk, skb))
2037                         goto discard_and_relse;
2038                 skb_to_free = NULL;
2039         }
2040         bh_unlock_sock(sk);
2041         if (skb_to_free)
2042                 __kfree_skb(skb_to_free);
2043
2044 put_and_return:
2045         if (refcounted)
2046                 sock_put(sk);
2047
2048         return ret;
2049
2050 no_tcp_socket:
2051         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2052                 goto discard_it;
2053
2054         tcp_v4_fill_cb(skb, iph, th);
2055
2056         if (tcp_checksum_complete(skb)) {
2057 csum_error:
2058                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2059 bad_packet:
2060                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2061         } else {
2062                 tcp_v4_send_reset(NULL, skb);
2063         }
2064
2065 discard_it:
2066         /* Discard frame. */
2067         kfree_skb(skb);
2068         return 0;
2069
2070 discard_and_relse:
2071         sk_drops_add(sk, skb);
2072         if (refcounted)
2073                 sock_put(sk);
2074         goto discard_it;
2075
2076 do_time_wait:
2077         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2078                 inet_twsk_put(inet_twsk(sk));
2079                 goto discard_it;
2080         }
2081
2082         tcp_v4_fill_cb(skb, iph, th);
2083
2084         if (tcp_checksum_complete(skb)) {
2085                 inet_twsk_put(inet_twsk(sk));
2086                 goto csum_error;
2087         }
2088         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2089         case TCP_TW_SYN: {
2090                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2091                                                         &tcp_hashinfo, skb,
2092                                                         __tcp_hdrlen(th),
2093                                                         iph->saddr, th->source,
2094                                                         iph->daddr, th->dest,
2095                                                         inet_iif(skb),
2096                                                         sdif);
2097                 if (sk2) {
2098                         inet_twsk_deschedule_put(inet_twsk(sk));
2099                         sk = sk2;
2100                         tcp_v4_restore_cb(skb);
2101                         refcounted = false;
2102                         goto process;
2103                 }
2104         }
2105                 /* to ACK */
2106                 fallthrough;
2107         case TCP_TW_ACK:
2108                 tcp_v4_timewait_ack(sk, skb);
2109                 break;
2110         case TCP_TW_RST:
2111                 tcp_v4_send_reset(sk, skb);
2112                 inet_twsk_deschedule_put(inet_twsk(sk));
2113                 goto discard_it;
2114         case TCP_TW_SUCCESS:;
2115         }
2116         goto discard_it;
2117 }
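
The min_ttl drop near the top of the receive path enforces the IP_MINTTL socket option (the RFC 5082 generalized TTL security mechanism): peers send with TTL 255 and the receiver discards anything arriving below its configured floor. A minimal sketch, illustrative only:

#include <netinet/in.h>
#include <sys/socket.h>

/* Only accept segments whose TTL is at least 'floor'; anything lower is
 * dropped in tcp_v4_rcv() and counted as LINUX_MIB_TCPMINTTLDROP. */
static int require_min_ttl(int fd, int floor)
{
        return setsockopt(fd, IPPROTO_IP, IP_MINTTL, &floor, sizeof(floor));
}
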
2118
2119 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2120         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2121         .twsk_unique    = tcp_twsk_unique,
2122         .twsk_destructor= tcp_twsk_destructor,
2123 };
2124
2125 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2126 {
2127         struct dst_entry *dst = skb_dst(skb);
2128
2129         if (dst && dst_hold_safe(dst)) {
2130                 sk->sk_rx_dst = dst;
2131                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2132         }
2133 }
2134 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2135
2136 const struct inet_connection_sock_af_ops ipv4_specific = {
2137         .queue_xmit        = ip_queue_xmit,
2138         .send_check        = tcp_v4_send_check,
2139         .rebuild_header    = inet_sk_rebuild_header,
2140         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2141         .conn_request      = tcp_v4_conn_request,
2142         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2143         .net_header_len    = sizeof(struct iphdr),
2144         .setsockopt        = ip_setsockopt,
2145         .getsockopt        = ip_getsockopt,
2146         .addr2sockaddr     = inet_csk_addr2sockaddr,
2147         .sockaddr_len      = sizeof(struct sockaddr_in),
2148         .mtu_reduced       = tcp_v4_mtu_reduced,
2149 };
2150 EXPORT_SYMBOL(ipv4_specific);
2151
2152 #ifdef CONFIG_TCP_MD5SIG
2153 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2154         .md5_lookup             = tcp_v4_md5_lookup,
2155         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2156         .md5_parse              = tcp_v4_parse_md5_keys,
2157 };
2158 #endif
2159
2160 /* NOTE: A lot of things are set to zero explicitly by the call to
2161  *       sk_alloc(), so they need not be done here.
2162  */
2163 static int tcp_v4_init_sock(struct sock *sk)
2164 {
2165         struct inet_connection_sock *icsk = inet_csk(sk);
2166
2167         tcp_init_sock(sk);
2168
2169         icsk->icsk_af_ops = &ipv4_specific;
2170
2171 #ifdef CONFIG_TCP_MD5SIG
2172         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2173 #endif
2174
2175         return 0;
2176 }
2177
2178 void tcp_v4_destroy_sock(struct sock *sk)
2179 {
2180         struct tcp_sock *tp = tcp_sk(sk);
2181
2182         trace_tcp_destroy_sock(sk);
2183
2184         tcp_clear_xmit_timers(sk);
2185
2186         tcp_cleanup_congestion_control(sk);
2187
2188         tcp_cleanup_ulp(sk);
2189
2190         /* Clean up the write buffer. */
2191         tcp_write_queue_purge(sk);
2192
2193         /* Check if we want to disable active TFO */
2194         tcp_fastopen_active_disable_ofo_check(sk);
2195
2196         /* Cleans up our, hopefully empty, out_of_order_queue. */
2197         skb_rbtree_purge(&tp->out_of_order_queue);
2198
2199 #ifdef CONFIG_TCP_MD5SIG
2200         /* Clean up the MD5 key list, if any */
2201         if (tp->md5sig_info) {
2202                 tcp_clear_md5_list(sk);
2203                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2204                 tp->md5sig_info = NULL;
2205         }
2206 #endif
2207
2208         /* Clean up a referenced TCP bind bucket. */
2209         if (inet_csk(sk)->icsk_bind_hash)
2210                 inet_put_port(sk);
2211
2212         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2213
2214         /* If socket is aborted during connect operation */
2215         tcp_free_fastopen_req(tp);
2216         tcp_fastopen_destroy_cipher(sk);
2217         tcp_saved_syn_free(tp);
2218
2219         sk_sockets_allocated_dec(sk);
2220 }
2221 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2222
2223 #ifdef CONFIG_PROC_FS
2224 /* Proc filesystem TCP sock list dumping. */
2225
2226 /*
2227  * Get the next listener socket following cur.  If cur is NULL, get the first
2228  * socket starting from the bucket given in st->bucket; when st->bucket is zero
2229  * the very first socket in the hash table is returned.
2230  */
2231 static void *listening_get_next(struct seq_file *seq, void *cur)
2232 {
2233         struct tcp_seq_afinfo *afinfo;
2234         struct tcp_iter_state *st = seq->private;
2235         struct net *net = seq_file_net(seq);
2236         struct inet_listen_hashbucket *ilb;
2237         struct hlist_nulls_node *node;
2238         struct sock *sk = cur;
2239
2240         if (st->bpf_seq_afinfo)
2241                 afinfo = st->bpf_seq_afinfo;
2242         else
2243                 afinfo = PDE_DATA(file_inode(seq->file));
2244
2245         if (!sk) {
2246 get_head:
2247                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2248                 spin_lock(&ilb->lock);
2249                 sk = sk_nulls_head(&ilb->nulls_head);
2250                 st->offset = 0;
2251                 goto get_sk;
2252         }
2253         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2254         ++st->num;
2255         ++st->offset;
2256
2257         sk = sk_nulls_next(sk);
2258 get_sk:
2259         sk_nulls_for_each_from(sk, node) {
2260                 if (!net_eq(sock_net(sk), net))
2261                         continue;
2262                 if (afinfo->family == AF_UNSPEC ||
2263                     sk->sk_family == afinfo->family)
2264                         return sk;
2265         }
2266         spin_unlock(&ilb->lock);
2267         st->offset = 0;
2268         if (++st->bucket < INET_LHTABLE_SIZE)
2269                 goto get_head;
2270         return NULL;
2271 }
2272
2273 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2274 {
2275         struct tcp_iter_state *st = seq->private;
2276         void *rc;
2277
2278         st->bucket = 0;
2279         st->offset = 0;
2280         rc = listening_get_next(seq, NULL);
2281
2282         while (rc && *pos) {
2283                 rc = listening_get_next(seq, rc);
2284                 --*pos;
2285         }
2286         return rc;
2287 }
2288
2289 static inline bool empty_bucket(const struct tcp_iter_state *st)
2290 {
2291         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2292 }
2293
2294 /*
2295  * Get first established socket starting from bucket given in st->bucket.
2296  * If st->bucket is zero, the very first socket in the hash is returned.
2297  */
2298 static void *established_get_first(struct seq_file *seq)
2299 {
2300         struct tcp_seq_afinfo *afinfo;
2301         struct tcp_iter_state *st = seq->private;
2302         struct net *net = seq_file_net(seq);
2303         void *rc = NULL;
2304
2305         if (st->bpf_seq_afinfo)
2306                 afinfo = st->bpf_seq_afinfo;
2307         else
2308                 afinfo = PDE_DATA(file_inode(seq->file));
2309
2310         st->offset = 0;
2311         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2312                 struct sock *sk;
2313                 struct hlist_nulls_node *node;
2314                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2315
2316                 /* Lockless fast path for the common case of empty buckets */
2317                 if (empty_bucket(st))
2318                         continue;
2319
2320                 spin_lock_bh(lock);
2321                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2322                         if ((afinfo->family != AF_UNSPEC &&
2323                              sk->sk_family != afinfo->family) ||
2324                             !net_eq(sock_net(sk), net)) {
2325                                 continue;
2326                         }
2327                         rc = sk;
2328                         goto out;
2329                 }
2330                 spin_unlock_bh(lock);
2331         }
2332 out:
2333         return rc;
2334 }
2335
2336 static void *established_get_next(struct seq_file *seq, void *cur)
2337 {
2338         struct tcp_seq_afinfo *afinfo;
2339         struct sock *sk = cur;
2340         struct hlist_nulls_node *node;
2341         struct tcp_iter_state *st = seq->private;
2342         struct net *net = seq_file_net(seq);
2343
2344         if (st->bpf_seq_afinfo)
2345                 afinfo = st->bpf_seq_afinfo;
2346         else
2347                 afinfo = PDE_DATA(file_inode(seq->file));
2348
2349         ++st->num;
2350         ++st->offset;
2351
2352         sk = sk_nulls_next(sk);
2353
2354         sk_nulls_for_each_from(sk, node) {
2355                 if ((afinfo->family == AF_UNSPEC ||
2356                      sk->sk_family == afinfo->family) &&
2357                     net_eq(sock_net(sk), net))
2358                         return sk;
2359         }
2360
2361         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2362         ++st->bucket;
2363         return established_get_first(seq);
2364 }
2365
2366 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2367 {
2368         struct tcp_iter_state *st = seq->private;
2369         void *rc;
2370
2371         st->bucket = 0;
2372         rc = established_get_first(seq);
2373
2374         while (rc && pos) {
2375                 rc = established_get_next(seq, rc);
2376                 --pos;
2377         }
2378         return rc;
2379 }
2380
2381 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2382 {
2383         void *rc;
2384         struct tcp_iter_state *st = seq->private;
2385
2386         st->state = TCP_SEQ_STATE_LISTENING;
2387         rc        = listening_get_idx(seq, &pos);
2388
2389         if (!rc) {
2390                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2391                 rc        = established_get_idx(seq, pos);
2392         }
2393
2394         return rc;
2395 }
2396
2397 static void *tcp_seek_last_pos(struct seq_file *seq)
2398 {
2399         struct tcp_iter_state *st = seq->private;
2400         int offset = st->offset;
2401         int orig_num = st->num;
2402         void *rc = NULL;
2403
2404         switch (st->state) {
2405         case TCP_SEQ_STATE_LISTENING:
2406                 if (st->bucket >= INET_LHTABLE_SIZE)
2407                         break;
2408                 st->state = TCP_SEQ_STATE_LISTENING;
2409                 rc = listening_get_next(seq, NULL);
2410                 while (offset-- && rc)
2411                         rc = listening_get_next(seq, rc);
2412                 if (rc)
2413                         break;
2414                 st->bucket = 0;
2415                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2416                 fallthrough;
2417         case TCP_SEQ_STATE_ESTABLISHED:
2418                 if (st->bucket > tcp_hashinfo.ehash_mask)
2419                         break;
2420                 rc = established_get_first(seq);
2421                 while (offset-- && rc)
2422                         rc = established_get_next(seq, rc);
2423         }
2424
2425         st->num = orig_num;
2426
2427         return rc;
2428 }
2429
2430 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2431 {
2432         struct tcp_iter_state *st = seq->private;
2433         void *rc;
2434
2435         if (*pos && *pos == st->last_pos) {
2436                 rc = tcp_seek_last_pos(seq);
2437                 if (rc)
2438                         goto out;
2439         }
2440
2441         st->state = TCP_SEQ_STATE_LISTENING;
2442         st->num = 0;
2443         st->bucket = 0;
2444         st->offset = 0;
2445         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2446
2447 out:
2448         st->last_pos = *pos;
2449         return rc;
2450 }
2451 EXPORT_SYMBOL(tcp_seq_start);
2452
2453 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2454 {
2455         struct tcp_iter_state *st = seq->private;
2456         void *rc = NULL;
2457
2458         if (v == SEQ_START_TOKEN) {
2459                 rc = tcp_get_idx(seq, 0);
2460                 goto out;
2461         }
2462
2463         switch (st->state) {
2464         case TCP_SEQ_STATE_LISTENING:
2465                 rc = listening_get_next(seq, v);
2466                 if (!rc) {
2467                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2468                         st->bucket = 0;
2469                         st->offset = 0;
2470                         rc        = established_get_first(seq);
2471                 }
2472                 break;
2473         case TCP_SEQ_STATE_ESTABLISHED:
2474                 rc = established_get_next(seq, v);
2475                 break;
2476         }
2477 out:
2478         ++*pos;
2479         st->last_pos = *pos;
2480         return rc;
2481 }
2482 EXPORT_SYMBOL(tcp_seq_next);
2483
2484 void tcp_seq_stop(struct seq_file *seq, void *v)
2485 {
2486         struct tcp_iter_state *st = seq->private;
2487
2488         switch (st->state) {
2489         case TCP_SEQ_STATE_LISTENING:
2490                 if (v != SEQ_START_TOKEN)
2491                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2492                 break;
2493         case TCP_SEQ_STATE_ESTABLISHED:
2494                 if (v)
2495                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2496                 break;
2497         }
2498 }
2499 EXPORT_SYMBOL(tcp_seq_stop);
2500
2501 static void get_openreq4(const struct request_sock *req,
2502                          struct seq_file *f, int i)
2503 {
2504         const struct inet_request_sock *ireq = inet_rsk(req);
2505         long delta = req->rsk_timer.expires - jiffies;
2506
2507         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2508                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2509                 i,
2510                 ireq->ir_loc_addr,
2511                 ireq->ir_num,
2512                 ireq->ir_rmt_addr,
2513                 ntohs(ireq->ir_rmt_port),
2514                 TCP_SYN_RECV,
2515                 0, 0, /* could print option size, but that is af dependent. */
2516                 1,    /* timers active (only the expire timer) */
2517                 jiffies_delta_to_clock_t(delta),
2518                 req->num_timeout,
2519                 from_kuid_munged(seq_user_ns(f),
2520                                  sock_i_uid(req->rsk_listener)),
2521                 0,  /* non standard timer */
2522                 0, /* open_requests have no inode */
2523                 0,
2524                 req);
2525 }
2526
2527 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2528 {
2529         int timer_active;
2530         unsigned long timer_expires;
2531         const struct tcp_sock *tp = tcp_sk(sk);
2532         const struct inet_connection_sock *icsk = inet_csk(sk);
2533         const struct inet_sock *inet = inet_sk(sk);
2534         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2535         __be32 dest = inet->inet_daddr;
2536         __be32 src = inet->inet_rcv_saddr;
2537         __u16 destp = ntohs(inet->inet_dport);
2538         __u16 srcp = ntohs(inet->inet_sport);
2539         int rx_queue;
2540         int state;
2541
2542         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2543             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2544             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2545                 timer_active    = 1;
2546                 timer_expires   = icsk->icsk_timeout;
2547         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2548                 timer_active    = 4;
2549                 timer_expires   = icsk->icsk_timeout;
2550         } else if (timer_pending(&sk->sk_timer)) {
2551                 timer_active    = 2;
2552                 timer_expires   = sk->sk_timer.expires;
2553         } else {
2554                 timer_active    = 0;
2555                 timer_expires = jiffies;
2556         }
2557
2558         state = inet_sk_state_load(sk);
2559         if (state == TCP_LISTEN)
2560                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2561         else
2562                 /* Because we don't lock the socket,
2563                  * we might find a transient negative value.
2564                  */
2565                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2566                                       READ_ONCE(tp->copied_seq), 0);
2567
2568         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2569                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2570                 i, src, srcp, dest, destp, state,
2571                 READ_ONCE(tp->write_seq) - tp->snd_una,
2572                 rx_queue,
2573                 timer_active,
2574                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2575                 icsk->icsk_retransmits,
2576                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2577                 icsk->icsk_probes_out,
2578                 sock_i_ino(sk),
2579                 refcount_read(&sk->sk_refcnt), sk,
2580                 jiffies_to_clock_t(icsk->icsk_rto),
2581                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2582                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2583                 tp->snd_cwnd,
2584                 state == TCP_LISTEN ?
2585                     fastopenq->max_qlen :
2586                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2587 }
2588
2589 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2590                                struct seq_file *f, int i)
2591 {
2592         long delta = tw->tw_timer.expires - jiffies;
2593         __be32 dest, src;
2594         __u16 destp, srcp;
2595
2596         dest  = tw->tw_daddr;
2597         src   = tw->tw_rcv_saddr;
2598         destp = ntohs(tw->tw_dport);
2599         srcp  = ntohs(tw->tw_sport);
2600
2601         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2602                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2603                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2604                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2605                 refcount_read(&tw->tw_refcnt), tw);
2606 }
2607
2608 #define TMPSZ 150
2609
2610 static int tcp4_seq_show(struct seq_file *seq, void *v)
2611 {
2612         struct tcp_iter_state *st;
2613         struct sock *sk = v;
2614
2615         seq_setwidth(seq, TMPSZ - 1);
2616         if (v == SEQ_START_TOKEN) {
2617                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2618                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2619                            "inode");
2620                 goto out;
2621         }
2622         st = seq->private;
2623
2624         if (sk->sk_state == TCP_TIME_WAIT)
2625                 get_timewait4_sock(v, seq, st->num);
2626         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2627                 get_openreq4(v, seq, st->num);
2628         else
2629                 get_tcp4_sock(v, seq, st->num);
2630 out:
2631         seq_pad(seq, '\n');
2632         return 0;
2633 }
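
The /proc/net/tcp rows emitted above encode addresses as the raw __be32 printed with %08X and ports as host-order hex. A small userspace parser for the first few columns, illustrative only:

#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int main(void)
{
        char line[512], local[64], remote[64];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return 1;
        fgets(line, sizeof(line), f);           /* skip the header line */
        while (fgets(line, sizeof(line), f)) {
                unsigned int laddr, raddr, lport, rport, state;
                struct in_addr a;

                if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
                           &laddr, &lport, &raddr, &rport, &state) != 5)
                        continue;
                /* The file prints the __be32 as a native integer, so storing
                 * the parsed value back into s_addr restores wire order. */
                a.s_addr = laddr;
                inet_ntop(AF_INET, &a, local, sizeof(local));
                a.s_addr = raddr;
                inet_ntop(AF_INET, &a, remote, sizeof(remote));
                printf("%s:%u -> %s:%u state 0x%02X\n",
                       local, lport, remote, rport, state);
        }
        fclose(f);
        return 0;
}
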
2634
2635 #ifdef CONFIG_BPF_SYSCALL
2636 struct bpf_iter__tcp {
2637         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2638         __bpf_md_ptr(struct sock_common *, sk_common);
2639         uid_t uid __aligned(8);
2640 };
2641
2642 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2643                              struct sock_common *sk_common, uid_t uid)
2644 {
2645         struct bpf_iter__tcp ctx;
2646
2647         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2648         ctx.meta = meta;
2649         ctx.sk_common = sk_common;
2650         ctx.uid = uid;
2651         return bpf_iter_run_prog(prog, &ctx);
2652 }
2653
2654 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2655 {
2656         struct bpf_iter_meta meta;
2657         struct bpf_prog *prog;
2658         struct sock *sk = v;
2659         uid_t uid;
2660
2661         if (v == SEQ_START_TOKEN)
2662                 return 0;
2663
2664         if (sk->sk_state == TCP_TIME_WAIT) {
2665                 uid = 0;
2666         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2667                 const struct request_sock *req = v;
2668
2669                 uid = from_kuid_munged(seq_user_ns(seq),
2670                                        sock_i_uid(req->rsk_listener));
2671         } else {
2672                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2673         }
2674
2675         meta.seq = seq;
2676         prog = bpf_iter_get_info(&meta, false);
2677         return tcp_prog_seq_show(prog, &meta, v, uid);
2678 }
2679
2680 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2681 {
2682         struct bpf_iter_meta meta;
2683         struct bpf_prog *prog;
2684
2685         if (!v) {
2686                 meta.seq = seq;
2687                 prog = bpf_iter_get_info(&meta, true);
2688                 if (prog)
2689                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2690         }
2691
2692         tcp_seq_stop(seq, v);
2693 }
2694
2695 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2696         .show           = bpf_iter_tcp_seq_show,
2697         .start          = tcp_seq_start,
2698         .next           = tcp_seq_next,
2699         .stop           = bpf_iter_tcp_seq_stop,
2700 };
2701 #endif
2702
2703 static const struct seq_operations tcp4_seq_ops = {
2704         .show           = tcp4_seq_show,
2705         .start          = tcp_seq_start,
2706         .next           = tcp_seq_next,
2707         .stop           = tcp_seq_stop,
2708 };
2709
2710 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2711         .family         = AF_INET,
2712 };
2713
2714 static int __net_init tcp4_proc_init_net(struct net *net)
2715 {
2716         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2717                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2718                 return -ENOMEM;
2719         return 0;
2720 }
2721
2722 static void __net_exit tcp4_proc_exit_net(struct net *net)
2723 {
2724         remove_proc_entry("tcp", net->proc_net);
2725 }
2726
2727 static struct pernet_operations tcp4_net_ops = {
2728         .init = tcp4_proc_init_net,
2729         .exit = tcp4_proc_exit_net,
2730 };
2731
2732 int __init tcp4_proc_init(void)
2733 {
2734         return register_pernet_subsys(&tcp4_net_ops);
2735 }
2736
2737 void tcp4_proc_exit(void)
2738 {
2739         unregister_pernet_subsys(&tcp4_net_ops);
2740 }
2741 #endif /* CONFIG_PROC_FS */
2742
2743 struct proto tcp_prot = {
2744         .name                   = "TCP",
2745         .owner                  = THIS_MODULE,
2746         .close                  = tcp_close,
2747         .pre_connect            = tcp_v4_pre_connect,
2748         .connect                = tcp_v4_connect,
2749         .disconnect             = tcp_disconnect,
2750         .accept                 = inet_csk_accept,
2751         .ioctl                  = tcp_ioctl,
2752         .init                   = tcp_v4_init_sock,
2753         .destroy                = tcp_v4_destroy_sock,
2754         .shutdown               = tcp_shutdown,
2755         .setsockopt             = tcp_setsockopt,
2756         .getsockopt             = tcp_getsockopt,
2757         .keepalive              = tcp_set_keepalive,
2758         .recvmsg                = tcp_recvmsg,
2759         .sendmsg                = tcp_sendmsg,
2760         .sendpage               = tcp_sendpage,
2761         .backlog_rcv            = tcp_v4_do_rcv,
2762         .release_cb             = tcp_release_cb,
2763         .hash                   = inet_hash,
2764         .unhash                 = inet_unhash,
2765         .get_port               = inet_csk_get_port,
2766         .enter_memory_pressure  = tcp_enter_memory_pressure,
2767         .leave_memory_pressure  = tcp_leave_memory_pressure,
2768         .stream_memory_free     = tcp_stream_memory_free,
2769         .sockets_allocated      = &tcp_sockets_allocated,
2770         .orphan_count           = &tcp_orphan_count,
2771         .memory_allocated       = &tcp_memory_allocated,
2772         .memory_pressure        = &tcp_memory_pressure,
2773         .sysctl_mem             = sysctl_tcp_mem,
2774         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2775         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2776         .max_header             = MAX_TCP_HEADER,
2777         .obj_size               = sizeof(struct tcp_sock),
2778         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2779         .twsk_prot              = &tcp_timewait_sock_ops,
2780         .rsk_prot               = &tcp_request_sock_ops,
2781         .h.hashinfo             = &tcp_hashinfo,
2782         .no_autobind            = true,
2783         .diag_destroy           = tcp_abort,
2784 };
2785 EXPORT_SYMBOL(tcp_prot);
2786
2787 static void __net_exit tcp_sk_exit(struct net *net)
2788 {
2789         int cpu;
2790
2791         if (net->ipv4.tcp_congestion_control)
2792                 bpf_module_put(net->ipv4.tcp_congestion_control,
2793                                net->ipv4.tcp_congestion_control->owner);
2794
2795         for_each_possible_cpu(cpu)
2796                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2797         free_percpu(net->ipv4.tcp_sk);
2798 }
2799
2800 static int __net_init tcp_sk_init(struct net *net)
2801 {
2802         int res, cpu, cnt;
2803
2804         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2805         if (!net->ipv4.tcp_sk)
2806                 return -ENOMEM;
2807
2808         for_each_possible_cpu(cpu) {
2809                 struct sock *sk;
2810
2811                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2812                                            IPPROTO_TCP, net);
2813                 if (res)
2814                         goto fail;
2815                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2816
2817                 /* Please enforce IP_DF and IPID==0 for RST and
2818                  * ACK sent in SYN-RECV and TIME-WAIT state.
2819                  */
2820                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2821
2822                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2823         }
2824
2825         net->ipv4.sysctl_tcp_ecn = 2;
2826         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2827
2828         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2829         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2830         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2831         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2832         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2833
2834         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2835         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2836         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2837
2838         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2839         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2840         net->ipv4.sysctl_tcp_syncookies = 1;
2841         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2842         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2843         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2844         net->ipv4.sysctl_tcp_orphan_retries = 0;
2845         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2846         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2847         net->ipv4.sysctl_tcp_tw_reuse = 2;
2848         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2849
2850         cnt = tcp_hashinfo.ehash_mask + 1;
2851         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2852         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2853
2854         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2855         net->ipv4.sysctl_tcp_sack = 1;
2856         net->ipv4.sysctl_tcp_window_scaling = 1;
2857         net->ipv4.sysctl_tcp_timestamps = 1;
2858         net->ipv4.sysctl_tcp_early_retrans = 3;
2859         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2860         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2861         net->ipv4.sysctl_tcp_retrans_collapse = 1;
2862         net->ipv4.sysctl_tcp_max_reordering = 300;
2863         net->ipv4.sysctl_tcp_dsack = 1;
2864         net->ipv4.sysctl_tcp_app_win = 31;
2865         net->ipv4.sysctl_tcp_adv_win_scale = 1;
2866         net->ipv4.sysctl_tcp_frto = 2;
2867         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2868         /* This limits the percentage of the congestion window which we
2869          * will allow a single TSO frame to consume.  Building TSO frames
2870          * which are too large can cause TCP streams to be bursty.
2871          */
2872         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2873         /* Default TSQ limit of 16 TSO segments */
2874         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2875         /* rfc5961 challenge ack rate limiting */
2876         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2877         net->ipv4.sysctl_tcp_min_tso_segs = 2;
2878         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2879         net->ipv4.sysctl_tcp_autocorking = 1;
2880         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2881         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2882         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2883         if (net != &init_net) {
2884                 memcpy(net->ipv4.sysctl_tcp_rmem,
2885                        init_net.ipv4.sysctl_tcp_rmem,
2886                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
2887                 memcpy(net->ipv4.sysctl_tcp_wmem,
2888                        init_net.ipv4.sysctl_tcp_wmem,
2889                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
2890         }
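             /* SACK compression defaults: compress up to 44 back-to-back
              * SACK ACKs, delaying them by at most tcp_comp_sack_delay_ns
              * plus a small slack.
              */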
2891         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2892         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2893         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
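             /* TCP Fast Open: the client side is enabled by default; the
              * server side must be enabled per listener or via the
              * tcp_fastopen sysctl.
              */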
2894         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2895         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2896         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2897         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2898
2899         /* A child netns inherits init_net's congestion control if a
              * module reference can be taken; init_net (and any failure)
              * falls back to Reno, which is always built in.
              */
2900         if (!net_eq(net, &init_net) &&
2901             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2902                                init_net.ipv4.tcp_congestion_control->owner))
2903                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2904         else
2905                 net->ipv4.tcp_congestion_control = &tcp_reno;
2906
2907         return 0;
2908 fail:
2909         tcp_sk_exit(net);
2910
2911         return res;
2912 }
2913
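     /* Batched netns exit: purge any remaining IPv4 TIME-WAIT sockets and
      * free the TCP Fast Open context of every namespace on the exit list.
      */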
2914 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2915 {
2916         struct net *net;
2917
2918         inet_twsk_purge(&tcp_hashinfo, AF_INET);
2919
2920         list_for_each_entry(net, net_exit_list, exit_list)
2921                 tcp_fastopen_ctx_destroy(net);
2922 }
2923
2924 static struct pernet_operations __net_initdata tcp_sk_ops = {
2925        .init       = tcp_sk_init,
2926        .exit       = tcp_sk_exit,
2927        .exit_batch = tcp_sk_exit_batch,
2928 };
2929
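     /* Expose a "tcp" target to the BPF iterator infrastructure so BPF
      * programs can walk TCP sockets through the same seq_file machinery
      * that backs /proc/net/tcp.
      */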
2930 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2931 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2932                      struct sock_common *sk_common, uid_t uid)
2933
2934 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2935 {
2936         struct tcp_iter_state *st = priv_data;
2937         struct tcp_seq_afinfo *afinfo;
2938         int ret;
2939
2940         afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2941         if (!afinfo)
2942                 return -ENOMEM;
2943
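             /* AF_UNSPEC: the iterator visits IPv4 and IPv6 sockets alike. */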
2944         afinfo->family = AF_UNSPEC;
2945         st->bpf_seq_afinfo = afinfo;
2946         ret = bpf_iter_init_seq_net(priv_data, aux);
2947         if (ret)
2948                 kfree(afinfo);
2949         return ret;
2950 }
2951
2952 static void bpf_iter_fini_tcp(void *priv_data)
2953 {
2954         struct tcp_iter_state *st = priv_data;
2955
2956         kfree(st->bpf_seq_afinfo);
2957         bpf_iter_fini_seq_net(priv_data);
2958 }
2959
2960 static const struct bpf_iter_seq_info tcp_seq_info = {
2961         .seq_ops                = &bpf_iter_tcp_seq_ops,
2962         .init_seq_private       = bpf_iter_init_tcp,
2963         .fini_seq_private       = bpf_iter_fini_tcp,
2964         .seq_priv_size          = sizeof(struct tcp_iter_state),
2965 };
2966
2967 static struct bpf_iter_reg tcp_reg_info = {
2968         .target                 = "tcp",
2969         .ctx_arg_info_size      = 1,
2970         .ctx_arg_info           = {
2971                 { offsetof(struct bpf_iter__tcp, sk_common),
2972                   PTR_TO_BTF_ID_OR_NULL },
2973         },
2974         .seq_info               = &tcp_seq_info,
2975 };
2976
2977 static void __init bpf_iter_register(void)
2978 {
2979         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
2980         if (bpf_iter_reg_target(&tcp_reg_info))
2981                 pr_warn("Warning: could not register bpf iterator tcp\n");
2982 }
2983
2984 #endif
2985
2986 void __init tcp_v4_init(void)
2987 {
2988         if (register_pernet_subsys(&tcp_sk_ops))
2989                 panic("Failed to create the TCP control socket.\n");
2990
2991 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2992         bpf_iter_register();
2993 #endif
2994 }