net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/netdma.h>
  76 #include <net/secure_seq.h>
  77 #include <net/tcp_memcontrol.h>
  78
  79 #include <linux/inet.h>
  80 #include <linux/ipv6.h>
  81 #include <linux/stddef.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/seq_file.h>
  84
  85 #include <linux/crypto.h>
  86 #include <linux/scatterlist.h>
  87
  88 int sysctl_tcp_tw_reuse __read_mostly;
  89 int sysctl_tcp_low_latency __read_mostly;
  90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92
  93 #ifdef CONFIG_TCP_MD5SIG
  94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  95                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  96 #endif
  97
  98 struct inet_hashinfo tcp_hashinfo;
  99 EXPORT_SYMBOL(tcp_hashinfo);
 100
 101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 102 {
 103         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 104                                           ip_hdr(skb)->saddr,
 105                                           tcp_hdr(skb)->dest,
 106                                           tcp_hdr(skb)->source);
 107 }
 108
 109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110 {
 111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 112         struct tcp_sock *tp = tcp_sk(sk);
 113
 114         /* With PAWS, it is safe from the viewpoint
 115            of data integrity. Even without PAWS it is safe provided sequence
 116            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 117
 118            Actually, the idea is close to VJ's one, only timestamp cache is
 119            held not per host, but per port pair and TW bucket is used as state
 120            holder.
 121
 122            If TW bucket has been already destroyed we fall back to VJ's scheme
 123            and use initial timestamp retrieved from peer table.
 124          */
 125         if (tcptw->tw_ts_recent_stamp &&
 126             (twp == NULL || (sysctl_tcp_tw_reuse &&
 127                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 128                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 129                 if (tp->write_seq == 0)
 130                         tp->write_seq = 1;
 131                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 132                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 133                 sock_hold(sktw);
 134                 return 1;
 135         }
 136
 137         return 0;
 138 }
 139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 140
 141 static int tcp_repair_connect(struct sock *sk)
 142 {
 143         tcp_connect_init(sk);
 144         tcp_finish_connect(sk, NULL);
 145
 146         return 0;
 147 }
 148
 149 /* This will initiate an outgoing connection. */
 150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 151 {
 152         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 153         struct inet_sock *inet = inet_sk(sk);
 154         struct tcp_sock *tp = tcp_sk(sk);
 155         __be16 orig_sport, orig_dport;
 156         __be32 daddr, nexthop;
 157         struct flowi4 *fl4;
 158         struct rtable *rt;
 159         int err;
 160         struct ip_options_rcu *inet_opt;
 161
 162         if (addr_len < sizeof(struct sockaddr_in))
 163                 return -EINVAL;
 164
 165         if (usin->sin_family != AF_INET)
 166                 return -EAFNOSUPPORT;
 167
 168         nexthop = daddr = usin->sin_addr.s_addr;
 169         inet_opt = rcu_dereference_protected(inet->inet_opt,
 170                                              sock_owned_by_user(sk));
 171         if (inet_opt && inet_opt->opt.srr) {
 172                 if (!daddr)
 173                         return -EINVAL;
 174                 nexthop = inet_opt->opt.faddr;
 175         }
 176
 177         orig_sport = inet->inet_sport;
 178         orig_dport = usin->sin_port;
 179         fl4 = &inet->cork.fl.u.ip4;
 180         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 181                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 182                               IPPROTO_TCP,
 183                               orig_sport, orig_dport, sk, true);
 184         if (IS_ERR(rt)) {
 185                 err = PTR_ERR(rt);
 186                 if (err == -ENETUNREACH)
 187                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 188                 return err;
 189         }
 190
 191         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 192                 ip_rt_put(rt);
 193                 return -ENETUNREACH;
 194         }
 195
 196         if (!inet_opt || !inet_opt->opt.srr)
 197                 daddr = fl4->daddr;
 198
 199         if (!inet->inet_saddr)
 200                 inet->inet_saddr = fl4->saddr;
 201         inet->inet_rcv_saddr = inet->inet_saddr;
 202
 203         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 204                 /* Reset inherited state */
 205                 tp->rx_opt.ts_recent       = 0;
 206                 tp->rx_opt.ts_recent_stamp = 0;
 207                 if (likely(!tp->repair))
 208                         tp->write_seq      = 0;
 209         }
 210
 211         if (tcp_death_row.sysctl_tw_recycle &&
 212             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 213                 tcp_fetch_timewait_stamp(sk, &rt->dst);
 214
 215         inet->inet_dport = usin->sin_port;
 216         inet->inet_daddr = daddr;
 217
 218         inet_csk(sk)->icsk_ext_hdr_len = 0;
 219         if (inet_opt)
 220                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 221
 222         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 223
 224         /* Socket identity is still unknown (sport may be zero).
 225          * However we set state to SYN-SENT and not releasing socket
 226          * lock select source port, enter ourselves into the hash tables and
 227          * complete initialization after this.
 228          */
 229         tcp_set_state(sk, TCP_SYN_SENT);
 230         err = inet_hash_connect(&tcp_death_row, sk);
 231         if (err)
 232                 goto failure;
 233
 234         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 235                                inet->inet_sport, inet->inet_dport, sk);
 236         if (IS_ERR(rt)) {
 237                 err = PTR_ERR(rt);
 238                 rt = NULL;
 239                 goto failure;
 240         }
 241         /* OK, now commit destination to socket.  */
 242         sk->sk_gso_type = SKB_GSO_TCPV4;
 243         sk_setup_caps(sk, &rt->dst);
 244
 245         if (!tp->write_seq && likely(!tp->repair))
 246                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 247                                                            inet->inet_daddr,
 248                                                            inet->inet_sport,
 249                                                            usin->sin_port);
 250
 251         inet->inet_id = tp->write_seq ^ jiffies;
 252
 253         if (likely(!tp->repair))
 254                 err = tcp_connect(sk);
 255         else
 256                 err = tcp_repair_connect(sk);
 257
 258         rt = NULL;
 259         if (err)
 260                 goto failure;
 261
 262         return 0;
 263
 264 failure:
 265         /*
 266          * This unhashes the socket and releases the local port,
 267          * if necessary.
 268          */
 269         tcp_set_state(sk, TCP_CLOSE);
 270         ip_rt_put(rt);
 271         sk->sk_route_caps = 0;
 272         inet->inet_dport = 0;
 273         return err;
 274 }
 275 EXPORT_SYMBOL(tcp_v4_connect);
 276
 277 /*
 278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 279  * It can be called through tcp_release_cb() if socket was owned by user
 280  * at the time tcp_v4_err() was called to handle ICMP message.
 281  */
 282 static void tcp_v4_mtu_reduced(struct sock *sk)
 283 {
 284         struct dst_entry *dst;
 285         struct inet_sock *inet = inet_sk(sk);
 286         u32 mtu = tcp_sk(sk)->mtu_info;
 287
 288         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 289          * send out by Linux are always <576bytes so they should go through
 290          * unfragmented).
 291          */
 292         if (sk->sk_state == TCP_LISTEN)
 293                 return;
 294
 295         dst = inet_csk_update_pmtu(sk, mtu);
 296         if (!dst)
 297                 return;
 298
 299         /* Something is about to be wrong... Remember soft error
 300          * for the case, if this connection will not able to recover.
 301          */
 302         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 303                 sk->sk_err_soft = EMSGSIZE;
 304
 305         mtu = dst_mtu(dst);
 306
 307         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 308             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 309                 tcp_sync_mss(sk, mtu);
 310
 311                 /* Resend the TCP packet because it's
 312                  * clear that the old packet has been
 313                  * dropped. This is the new "fast" path mtu
 314                  * discovery.
 315                  */
 316                 tcp_simple_retransmit(sk);
 317         } /* else let the usual retransmit timer handle it */
 318 }
 319
 320 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 321 {
 322         struct dst_entry *dst = __sk_dst_check(sk, 0);
 323
 324         if (dst)
 325                 dst->ops->redirect(dst, sk, skb);
 326 }
 327
 328 /*
 329  * This routine is called by the ICMP module when it gets some
 330  * sort of error condition.  If err < 0 then the socket should
 331  * be closed and the error returned to the user.  If err > 0
 332  * it's just the icmp type << 8 | icmp code.  After adjustment
 333  * header points to the first 8 bytes of the tcp header.  We need
 334  * to find the appropriate port.
 335  *
 336  * The locking strategy used here is very "optimistic". When
 337  * someone else accesses the socket the ICMP is just dropped
 338  * and for some paths there is no check at all.
 339  * A more general error queue to queue errors for later handling
 340  * is probably better.
 341  *
 342  */
 343
 344 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 345 {
 346         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 347         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 348         struct inet_connection_sock *icsk;
 349         struct tcp_sock *tp;
 350         struct inet_sock *inet;
 351         const int type = icmp_hdr(icmp_skb)->type;
 352         const int code = icmp_hdr(icmp_skb)->code;
 353         struct sock *sk;
 354         struct sk_buff *skb;
 355         struct request_sock *req;
 356         __u32 seq;
 357         __u32 remaining;
 358         int err;
 359         struct net *net = dev_net(icmp_skb->dev);
 360
 361         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 362                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 363                 return;
 364         }
 365
 366         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 367                         iph->saddr, th->source, inet_iif(icmp_skb));
 368         if (!sk) {
 369                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 370                 return;
 371         }
 372         if (sk->sk_state == TCP_TIME_WAIT) {
 373                 inet_twsk_put(inet_twsk(sk));
 374                 return;
 375         }
 376
 377         bh_lock_sock(sk);
 378         /* If too many ICMPs get dropped on busy
 379          * servers this needs to be solved differently.
 380          * We do take care of PMTU discovery (RFC1191) special case :
 381          * we can receive locally generated ICMP messages while socket is held.
 382          */
 383         if (sock_owned_by_user(sk) &&
 384             type != ICMP_DEST_UNREACH &&
 385             code != ICMP_FRAG_NEEDED)
 386                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 387
 388         if (sk->sk_state == TCP_CLOSE)
 389                 goto out;
 390
 391         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 392                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 393                 goto out;
 394         }
 395
 396         icsk = inet_csk(sk);
 397         tp = tcp_sk(sk);
 398         req = tp->fastopen_rsk;
 399         seq = ntohl(th->seq);
 400         if (sk->sk_state != TCP_LISTEN &&
 401             !between(seq, tp->snd_una, tp->snd_nxt) &&
 402             (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
 403                 /* For a Fast Open socket, allow seq to be snt_isn. */
 404                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 405                 goto out;
 406         }
 407
 408         switch (type) {
 409         case ICMP_REDIRECT:
 410                 do_redirect(icmp_skb, sk);
 411                 goto out;
 412         case ICMP_SOURCE_QUENCH:
 413                 /* Just silently ignore these. */
 414                 goto out;
 415         case ICMP_PARAMETERPROB:
 416                 err = EPROTO;
 417                 break;
 418         case ICMP_DEST_UNREACH:
 419                 if (code > NR_ICMP_UNREACH)
 420                         goto out;
 421
 422                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 423                         tp->mtu_info = info;
 424                         if (!sock_owned_by_user(sk)) {
 425                                 tcp_v4_mtu_reduced(sk);
 426                         } else {
 427                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 428                                         sock_hold(sk);
 429                         }
 430                         goto out;
 431                 }
 432
 433                 err = icmp_err_convert[code].errno;
 434                 /* check if icmp_skb allows revert of backoff
 435                  * (see draft-zimmermann-tcp-lcd) */
 436                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 437                         break;
 438                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 439                     !icsk->icsk_backoff)
 440                         break;
 441
 442                 /* XXX (TFO) - revisit the following logic for TFO */
 443
 444                 if (sock_owned_by_user(sk))
 445                         break;
 446
 447                 icsk->icsk_backoff--;
 448                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
 449                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 450                 tcp_bound_rto(sk);
 451
 452                 skb = tcp_write_queue_head(sk);
 453                 BUG_ON(!skb);
 454
 455                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 456                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 457
 458                 if (remaining) {
 459                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 460                                                   remaining, TCP_RTO_MAX);
 461                 } else {
 462                         /* RTO revert clocked out retransmission.
 463                          * Will retransmit now */
 464                         tcp_retransmit_timer(sk);
 465                 }
 466
 467                 break;
 468         case ICMP_TIME_EXCEEDED:
 469                 err = EHOSTUNREACH;
 470                 break;
 471         default:
 472                 goto out;
 473         }
 474
 475         /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
 476          * than following the TCP_SYN_RECV case and closing the socket,
 477          * we ignore the ICMP error and keep trying like a fully established
 478          * socket. Is this the right thing to do?
 479          */
 480         if (req && req->sk == NULL)
 481                 goto out;
 482
 483         switch (sk->sk_state) {
 484                 struct request_sock *req, **prev;
 485         case TCP_LISTEN:
 486                 if (sock_owned_by_user(sk))
 487                         goto out;
 488
 489                 req = inet_csk_search_req(sk, &prev, th->dest,
 490                                           iph->daddr, iph->saddr);
 491                 if (!req)
 492                         goto out;
 493
 494                 /* ICMPs are not backlogged, hence we cannot get
 495                    an established socket here.
 496                  */
 497                 WARN_ON(req->sk);
 498
 499                 if (seq != tcp_rsk(req)->snt_isn) {
 500                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 501                         goto out;
 502                 }
 503
 504                 /*
 505                  * Still in SYN_RECV, just remove it silently.
 506                  * There is no good way to pass the error to the newly
 507                  * created socket, and POSIX does not want network
 508                  * errors returned from accept().
 509                  */
 510                 inet_csk_reqsk_queue_drop(sk, req, prev);
 511                 goto out;
 512
 513         case TCP_SYN_SENT:
 514         case TCP_SYN_RECV:  /* Cannot happen.
 515                                It can f.e. if SYNs crossed,
 516                                or Fast Open.
 517                              */
 518                 if (!sock_owned_by_user(sk)) {
 519                         sk->sk_err = err;
 520
 521                         sk->sk_error_report(sk);
 522
 523                         tcp_done(sk);
 524                 } else {
 525                         sk->sk_err_soft = err;
 526                 }
 527                 goto out;
 528         }
 529
 530         /* If we've already connected we will keep trying
 531          * until we time out, or the user gives up.
 532          *
 533          * rfc1122 4.2.3.9 allows to consider as hard errors
 534          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 535          * but it is obsoleted by pmtu discovery).
 536          *
 537          * Note, that in modern internet, where routing is unreliable
 538          * and in each dark corner broken firewalls sit, sending random
 539          * errors ordered by their masters even this two messages finally lose
 540          * their original sense (even Linux sends invalid PORT_UNREACHs)
 541          *
 542          * Now we are in compliance with RFCs.
 543          *                                                      --ANK (980905)
 544          */
 545
 546         inet = inet_sk(sk);
 547         if (!sock_owned_by_user(sk) && inet->recverr) {
 548                 sk->sk_err = err;
 549                 sk->sk_error_report(sk);
 550         } else  { /* Only an error on timeout */
 551                 sk->sk_err_soft = err;
 552         }
 553
 554 out:
 555         bh_unlock_sock(sk);
 556         sock_put(sk);
 557 }
 558
 559 static void __tcp_v4_send_check(struct sk_buff *skb,
 560                                 __be32 saddr, __be32 daddr)
 561 {
 562         struct tcphdr *th = tcp_hdr(skb);
 563
 564         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 565                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 566                 skb->csum_start = skb_transport_header(skb) - skb->head;
 567                 skb->csum_offset = offsetof(struct tcphdr, check);
 568         } else {
 569                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 570                                          csum_partial(th,
 571                                                       th->doff << 2,
 572                                                       skb->csum));
 573         }
 574 }
 575
 576 /* This routine computes an IPv4 TCP checksum. */
 577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 578 {
 579         const struct inet_sock *inet = inet_sk(sk);
 580
 581         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 582 }
 583 EXPORT_SYMBOL(tcp_v4_send_check);
 584
 585 int tcp_v4_gso_send_check(struct sk_buff *skb)
 586 {
 587         const struct iphdr *iph;
 588         struct tcphdr *th;
 589
 590         if (!pskb_may_pull(skb, sizeof(*th)))
 591                 return -EINVAL;
 592
 593         iph = ip_hdr(skb);
 594         th = tcp_hdr(skb);
 595
 596         th->check = 0;
 597         skb->ip_summed = CHECKSUM_PARTIAL;
 598         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 599         return 0;
 600 }
 601
 602 /*
 603  *      This routine will send an RST to the other tcp.
 604  *
 605  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 606  *                    for reset.
 607  *      Answer: if a packet caused RST, it is not for a socket
 608  *              existing in our system, if it is matched to a socket,
 609  *              it is just duplicate segment or bug in other side's TCP.
 610  *              So that we build reply only basing on parameters
 611  *              arrived with segment.
 612  *      Exception: precedence violation. We do not implement it in any case.
 613  */
 614
 615 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 616 {
 617         const struct tcphdr *th = tcp_hdr(skb);
 618         struct {
 619                 struct tcphdr th;
 620 #ifdef CONFIG_TCP_MD5SIG
 621                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 622 #endif
 623         } rep;
 624         struct ip_reply_arg arg;
 625 #ifdef CONFIG_TCP_MD5SIG
 626         struct tcp_md5sig_key *key;
 627         const __u8 *hash_location = NULL;
 628         unsigned char newhash[16];
 629         int genhash;
 630         struct sock *sk1 = NULL;
 631 #endif
 632         struct net *net;
 633
 634         /* Never send a reset in response to a reset. */
 635         if (th->rst)
 636                 return;
 637
 638         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 639                 return;
 640
 641         /* Swap the send and the receive. */
 642         memset(&rep, 0, sizeof(rep));
 643         rep.th.dest   = th->source;
 644         rep.th.source = th->dest;
 645         rep.th.doff   = sizeof(struct tcphdr) / 4;
 646         rep.th.rst    = 1;
 647
 648         if (th->ack) {
 649                 rep.th.seq = th->ack_seq;
 650         } else {
 651                 rep.th.ack = 1;
 652                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 653                                        skb->len - (th->doff << 2));
 654         }
 655
 656         memset(&arg, 0, sizeof(arg));
 657         arg.iov[0].iov_base = (unsigned char *)&rep;
 658         arg.iov[0].iov_len  = sizeof(rep.th);
 659
 660 #ifdef CONFIG_TCP_MD5SIG
 661         hash_location = tcp_parse_md5sig_option(th);
 662         if (!sk && hash_location) {
 663                 /*
 664                  * active side is lost. Try to find listening socket through
 665                  * source port, and then find md5 key through listening socket.
 666                  * we are not loose security here:
 667                  * Incoming packet is checked with md5 hash with finding key,
 668                  * no RST generated if md5 hash doesn't match.
 669                  */
 670                 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 671                                              &tcp_hashinfo, ip_hdr(skb)->daddr,
 672                                              ntohs(th->source), inet_iif(skb));
 673                 /* don't send rst if it can't find key */
 674                 if (!sk1)
 675                         return;
 676                 rcu_read_lock();
 677                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 678                                         &ip_hdr(skb)->saddr, AF_INET);
 679                 if (!key)
 680                         goto release_sk1;
 681
 682                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 683                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 684                         goto release_sk1;
 685         } else {
 686                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 687                                              &ip_hdr(skb)->saddr,
 688                                              AF_INET) : NULL;
 689         }
 690
 691         if (key) {
 692                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 693                                    (TCPOPT_NOP << 16) |
 694                                    (TCPOPT_MD5SIG << 8) |
 695                                    TCPOLEN_MD5SIG);
 696                 /* Update length and the length the header thinks exists */
 697                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 698                 rep.th.doff = arg.iov[0].iov_len / 4;
 699
 700                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 701                                      key, ip_hdr(skb)->saddr,
 702                                      ip_hdr(skb)->daddr, &rep.th);
 703         }
 704 #endif
 705         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 706                                       ip_hdr(skb)->saddr, /* XXX */
 707                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 708         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 709         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 710         /* When socket is gone, all binding information is lost.
 711          * routing might fail in this case. using iif for oif to
 712          * make sure we can deliver it
 713          */
 714         arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
 715
 716         net = dev_net(skb_dst(skb)->dev);
 717         arg.tos = ip_hdr(skb)->tos;
 718         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 719                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 720
 721         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 722         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 723
 724 #ifdef CONFIG_TCP_MD5SIG
 725 release_sk1:
 726         if (sk1) {
 727                 rcu_read_unlock();
 728                 sock_put(sk1);
 729         }
 730 #endif
 731 }
 732
 733 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 734    outside socket context is ugly, certainly. What can I do?
 735  */
 736
 737 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 738                             u32 win, u32 ts, int oif,
 739                             struct tcp_md5sig_key *key,
 740                             int reply_flags, u8 tos)
 741 {
 742         const struct tcphdr *th = tcp_hdr(skb);
 743         struct {
 744                 struct tcphdr th;
 745                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 746 #ifdef CONFIG_TCP_MD5SIG
 747                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 748 #endif
 749                         ];
 750         } rep;
 751         struct ip_reply_arg arg;
 752         struct net *net = dev_net(skb_dst(skb)->dev);
 753
 754         memset(&rep.th, 0, sizeof(struct tcphdr));
 755         memset(&arg, 0, sizeof(arg));
 756
 757         arg.iov[0].iov_base = (unsigned char *)&rep;
 758         arg.iov[0].iov_len  = sizeof(rep.th);
 759         if (ts) {
 760                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 761                                    (TCPOPT_TIMESTAMP << 8) |
 762                                    TCPOLEN_TIMESTAMP);
 763                 rep.opt[1] = htonl(tcp_time_stamp);
 764                 rep.opt[2] = htonl(ts);
 765                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 766         }
 767
 768         /* Swap the send and the receive. */
 769         rep.th.dest    = th->source;
 770         rep.th.source  = th->dest;
 771         rep.th.doff    = arg.iov[0].iov_len / 4;
 772         rep.th.seq     = htonl(seq);
 773         rep.th.ack_seq = htonl(ack);
 774         rep.th.ack     = 1;
 775         rep.th.window  = htons(win);
 776
 777 #ifdef CONFIG_TCP_MD5SIG
 778         if (key) {
 779                 int offset = (ts) ? 3 : 0;
 780
 781                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 782                                           (TCPOPT_NOP << 16) |
 783                                           (TCPOPT_MD5SIG << 8) |
 784                                           TCPOLEN_MD5SIG);
 785                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 786                 rep.th.doff = arg.iov[0].iov_len/4;
 787
 788                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 789                                     key, ip_hdr(skb)->saddr,
 790                                     ip_hdr(skb)->daddr, &rep.th);
 791         }
 792 #endif
 793         arg.flags = reply_flags;
 794         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 795                                       ip_hdr(skb)->saddr, /* XXX */
 796                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 797         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 798         if (oif)
 799                 arg.bound_dev_if = oif;
 800         arg.tos = tos;
 801         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 802                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 803
 804         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 805 }
 806
 807 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 808 {
 809         struct inet_timewait_sock *tw = inet_twsk(sk);
 810         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 811
 812         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 813                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 814                         tcptw->tw_ts_recent,
 815                         tw->tw_bound_dev_if,
 816                         tcp_twsk_md5_key(tcptw),
 817                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 818                         tw->tw_tos
 819                         );
 820
 821         inet_twsk_put(tw);
 822 }
 823
 824 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 825                                   struct request_sock *req)
 826 {
 827         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 828          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 829          */
 830         tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
 831                         tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 832                         tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 833                         req->ts_recent,
 834                         0,
 835                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 836                                           AF_INET),
 837                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 838                         ip_hdr(skb)->tos);
 839 }
 840
 841 /*
 842  *      Send a SYN-ACK after having received a SYN.
 843  *      This still operates on a request_sock only, not on a big
 844  *      socket.
 845  */
 846 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 847                               struct request_sock *req,
 848                               struct request_values *rvp,
 849                               u16 queue_mapping,
 850                               bool nocache)
 851 {
 852         const struct inet_request_sock *ireq = inet_rsk(req);
 853         struct flowi4 fl4;
 854         int err = -1;
 855         struct sk_buff * skb;
 856
 857         /* First, grab a route. */
 858         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 859                 return -1;
 860
 861         skb = tcp_make_synack(sk, dst, req, rvp, NULL);
 862
 863         if (skb) {
 864                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 865
 866                 skb_set_queue_mapping(skb, queue_mapping);
 867                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 868                                             ireq->rmt_addr,
 869                                             ireq->opt);
 870                 err = net_xmit_eval(err);
 871         }
 872
 873         return err;
 874 }
 875
 876 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 877                               struct request_values *rvp)
 878 {
 879         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 880         return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
 881 }
 882
 883 /*
 884  *      IPv4 request_sock destructor.
 885  */
 886 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 887 {
 888         kfree(inet_rsk(req)->opt);
 889 }
 890
 891 /*
 892  * Return true if a syncookie should be sent
 893  */
 894 bool tcp_syn_flood_action(struct sock *sk,
 895                          const struct sk_buff *skb,
 896                          const char *proto)
 897 {
 898         const char *msg = "Dropping request";
 899         bool want_cookie = false;
 900         struct listen_sock *lopt;
 901
 902
 903
 904 #ifdef CONFIG_SYN_COOKIES
 905         if (sysctl_tcp_syncookies) {
 906                 msg = "Sending cookies";
 907                 want_cookie = true;
 908                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 909         } else
 910 #endif
 911                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 912
 913         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 914         if (!lopt->synflood_warned) {
 915                 lopt->synflood_warned = 1;
 916                 pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 917                         proto, ntohs(tcp_hdr(skb)->dest), msg);
 918         }
 919         return want_cookie;
 920 }
 921 EXPORT_SYMBOL(tcp_syn_flood_action);
 922
 923 /*
 924  * Save and compile IPv4 options into the request_sock if needed.
 925  */
 926 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 927                                                   struct sk_buff *skb)
 928 {
 929         const struct ip_options *opt = &(IPCB(skb)->opt);
 930         struct ip_options_rcu *dopt = NULL;
 931
 932         if (opt && opt->optlen) {
 933                 int opt_size = sizeof(*dopt) + opt->optlen;
 934
 935                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 936                 if (dopt) {
 937                         if (ip_options_echo(&dopt->opt, skb)) {
 938                                 kfree(dopt);
 939                                 dopt = NULL;
 940                         }
 941                 }
 942         }
 943         return dopt;
 944 }
 945
 946 #ifdef CONFIG_TCP_MD5SIG
 947 /*
 948  * RFC2385 MD5 checksumming requires a mapping of
 949  * IP address->MD5 Key.
 950  * We need to maintain these in the sk structure.
 951  */
 952
 953 /* Find the Key structure for an address.  */
 954 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 955                                          const union tcp_md5_addr *addr,
 956                                          int family)
 957 {
 958         struct tcp_sock *tp = tcp_sk(sk);
 959         struct tcp_md5sig_key *key;
 960         struct hlist_node *pos;
 961         unsigned int size = sizeof(struct in_addr);
 962         struct tcp_md5sig_info *md5sig;
 963
 964         /* caller either holds rcu_read_lock() or socket lock */
 965         md5sig = rcu_dereference_check(tp->md5sig_info,
 966                                        sock_owned_by_user(sk) ||
 967                                        lockdep_is_held(&sk->sk_lock.slock));
 968         if (!md5sig)
 969                 return NULL;
 970 #if IS_ENABLED(CONFIG_IPV6)
 971         if (family == AF_INET6)
 972                 size = sizeof(struct in6_addr);
 973 #endif
 974         hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
 975                 if (key->family != family)
 976                         continue;
 977                 if (!memcmp(&key->addr, addr, size))
 978                         return key;
 979         }
 980         return NULL;
 981 }
 982 EXPORT_SYMBOL(tcp_md5_do_lookup);
 983
 984 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 985                                          struct sock *addr_sk)
 986 {
 987         union tcp_md5_addr *addr;
 988
 989         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 990         return tcp_md5_do_lookup(sk, addr, AF_INET);
 991 }
 992 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 993
 994 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 995                                                       struct request_sock *req)
 996 {
 997         union tcp_md5_addr *addr;
 998
 999         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
1000         return tcp_md5_do_lookup(sk, addr, AF_INET);
1001 }
1002
1003 /* This can be called on a newly created socket, from other files */
1004 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1005                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1006 {
1007         /* Add Key to the list */
1008         struct tcp_md5sig_key *key;
1009         struct tcp_sock *tp = tcp_sk(sk);
1010         struct tcp_md5sig_info *md5sig;
1011
1012         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1013         if (key) {
1014                 /* Pre-existing entry - just update that one. */
1015                 memcpy(key->key, newkey, newkeylen);
1016                 key->keylen = newkeylen;
1017                 return 0;
1018         }
1019
1020         md5sig = rcu_dereference_protected(tp->md5sig_info,
1021                                            sock_owned_by_user(sk));
1022         if (!md5sig) {
1023                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1024                 if (!md5sig)
1025                         return -ENOMEM;
1026
1027                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1028                 INIT_HLIST_HEAD(&md5sig->head);
1029                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1030         }
1031
1032         key = sock_kmalloc(sk, sizeof(*key), gfp);
1033         if (!key)
1034                 return -ENOMEM;
1035         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1036                 sock_kfree_s(sk, key, sizeof(*key));
1037                 return -ENOMEM;
1038         }
1039
1040         memcpy(key->key, newkey, newkeylen);
1041         key->keylen = newkeylen;
1042         key->family = family;
1043         memcpy(&key->addr, addr,
1044                (family == AF_INET6) ? sizeof(struct in6_addr) :
1045                                       sizeof(struct in_addr));
1046         hlist_add_head_rcu(&key->node, &md5sig->head);
1047         return 0;
1048 }
1049 EXPORT_SYMBOL(tcp_md5_do_add);
1050
1051 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1052 {
1053         struct tcp_sock *tp = tcp_sk(sk);
1054         struct tcp_md5sig_key *key;
1055         struct tcp_md5sig_info *md5sig;
1056
1057         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1058         if (!key)
1059                 return -ENOENT;
1060         hlist_del_rcu(&key->node);
1061         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1062         kfree_rcu(key, rcu);
1063         md5sig = rcu_dereference_protected(tp->md5sig_info,
1064                                            sock_owned_by_user(sk));
1065         if (hlist_empty(&md5sig->head))
1066                 tcp_free_md5sig_pool();
1067         return 0;
1068 }
1069 EXPORT_SYMBOL(tcp_md5_do_del);
1070
1071 void tcp_clear_md5_list(struct sock *sk)
1072 {
1073         struct tcp_sock *tp = tcp_sk(sk);
1074         struct tcp_md5sig_key *key;
1075         struct hlist_node *pos, *n;
1076         struct tcp_md5sig_info *md5sig;
1077
1078         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1079
1080         if (!hlist_empty(&md5sig->head))
1081                 tcp_free_md5sig_pool();
1082         hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1083                 hlist_del_rcu(&key->node);
1084                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1085                 kfree_rcu(key, rcu);
1086         }
1087 }
1088
1089 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1090                                  int optlen)
1091 {
1092         struct tcp_md5sig cmd;
1093         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1094
1095         if (optlen < sizeof(cmd))
1096                 return -EINVAL;
1097
1098         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1099                 return -EFAULT;
1100
1101         if (sin->sin_family != AF_INET)
1102                 return -EINVAL;
1103
1104         if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1105                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1106                                       AF_INET);
1107
1108         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1109                 return -EINVAL;
1110
1111         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1112                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1113                               GFP_KERNEL);
1114 }
1115
1116 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1117                                         __be32 daddr, __be32 saddr, int nbytes)
1118 {
1119         struct tcp4_pseudohdr *bp;
1120         struct scatterlist sg;
1121
1122         bp = &hp->md5_blk.ip4;
1123
1124         /*
1125          * 1. the TCP pseudo-header (in the order: source IP address,
1126          * destination IP address, zero-padded protocol number, and
1127          * segment length)
1128          */
1129         bp->saddr = saddr;
1130         bp->daddr = daddr;
1131         bp->pad = 0;
1132         bp->protocol = IPPROTO_TCP;
1133         bp->len = cpu_to_be16(nbytes);
1134
1135         sg_init_one(&sg, bp, sizeof(*bp));
1136         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1137 }
1138
1139 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1140                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1141 {
1142         struct tcp_md5sig_pool *hp;
1143         struct hash_desc *desc;
1144
1145         hp = tcp_get_md5sig_pool();
1146         if (!hp)
1147                 goto clear_hash_noput;
1148         desc = &hp->md5_desc;
1149
1150         if (crypto_hash_init(desc))
1151                 goto clear_hash;
1152         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1153                 goto clear_hash;
1154         if (tcp_md5_hash_header(hp, th))
1155                 goto clear_hash;
1156         if (tcp_md5_hash_key(hp, key))
1157                 goto clear_hash;
1158         if (crypto_hash_final(desc, md5_hash))
1159                 goto clear_hash;
1160
1161         tcp_put_md5sig_pool();
1162         return 0;
1163
1164 clear_hash:
1165         tcp_put_md5sig_pool();
1166 clear_hash_noput:
1167         memset(md5_hash, 0, 16);
1168         return 1;
1169 }
1170
1171 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1172                         const struct sock *sk, const struct request_sock *req,
1173                         const struct sk_buff *skb)
1174 {
1175         struct tcp_md5sig_pool *hp;
1176         struct hash_desc *desc;
1177         const struct tcphdr *th = tcp_hdr(skb);
1178         __be32 saddr, daddr;
1179
1180         if (sk) {
1181                 saddr = inet_sk(sk)->inet_saddr;
1182                 daddr = inet_sk(sk)->inet_daddr;
1183         } else if (req) {
1184                 saddr = inet_rsk(req)->loc_addr;
1185                 daddr = inet_rsk(req)->rmt_addr;
1186         } else {
1187                 const struct iphdr *iph = ip_hdr(skb);
1188                 saddr = iph->saddr;
1189                 daddr = iph->daddr;
1190         }
1191
1192         hp = tcp_get_md5sig_pool();
1193         if (!hp)
1194                 goto clear_hash_noput;
1195         desc = &hp->md5_desc;
1196
1197         if (crypto_hash_init(desc))
1198                 goto clear_hash;
1199
1200         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1201                 goto clear_hash;
1202         if (tcp_md5_hash_header(hp, th))
1203                 goto clear_hash;
1204         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1205                 goto clear_hash;
1206         if (tcp_md5_hash_key(hp, key))
1207                 goto clear_hash;
1208         if (crypto_hash_final(desc, md5_hash))
1209                 goto clear_hash;
1210
1211         tcp_put_md5sig_pool();
1212         return 0;
1213
1214 clear_hash:
1215         tcp_put_md5sig_pool();
1216 clear_hash_noput:
1217         memset(md5_hash, 0, 16);
1218         return 1;
1219 }
1220 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1221
1222 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1223 {
1224         /*
1225          * This gets called for each TCP segment that arrives
1226          * so we want to be efficient.
1227          * We have 3 drop cases:
1228          * o No MD5 hash and one expected.
1229          * o MD5 hash and we're not expecting one.
1230          * o MD5 hash and its wrong.
1231          */
1232         const __u8 *hash_location = NULL;
1233         struct tcp_md5sig_key *hash_expected;
1234         const struct iphdr *iph = ip_hdr(skb);
1235         const struct tcphdr *th = tcp_hdr(skb);
1236         int genhash;
1237         unsigned char newhash[16];
1238
1239         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1240                                           AF_INET);
1241         hash_location = tcp_parse_md5sig_option(th);
1242
1243         /* We've parsed the options - do we have a hash? */
1244         if (!hash_expected && !hash_location)
1245                 return false;
1246
1247         if (hash_expected && !hash_location) {
1248                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1249                 return true;
1250         }
1251
1252         if (!hash_expected && hash_location) {
1253                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1254                 return true;
1255         }
1256
1257         /* Okay, so this is hash_expected and hash_location -
1258          * so we need to calculate the checksum.
1259          */
1260         genhash = tcp_v4_md5_hash_skb(newhash,
1261                                       hash_expected,
1262                                       NULL, NULL, skb);
1263
1264         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1265                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1266                                      &iph->saddr, ntohs(th->source),
1267                                      &iph->daddr, ntohs(th->dest),
1268                                      genhash ? " tcp_v4_calc_md5_hash failed"
1269                                      : "");
1270                 return true;
1271         }
1272         return false;
1273 }
1274
1275 #endif
1276
1277 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1278         .family         =       PF_INET,
1279         .obj_size       =       sizeof(struct tcp_request_sock),
1280         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1281         .send_ack       =       tcp_v4_reqsk_send_ack,
1282         .destructor     =       tcp_v4_reqsk_destructor,
1283         .send_reset     =       tcp_v4_send_reset,
1284         .syn_ack_timeout =      tcp_syn_ack_timeout,
1285 };
1286
1287 #ifdef CONFIG_TCP_MD5SIG
1288 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1289         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1290         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1291 };
1292 #endif
1293
1294 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1295                                struct request_sock *req,
1296                                struct tcp_fastopen_cookie *foc,
1297                                struct tcp_fastopen_cookie *valid_foc)
1298 {
1299         bool skip_cookie = false;
1300         struct fastopen_queue *fastopenq;
1301
1302         if (likely(!fastopen_cookie_present(foc))) {
1303                 /* See include/net/tcp.h for the meaning of these knobs */
1304                 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1305                     ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1306                     (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1307                         skip_cookie = true; /* no cookie to validate */
1308                 else
1309                         return false;
1310         }
1311         fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1312         /* A FO option is present; bump the counter. */
1313         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1314
1315         /* Make sure the listener has enabled fastopen, and we don't
1316          * exceed the max # of pending TFO requests allowed before trying
1317          * to validating the cookie in order to avoid burning CPU cycles
1318          * unnecessarily.
1319          *
1320          * XXX (TFO) - The implication of checking the max_qlen before
1321          * processing a cookie request is that clients can't differentiate
1322          * between qlen overflow causing Fast Open to be disabled
1323          * temporarily vs a server not supporting Fast Open at all.
1324          */
1325         if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1326             fastopenq == NULL || fastopenq->max_qlen == 0)
1327                 return false;
1328
1329         if (fastopenq->qlen >= fastopenq->max_qlen) {
1330                 struct request_sock *req1;
1331                 spin_lock(&fastopenq->lock);
1332                 req1 = fastopenq->rskq_rst_head;
1333                 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1334                         spin_unlock(&fastopenq->lock);
1335                         NET_INC_STATS_BH(sock_net(sk),
1336                             LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1337                         /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1338                         foc->len = -1;
1339                         return false;
1340                 }
1341                 fastopenq->rskq_rst_head = req1->dl_next;
1342                 fastopenq->qlen--;
1343                 spin_unlock(&fastopenq->lock);
1344                 reqsk_free(req1);
1345         }
1346         if (skip_cookie) {
1347                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1348                 return true;
1349         }
1350         if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1351                 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1352                         tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1353                         if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1354                             memcmp(&foc->val[0], &valid_foc->val[0],
1355                             TCP_FASTOPEN_COOKIE_SIZE) != 0)
1356                                 return false;
1357                         valid_foc->len = -1;
1358                 }
1359                 /* Acknowledge the data received from the peer. */
1360                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1361                 return true;
1362         } else if (foc->len == 0) { /* Client requesting a cookie */
1363                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1364                 NET_INC_STATS_BH(sock_net(sk),
1365                     LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1366         } else {
1367                 /* Client sent a cookie with wrong size. Treat it
1368                  * the same as invalid and return a valid one.
1369                  */
1370                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1371         }
1372         return false;
1373 }
1374
1375 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1376                                     struct sk_buff *skb,
1377                                     struct sk_buff *skb_synack,
1378                                     struct request_sock *req,
1379                                     struct request_values *rvp)
1380 {
1381         struct tcp_sock *tp = tcp_sk(sk);
1382         struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1383         const struct inet_request_sock *ireq = inet_rsk(req);
1384         struct sock *child;
1385
1386         req->retrans = 0;
1387         req->sk = NULL;
1388
1389         child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1390         if (child == NULL) {
1391                 NET_INC_STATS_BH(sock_net(sk),
1392                                  LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1393                 kfree_skb(skb_synack);
1394                 return -1;
1395         }
1396         ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1397                         ireq->rmt_addr, ireq->opt);
1398         /* XXX (TFO) - is it ok to ignore error and continue? */
1399
1400         spin_lock(&queue->fastopenq->lock);
1401         queue->fastopenq->qlen++;
1402         spin_unlock(&queue->fastopenq->lock);
1403
1404         /* Initialize the child socket. Have to fix some values to take
1405          * into account the child is a Fast Open socket and is created
1406          * only out of the bits carried in the SYN packet.
1407          */
1408         tp = tcp_sk(child);
1409
1410         tp->fastopen_rsk = req;
1411         /* Do a hold on the listner sk so that if the listener is being
1412          * closed, the child that has been accepted can live on and still
1413          * access listen_lock.
1414          */
1415         sock_hold(sk);
1416         tcp_rsk(req)->listener = sk;
1417
1418         /* RFC1323: The window in SYN & SYN/ACK segments is never
1419          * scaled. So correct it appropriately.
1420          */
1421         tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1422
1423         /* Activate the retrans timer so that SYNACK can be retransmitted.
1424          * The request socket is not added to the SYN table of the parent
1425          * because it's been added to the accept queue directly.
1426          */
1427         inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1428             TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1429
1430         /* Add the child socket directly into the accept queue */
1431         inet_csk_reqsk_queue_add(sk, req, child);
1432
1433         /* Now finish processing the fastopen child socket. */
1434         inet_csk(child)->icsk_af_ops->rebuild_header(child);
1435         tcp_init_congestion_control(child);
1436         tcp_mtup_init(child);
1437         tcp_init_buffer_space(child);
1438         tcp_init_metrics(child);
1439
1440         /* Queue the data carried in the SYN packet. We need to first
1441          * bump skb's refcnt because the caller will attempt to free it.
1442          *
1443          * XXX (TFO) - we honor a zero-payload TFO request for now.
1444          * (Any reason not to?)
1445          */
1446         if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1447                 /* Don't queue the skb if there is no payload in SYN.
1448                  * XXX (TFO) - How about SYN+FIN?
1449                  */
1450                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1451         } else {
1452                 skb = skb_get(skb);
1453                 skb_dst_drop(skb);
1454                 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1455                 skb_set_owner_r(skb, child);
1456                 __skb_queue_tail(&child->sk_receive_queue, skb);
1457                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1458         }
1459         sk->sk_data_ready(sk, 0);
1460         bh_unlock_sock(child);
1461         sock_put(child);
1462         WARN_ON(req->sk == NULL);
1463         return 0;
1464 }
1465
1466 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1467 {
1468         struct tcp_extend_values tmp_ext;
1469         struct tcp_options_received tmp_opt;
1470         const u8 *hash_location;
1471         struct request_sock *req;
1472         struct inet_request_sock *ireq;
1473         struct tcp_sock *tp = tcp_sk(sk);
1474         struct dst_entry *dst = NULL;
1475         __be32 saddr = ip_hdr(skb)->saddr;
1476         __be32 daddr = ip_hdr(skb)->daddr;
1477         __u32 isn = TCP_SKB_CB(skb)->when;
1478         bool want_cookie = false;
1479         struct flowi4 fl4;
1480         struct tcp_fastopen_cookie foc = { .len = -1 };
1481         struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1482         struct sk_buff *skb_synack;
1483         int do_fastopen;
1484
1485         /* Never answer to SYNs send to broadcast or multicast */
1486         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1487                 goto drop;
1488
1489         /* TW buckets are converted to open requests without
1490          * limitations, they conserve resources and peer is
1491          * evidently real one.
1492          */
1493         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1494                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1495                 if (!want_cookie)
1496                         goto drop;
1497         }
1498
1499         /* Accept backlog is full. If we have already queued enough
1500          * of warm entries in syn queue, drop request. It is better than
1501          * clogging syn queue with openreqs with exponentially increasing
1502          * timeout.
1503          */
1504         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1505                 goto drop;
1506
1507         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1508         if (!req)
1509                 goto drop;
1510
1511 #ifdef CONFIG_TCP_MD5SIG
1512         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1513 #endif
1514
1515         tcp_clear_options(&tmp_opt);
1516         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1517         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1518         tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1519             want_cookie ? NULL : &foc);
1520
1521         if (tmp_opt.cookie_plus > 0 &&
1522             tmp_opt.saw_tstamp &&
1523             !tp->rx_opt.cookie_out_never &&
1524             (sysctl_tcp_cookie_size > 0 ||
1525              (tp->cookie_values != NULL &&
1526               tp->cookie_values->cookie_desired > 0))) {
1527                 u8 *c;
1528                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1529                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1530
1531                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1532                         goto drop_and_release;
1533
1534                 /* Secret recipe starts with IP addresses */
1535                 *mess++ ^= (__force u32)daddr;
1536                 *mess++ ^= (__force u32)saddr;
1537
1538                 /* plus variable length Initiator Cookie */
1539                 c = (u8 *)mess;
1540                 while (l-- > 0)
1541                         *c++ ^= *hash_location++;
1542
1543                 want_cookie = false;    /* not our kind of cookie */
1544                 tmp_ext.cookie_out_never = 0; /* false */
1545                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1546         } else if (!tp->rx_opt.cookie_in_always) {
1547                 /* redundant indications, but ensure initialization. */
1548                 tmp_ext.cookie_out_never = 1; /* true */
1549                 tmp_ext.cookie_plus = 0;
1550         } else {
1551                 goto drop_and_release;
1552         }
1553         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1554
1555         if (want_cookie && !tmp_opt.saw_tstamp)
1556                 tcp_clear_options(&tmp_opt);
1557
1558         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1559         tcp_openreq_init(req, &tmp_opt, skb);
1560
1561         ireq = inet_rsk(req);
1562         ireq->loc_addr = daddr;
1563         ireq->rmt_addr = saddr;
1564         ireq->no_srccheck = inet_sk(sk)->transparent;
1565         ireq->opt = tcp_v4_save_options(sk, skb);
1566
1567         if (security_inet_conn_request(sk, skb, req))
1568                 goto drop_and_free;
1569
1570         if (!want_cookie || tmp_opt.tstamp_ok)
1571                 TCP_ECN_create_request(req, skb);
1572
1573         if (want_cookie) {
1574                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1575                 req->cookie_ts = tmp_opt.tstamp_ok;
1576         } else if (!isn) {
1577                 /* VJ's idea. We save last timestamp seen
1578                  * from the destination in peer table, when entering
1579                  * state TIME-WAIT, and check against it before
1580                  * accepting new connection request.
1581                  *
1582                  * If "isn" is not zero, this request hit alive
1583                  * timewait bucket, so that all the necessary checks
1584                  * are made in the function processing timewait state.
1585                  */
1586                 if (tmp_opt.saw_tstamp &&
1587                     tcp_death_row.sysctl_tw_recycle &&
1588                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1589                     fl4.daddr == saddr) {
1590                         if (!tcp_peer_is_proven(req, dst, true)) {
1591                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1592                                 goto drop_and_release;
1593                         }
1594                 }
1595                 /* Kill the following clause, if you dislike this way. */
1596                 else if (!sysctl_tcp_syncookies &&
1597                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1598                           (sysctl_max_syn_backlog >> 2)) &&
1599                          !tcp_peer_is_proven(req, dst, false)) {
1600                         /* Without syncookies last quarter of
1601                          * backlog is filled with destinations,
1602                          * proven to be alive.
1603                          * It means that we continue to communicate
1604                          * to destinations, already remembered
1605                          * to the moment of synflood.
1606                          */
1607                         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1608                                        &saddr, ntohs(tcp_hdr(skb)->source));
1609                         goto drop_and_release;
1610                 }
1611
1612                 isn = tcp_v4_init_sequence(skb);
1613         }
1614         tcp_rsk(req)->snt_isn = isn;
1615         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1616
1617         if (dst == NULL) {
1618                 dst = inet_csk_route_req(sk, &fl4, req);
1619                 if (dst == NULL)
1620                         goto drop_and_free;
1621         }
1622         do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1623
1624         /* We don't call tcp_v4_send_synack() directly because we need
1625          * to make sure a child socket can be created successfully before
1626          * sending back synack!
1627          *
1628          * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1629          * (or better yet, call tcp_send_synack() in the child context
1630          * directly, but will have to fix bunch of other code first)
1631          * after syn_recv_sock() except one will need to first fix the
1632          * latter to remove its dependency on the current implementation
1633          * of tcp_v4_send_synack()->tcp_select_initial_window().
1634          */
1635         skb_synack = tcp_make_synack(sk, dst, req,
1636             (struct request_values *)&tmp_ext,
1637             fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1638
1639         if (skb_synack) {
1640                 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1641                 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1642         } else
1643                 goto drop_and_free;
1644
1645         if (likely(!do_fastopen)) {
1646                 int err;
1647                 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1648                      ireq->rmt_addr, ireq->opt);
1649                 err = net_xmit_eval(err);
1650                 if (err || want_cookie)
1651                         goto drop_and_free;
1652
1653                 tcp_rsk(req)->listener = NULL;
1654                 /* Add the request_sock to the SYN table */
1655                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1656                 if (fastopen_cookie_present(&foc) && foc.len != 0)
1657                         NET_INC_STATS_BH(sock_net(sk),
1658                             LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1659         } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1660             (struct request_values *)&tmp_ext))
1661                 goto drop_and_free;
1662
1663         return 0;
1664
1665 drop_and_release:
1666         dst_release(dst);
1667 drop_and_free:
1668         reqsk_free(req);
1669 drop:
1670         return 0;
1671 }
1672 EXPORT_SYMBOL(tcp_v4_conn_request);
1673
1674
1675 /*
1676  * The three way handshake has completed - we got a valid synack -
1677  * now create the new socket.
1678  */
1679 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1680                                   struct request_sock *req,
1681                                   struct dst_entry *dst)
1682 {
1683         struct inet_request_sock *ireq;
1684         struct inet_sock *newinet;
1685         struct tcp_sock *newtp;
1686         struct sock *newsk;
1687 #ifdef CONFIG_TCP_MD5SIG
1688         struct tcp_md5sig_key *key;
1689 #endif
1690         struct ip_options_rcu *inet_opt;
1691
1692         if (sk_acceptq_is_full(sk))
1693                 goto exit_overflow;
1694
1695         newsk = tcp_create_openreq_child(sk, req, skb);
1696         if (!newsk)
1697                 goto exit_nonewsk;
1698
1699         newsk->sk_gso_type = SKB_GSO_TCPV4;
1700         inet_sk_rx_dst_set(newsk, skb);
1701
1702         newtp                 = tcp_sk(newsk);
1703         newinet               = inet_sk(newsk);
1704         ireq                  = inet_rsk(req);
1705         newinet->inet_daddr   = ireq->rmt_addr;
1706         newinet->inet_rcv_saddr = ireq->loc_addr;
1707         newinet->inet_saddr           = ireq->loc_addr;
1708         inet_opt              = ireq->opt;
1709         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1710         ireq->opt             = NULL;
1711         newinet->mc_index     = inet_iif(skb);
1712         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1713         newinet->rcv_tos      = ip_hdr(skb)->tos;
1714         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1715         if (inet_opt)
1716                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1717         newinet->inet_id = newtp->write_seq ^ jiffies;
1718
1719         if (!dst) {
1720                 dst = inet_csk_route_child_sock(sk, newsk, req);
1721                 if (!dst)
1722                         goto put_and_exit;
1723         } else {
1724                 /* syncookie case : see end of cookie_v4_check() */
1725         }
1726         sk_setup_caps(newsk, dst);
1727
1728         tcp_mtup_init(newsk);
1729         tcp_sync_mss(newsk, dst_mtu(dst));
1730         newtp->advmss = dst_metric_advmss(dst);
1731         if (tcp_sk(sk)->rx_opt.user_mss &&
1732             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1733                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1734
1735         tcp_initialize_rcv_mss(newsk);
1736         if (tcp_rsk(req)->snt_synack)
1737                 tcp_valid_rtt_meas(newsk,
1738                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1739         newtp->total_retrans = req->retrans;
1740
1741 #ifdef CONFIG_TCP_MD5SIG
1742         /* Copy over the MD5 key from the original socket */
1743         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1744                                 AF_INET);
1745         if (key != NULL) {
1746                 /*
1747                  * We're using one, so create a matching key
1748                  * on the newsk structure. If we fail to get
1749                  * memory, then we end up not copying the key
1750                  * across. Shucks.
1751                  */
1752                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1753                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1754                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1755         }
1756 #endif
1757
1758         if (__inet_inherit_port(sk, newsk) < 0)
1759                 goto put_and_exit;
1760         __inet_hash_nolisten(newsk, NULL);
1761
1762         return newsk;
1763
1764 exit_overflow:
1765         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1766 exit_nonewsk:
1767         dst_release(dst);
1768 exit:
1769         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1770         return NULL;
1771 put_and_exit:
1772         tcp_clear_xmit_timers(newsk);
1773         tcp_cleanup_congestion_control(newsk);
1774         bh_unlock_sock(newsk);
1775         sock_put(newsk);
1776         goto exit;
1777 }
1778 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1779
1780 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1781 {
1782         struct tcphdr *th = tcp_hdr(skb);
1783         const struct iphdr *iph = ip_hdr(skb);
1784         struct sock *nsk;
1785         struct request_sock **prev;
1786         /* Find possible connection requests. */
1787         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1788                                                        iph->saddr, iph->daddr);
1789         if (req)
1790                 return tcp_check_req(sk, skb, req, prev, false);
1791
1792         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1793                         th->source, iph->daddr, th->dest, inet_iif(skb));
1794
1795         if (nsk) {
1796                 if (nsk->sk_state != TCP_TIME_WAIT) {
1797                         bh_lock_sock(nsk);
1798                         return nsk;
1799                 }
1800                 inet_twsk_put(inet_twsk(nsk));
1801                 return NULL;
1802         }
1803
1804 #ifdef CONFIG_SYN_COOKIES
1805         if (!th->syn)
1806                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1807 #endif
1808         return sk;
1809 }
1810
1811 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1812 {
1813         const struct iphdr *iph = ip_hdr(skb);
1814
1815         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1816                 if (!tcp_v4_check(skb->len, iph->saddr,
1817                                   iph->daddr, skb->csum)) {
1818                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1819                         return 0;
1820                 }
1821         }
1822
1823         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1824                                        skb->len, IPPROTO_TCP, 0);
1825
1826         if (skb->len <= 76) {
1827                 return __skb_checksum_complete(skb);
1828         }
1829         return 0;
1830 }
1831
1832
1833 /* The socket must have it's spinlock held when we get
1834  * here.
1835  *
1836  * We have a potential double-lock case here, so even when
1837  * doing backlog processing we use the BH locking scheme.
1838  * This is because we cannot sleep with the original spinlock
1839  * held.
1840  */
1841 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1842 {
1843         struct sock *rsk;
1844 #ifdef CONFIG_TCP_MD5SIG
1845         /*
1846          * We really want to reject the packet as early as possible
1847          * if:
1848          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1849          *  o There is an MD5 option and we're not expecting one
1850          */
1851         if (tcp_v4_inbound_md5_hash(sk, skb))
1852                 goto discard;
1853 #endif
1854
1855         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1856                 struct dst_entry *dst = sk->sk_rx_dst;
1857
1858                 sock_rps_save_rxhash(sk, skb);
1859                 if (dst) {
1860                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1861                             dst->ops->check(dst, 0) == NULL) {
1862                                 dst_release(dst);
1863                                 sk->sk_rx_dst = NULL;
1864                         }
1865                 }
1866                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1867                         rsk = sk;
1868                         goto reset;
1869                 }
1870                 return 0;
1871         }
1872
1873         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1874                 goto csum_err;
1875
1876         if (sk->sk_state == TCP_LISTEN) {
1877                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1878                 if (!nsk)
1879                         goto discard;
1880
1881                 if (nsk != sk) {
1882                         sock_rps_save_rxhash(nsk, skb);
1883                         if (tcp_child_process(sk, nsk, skb)) {
1884                                 rsk = nsk;
1885                                 goto reset;
1886                         }
1887                         return 0;
1888                 }
1889         } else
1890                 sock_rps_save_rxhash(sk, skb);
1891
1892         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1893                 rsk = sk;
1894                 goto reset;
1895         }
1896         return 0;
1897
1898 reset:
1899         tcp_v4_send_reset(rsk, skb);
1900 discard:
1901         kfree_skb(skb);
1902         /* Be careful here. If this function gets more complicated and
1903          * gcc suffers from register pressure on the x86, sk (in %ebx)
1904          * might be destroyed here. This current version compiles correctly,
1905          * but you have been warned.
1906          */
1907         return 0;
1908
1909 csum_err:
1910         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1911         goto discard;
1912 }
1913 EXPORT_SYMBOL(tcp_v4_do_rcv);
1914
1915 void tcp_v4_early_demux(struct sk_buff *skb)
1916 {
1917         struct net *net = dev_net(skb->dev);
1918         const struct iphdr *iph;
1919         const struct tcphdr *th;
1920         struct sock *sk;
1921
1922         if (skb->pkt_type != PACKET_HOST)
1923                 return;
1924
1925         if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1926                 return;
1927
1928         iph = ip_hdr(skb);
1929         th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1930
1931         if (th->doff < sizeof(struct tcphdr) / 4)
1932                 return;
1933
1934         sk = __inet_lookup_established(net, &tcp_hashinfo,
1935                                        iph->saddr, th->source,
1936                                        iph->daddr, ntohs(th->dest),
1937                                        skb->skb_iif);
1938         if (sk) {
1939                 skb->sk = sk;
1940                 skb->destructor = sock_edemux;
1941                 if (sk->sk_state != TCP_TIME_WAIT) {
1942                         struct dst_entry *dst = sk->sk_rx_dst;
1943
1944                         if (dst)
1945                                 dst = dst_check(dst, 0);
1946                         if (dst &&
1947                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1948                                 skb_dst_set_noref(skb, dst);
1949                 }
1950         }
1951 }
1952
1953 /*
1954  *      From tcp_input.c
1955  */
1956
1957 int tcp_v4_rcv(struct sk_buff *skb)
1958 {
1959         const struct iphdr *iph;
1960         const struct tcphdr *th;
1961         struct sock *sk;
1962         int ret;
1963         struct net *net = dev_net(skb->dev);
1964
1965         if (skb->pkt_type != PACKET_HOST)
1966                 goto discard_it;
1967
1968         /* Count it even if it's bad */
1969         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1970
1971         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1972                 goto discard_it;
1973
1974         th = tcp_hdr(skb);
1975
1976         if (th->doff < sizeof(struct tcphdr) / 4)
1977                 goto bad_packet;
1978         if (!pskb_may_pull(skb, th->doff * 4))
1979                 goto discard_it;
1980
1981         /* An explanation is required here, I think.
1982          * Packet length and doff are validated by header prediction,
1983          * provided case of th->doff==0 is eliminated.
1984          * So, we defer the checks. */
1985         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1986                 goto bad_packet;
1987
1988         th = tcp_hdr(skb);
1989         iph = ip_hdr(skb);
1990         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1991         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1992                                     skb->len - th->doff * 4);
1993         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1994         TCP_SKB_CB(skb)->when    = 0;
1995         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1996         TCP_SKB_CB(skb)->sacked  = 0;
1997
1998         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1999         if (!sk)
2000                 goto no_tcp_socket;
2001
2002 process:
2003         if (sk->sk_state == TCP_TIME_WAIT)
2004                 goto do_time_wait;
2005
2006         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2007                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2008                 goto discard_and_relse;
2009         }
2010
2011         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2012                 goto discard_and_relse;
2013         nf_reset(skb);
2014
2015         if (sk_filter(sk, skb))
2016                 goto discard_and_relse;
2017
2018         skb->dev = NULL;
2019
2020         bh_lock_sock_nested(sk);
2021         ret = 0;
2022         if (!sock_owned_by_user(sk)) {
2023 #ifdef CONFIG_NET_DMA
2024                 struct tcp_sock *tp = tcp_sk(sk);
2025                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2026                         tp->ucopy.dma_chan = net_dma_find_channel();
2027                 if (tp->ucopy.dma_chan)
2028                         ret = tcp_v4_do_rcv(sk, skb);
2029                 else
2030 #endif
2031                 {
2032                         if (!tcp_prequeue(sk, skb))
2033                                 ret = tcp_v4_do_rcv(sk, skb);
2034                 }
2035         } else if (unlikely(sk_add_backlog(sk, skb,
2036                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
2037                 bh_unlock_sock(sk);
2038                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2039                 goto discard_and_relse;
2040         }
2041         bh_unlock_sock(sk);
2042
2043         sock_put(sk);
2044
2045         return ret;
2046
2047 no_tcp_socket:
2048         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2049                 goto discard_it;
2050
2051         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2052 bad_packet:
2053                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2054         } else {
2055                 tcp_v4_send_reset(NULL, skb);
2056         }
2057
2058 discard_it:
2059         /* Discard frame. */
2060         kfree_skb(skb);
2061         return 0;
2062
2063 discard_and_relse:
2064         sock_put(sk);
2065         goto discard_it;
2066
2067 do_time_wait:
2068         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2069                 inet_twsk_put(inet_twsk(sk));
2070                 goto discard_it;
2071         }
2072
2073         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2074                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2075                 inet_twsk_put(inet_twsk(sk));
2076                 goto discard_it;
2077         }
2078         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2079         case TCP_TW_SYN: {
2080                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2081                                                         &tcp_hashinfo,
2082                                                         iph->daddr, th->dest,
2083                                                         inet_iif(skb));
2084                 if (sk2) {
2085                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2086                         inet_twsk_put(inet_twsk(sk));
2087                         sk = sk2;
2088                         goto process;
2089                 }
2090                 /* Fall through to ACK */
2091         }
2092         case TCP_TW_ACK:
2093                 tcp_v4_timewait_ack(sk, skb);
2094                 break;
2095         case TCP_TW_RST:
2096                 goto no_tcp_socket;
2097         case TCP_TW_SUCCESS:;
2098         }
2099         goto discard_it;
2100 }
2101
2102 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2103         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2104         .twsk_unique    = tcp_twsk_unique,
2105         .twsk_destructor= tcp_twsk_destructor,
2106 };
2107
2108 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2109 {
2110         struct dst_entry *dst = skb_dst(skb);
2111
2112         dst_hold(dst);
2113         sk->sk_rx_dst = dst;
2114         inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2115 }
2116 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2117
2118 const struct inet_connection_sock_af_ops ipv4_specific = {
2119         .queue_xmit        = ip_queue_xmit,
2120         .send_check        = tcp_v4_send_check,
2121         .rebuild_header    = inet_sk_rebuild_header,
2122         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2123         .conn_request      = tcp_v4_conn_request,
2124         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2125         .net_header_len    = sizeof(struct iphdr),
2126         .setsockopt        = ip_setsockopt,
2127         .getsockopt        = ip_getsockopt,
2128         .addr2sockaddr     = inet_csk_addr2sockaddr,
2129         .sockaddr_len      = sizeof(struct sockaddr_in),
2130         .bind_conflict     = inet_csk_bind_conflict,
2131 #ifdef CONFIG_COMPAT
2132         .compat_setsockopt = compat_ip_setsockopt,
2133         .compat_getsockopt = compat_ip_getsockopt,
2134 #endif
2135 };
2136 EXPORT_SYMBOL(ipv4_specific);
2137
2138 #ifdef CONFIG_TCP_MD5SIG
2139 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2140         .md5_lookup             = tcp_v4_md5_lookup,
2141         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2142         .md5_parse              = tcp_v4_parse_md5_keys,
2143 };
2144 #endif
2145
2146 /* NOTE: A lot of things set to zero explicitly by call to
2147  *       sk_alloc() so need not be done here.
2148  */
2149 static int tcp_v4_init_sock(struct sock *sk)
2150 {
2151         struct inet_connection_sock *icsk = inet_csk(sk);
2152
2153         tcp_init_sock(sk);
2154
2155         icsk->icsk_af_ops = &ipv4_specific;
2156
2157 #ifdef CONFIG_TCP_MD5SIG
2158         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2159 #endif
2160
2161         return 0;
2162 }
2163
2164 void tcp_v4_destroy_sock(struct sock *sk)
2165 {
2166         struct tcp_sock *tp = tcp_sk(sk);
2167
2168         tcp_clear_xmit_timers(sk);
2169
2170         tcp_cleanup_congestion_control(sk);
2171
2172         /* Cleanup up the write buffer. */
2173         tcp_write_queue_purge(sk);
2174
2175         /* Cleans up our, hopefully empty, out_of_order_queue. */
2176         __skb_queue_purge(&tp->out_of_order_queue);
2177
2178 #ifdef CONFIG_TCP_MD5SIG
2179         /* Clean up the MD5 key list, if any */
2180         if (tp->md5sig_info) {
2181                 tcp_clear_md5_list(sk);
2182                 kfree_rcu(tp->md5sig_info, rcu);
2183                 tp->md5sig_info = NULL;
2184         }
2185 #endif
2186
2187 #ifdef CONFIG_NET_DMA
2188         /* Cleans up our sk_async_wait_queue */
2189         __skb_queue_purge(&sk->sk_async_wait_queue);
2190 #endif
2191
2192         /* Clean prequeue, it must be empty really */
2193         __skb_queue_purge(&tp->ucopy.prequeue);
2194
2195         /* Clean up a referenced TCP bind bucket. */
2196         if (inet_csk(sk)->icsk_bind_hash)
2197                 inet_put_port(sk);
2198
2199         /*
2200          * If sendmsg cached page exists, toss it.
2201          */
2202         if (sk->sk_sndmsg_page) {
2203                 __free_page(sk->sk_sndmsg_page);
2204                 sk->sk_sndmsg_page = NULL;
2205         }
2206
2207         /* TCP Cookie Transactions */
2208         if (tp->cookie_values != NULL) {
2209                 kref_put(&tp->cookie_values->kref,
2210                          tcp_cookie_values_release);
2211                 tp->cookie_values = NULL;
2212         }
2213         BUG_ON(tp->fastopen_rsk != NULL);
2214
2215         /* If socket is aborted during connect operation */
2216         tcp_free_fastopen_req(tp);
2217
2218         sk_sockets_allocated_dec(sk);
2219         sock_release_memcg(sk);
2220 }
2221 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2222
2223 #ifdef CONFIG_PROC_FS
2224 /* Proc filesystem TCP sock list dumping. */
2225
2226 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2227 {
2228         return hlist_nulls_empty(head) ? NULL :
2229                 list_entry(head->first, struct inet_timewait_sock, tw_node);
2230 }
2231
2232 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2233 {
2234         return !is_a_nulls(tw->tw_node.next) ?
2235                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2236 }
2237
2238 /*
2239  * Get next listener socket follow cur.  If cur is NULL, get first socket
2240  * starting from bucket given in st->bucket; when st->bucket is zero the
2241  * very first socket in the hash table is returned.
2242  */
2243 static void *listening_get_next(struct seq_file *seq, void *cur)
2244 {
2245         struct inet_connection_sock *icsk;
2246         struct hlist_nulls_node *node;
2247         struct sock *sk = cur;
2248         struct inet_listen_hashbucket *ilb;
2249         struct tcp_iter_state *st = seq->private;
2250         struct net *net = seq_file_net(seq);
2251
2252         if (!sk) {
2253                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2254                 spin_lock_bh(&ilb->lock);
2255                 sk = sk_nulls_head(&ilb->head);
2256                 st->offset = 0;
2257                 goto get_sk;
2258         }
2259         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2260         ++st->num;
2261         ++st->offset;
2262
2263         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2264                 struct request_sock *req = cur;
2265
2266                 icsk = inet_csk(st->syn_wait_sk);
2267                 req = req->dl_next;
2268                 while (1) {
2269                         while (req) {
2270                                 if (req->rsk_ops->family == st->family) {
2271                                         cur = req;
2272                                         goto out;
2273                                 }
2274                                 req = req->dl_next;
2275                         }
2276                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2277                                 break;
2278 get_req:
2279                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2280                 }
2281                 sk        = sk_nulls_next(st->syn_wait_sk);
2282                 st->state = TCP_SEQ_STATE_LISTENING;
2283                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2284         } else {
2285                 icsk = inet_csk(sk);
2286                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2287                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2288                         goto start_req;
2289                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2290                 sk = sk_nulls_next(sk);
2291         }
2292 get_sk:
2293         sk_nulls_for_each_from(sk, node) {
2294                 if (!net_eq(sock_net(sk), net))
2295                         continue;
2296                 if (sk->sk_family == st->family) {
2297                         cur = sk;
2298                         goto out;
2299                 }
2300                 icsk = inet_csk(sk);
2301                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2302                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2303 start_req:
2304                         st->uid         = sock_i_uid(sk);
2305                         st->syn_wait_sk = sk;
2306                         st->state       = TCP_SEQ_STATE_OPENREQ;
2307                         st->sbucket     = 0;
2308                         goto get_req;
2309                 }
2310                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2311         }
2312         spin_unlock_bh(&ilb->lock);
2313         st->offset = 0;
2314         if (++st->bucket < INET_LHTABLE_SIZE) {
2315                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2316                 spin_lock_bh(&ilb->lock);
2317                 sk = sk_nulls_head(&ilb->head);
2318                 goto get_sk;
2319         }
2320         cur = NULL;
2321 out:
2322         return cur;
2323 }
2324
2325 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2326 {
2327         struct tcp_iter_state *st = seq->private;
2328         void *rc;
2329
2330         st->bucket = 0;
2331         st->offset = 0;
2332         rc = listening_get_next(seq, NULL);
2333
2334         while (rc && *pos) {
2335                 rc = listening_get_next(seq, rc);
2336                 --*pos;
2337         }
2338         return rc;
2339 }
2340
2341 static inline bool empty_bucket(struct tcp_iter_state *st)
2342 {
2343         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2344                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2345 }
2346
2347 /*
2348  * Get first established socket starting from bucket given in st->bucket.
2349  * If st->bucket is zero, the very first socket in the hash is returned.
2350  */
2351 static void *established_get_first(struct seq_file *seq)
2352 {
2353         struct tcp_iter_state *st = seq->private;
2354         struct net *net = seq_file_net(seq);
2355         void *rc = NULL;
2356
2357         st->offset = 0;
2358         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2359                 struct sock *sk;
2360                 struct hlist_nulls_node *node;
2361                 struct inet_timewait_sock *tw;
2362                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2363
2364                 /* Lockless fast path for the common case of empty buckets */
2365                 if (empty_bucket(st))
2366                         continue;
2367
2368                 spin_lock_bh(lock);
2369                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2370                         if (sk->sk_family != st->family ||
2371                             !net_eq(sock_net(sk), net)) {
2372                                 continue;
2373                         }
2374                         rc = sk;
2375                         goto out;
2376                 }
2377                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2378                 inet_twsk_for_each(tw, node,
2379                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2380                         if (tw->tw_family != st->family ||
2381                             !net_eq(twsk_net(tw), net)) {
2382                                 continue;
2383                         }
2384                         rc = tw;
2385                         goto out;
2386                 }
2387                 spin_unlock_bh(lock);
2388                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2389         }
2390 out:
2391         return rc;
2392 }
2393
2394 static void *established_get_next(struct seq_file *seq, void *cur)
2395 {
2396         struct sock *sk = cur;
2397         struct inet_timewait_sock *tw;
2398         struct hlist_nulls_node *node;
2399         struct tcp_iter_state *st = seq->private;
2400         struct net *net = seq_file_net(seq);
2401
2402         ++st->num;
2403         ++st->offset;
2404
2405         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2406                 tw = cur;
2407                 tw = tw_next(tw);
2408 get_tw:
2409                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2410                         tw = tw_next(tw);
2411                 }
2412                 if (tw) {
2413                         cur = tw;
2414                         goto out;
2415                 }
2416                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2417                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2418
2419                 /* Look for next non empty bucket */
2420                 st->offset = 0;
2421                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2422                                 empty_bucket(st))
2423                         ;
2424                 if (st->bucket > tcp_hashinfo.ehash_mask)
2425                         return NULL;
2426
2427                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2428                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2429         } else
2430                 sk = sk_nulls_next(sk);
2431
2432         sk_nulls_for_each_from(sk, node) {
2433                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2434                         goto found;
2435         }
2436
2437         st->state = TCP_SEQ_STATE_TIME_WAIT;
2438         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2439         goto get_tw;
2440 found:
2441         cur = sk;
2442 out:
2443         return cur;
2444 }
2445
2446 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2447 {
2448         struct tcp_iter_state *st = seq->private;
2449         void *rc;
2450
2451         st->bucket = 0;
2452         rc = established_get_first(seq);
2453
2454         while (rc && pos) {
2455                 rc = established_get_next(seq, rc);
2456                 --pos;
2457         }
2458         return rc;
2459 }
2460
2461 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2462 {
2463         void *rc;
2464         struct tcp_iter_state *st = seq->private;
2465
2466         st->state = TCP_SEQ_STATE_LISTENING;
2467         rc        = listening_get_idx(seq, &pos);
2468
2469         if (!rc) {
2470                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2471                 rc        = established_get_idx(seq, pos);
2472         }
2473
2474         return rc;
2475 }
2476
2477 static void *tcp_seek_last_pos(struct seq_file *seq)
2478 {
2479         struct tcp_iter_state *st = seq->private;
2480         int offset = st->offset;
2481         int orig_num = st->num;
2482         void *rc = NULL;
2483
2484         switch (st->state) {
2485         case TCP_SEQ_STATE_OPENREQ:
2486         case TCP_SEQ_STATE_LISTENING:
2487                 if (st->bucket >= INET_LHTABLE_SIZE)
2488                         break;
2489                 st->state = TCP_SEQ_STATE_LISTENING;
2490                 rc = listening_get_next(seq, NULL);
2491                 while (offset-- && rc)
2492                         rc = listening_get_next(seq, rc);
2493                 if (rc)
2494                         break;
2495                 st->bucket = 0;
2496                 /* Fallthrough */
2497         case TCP_SEQ_STATE_ESTABLISHED:
2498         case TCP_SEQ_STATE_TIME_WAIT:
2499                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2500                 if (st->bucket > tcp_hashinfo.ehash_mask)
2501                         break;
2502                 rc = established_get_first(seq);
2503                 while (offset-- && rc)
2504                         rc = established_get_next(seq, rc);
2505         }
2506
2507         st->num = orig_num;
2508
2509         return rc;
2510 }
2511
2512 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2513 {
2514         struct tcp_iter_state *st = seq->private;
2515         void *rc;
2516
2517         if (*pos && *pos == st->last_pos) {
2518                 rc = tcp_seek_last_pos(seq);
2519                 if (rc)
2520                         goto out;
2521         }
2522
2523         st->state = TCP_SEQ_STATE_LISTENING;
2524         st->num = 0;
2525         st->bucket = 0;
2526         st->offset = 0;
2527         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2528
2529 out:
2530         st->last_pos = *pos;
2531         return rc;
2532 }
2533
2534 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2535 {
2536         struct tcp_iter_state *st = seq->private;
2537         void *rc = NULL;
2538
2539         if (v == SEQ_START_TOKEN) {
2540                 rc = tcp_get_idx(seq, 0);
2541                 goto out;
2542         }
2543
2544         switch (st->state) {
2545         case TCP_SEQ_STATE_OPENREQ:
2546         case TCP_SEQ_STATE_LISTENING:
2547                 rc = listening_get_next(seq, v);
2548                 if (!rc) {
2549                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2550                         st->bucket = 0;
2551                         st->offset = 0;
2552                         rc        = established_get_first(seq);
2553                 }
2554                 break;
2555         case TCP_SEQ_STATE_ESTABLISHED:
2556         case TCP_SEQ_STATE_TIME_WAIT:
2557                 rc = established_get_next(seq, v);
2558                 break;
2559         }
2560 out:
2561         ++*pos;
2562         st->last_pos = *pos;
2563         return rc;
2564 }
2565
2566 static void tcp_seq_stop(struct seq_file *seq, void *v)
2567 {
2568         struct tcp_iter_state *st = seq->private;
2569
2570         switch (st->state) {
2571         case TCP_SEQ_STATE_OPENREQ:
2572                 if (v) {
2573                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2574                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2575                 }
2576         case TCP_SEQ_STATE_LISTENING:
2577                 if (v != SEQ_START_TOKEN)
2578                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2579                 break;
2580         case TCP_SEQ_STATE_TIME_WAIT:
2581         case TCP_SEQ_STATE_ESTABLISHED:
2582                 if (v)
2583                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2584                 break;
2585         }
2586 }
2587
2588 int tcp_seq_open(struct inode *inode, struct file *file)
2589 {
2590         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2591         struct tcp_iter_state *s;
2592         int err;
2593
2594         err = seq_open_net(inode, file, &afinfo->seq_ops,
2595                           sizeof(struct tcp_iter_state));
2596         if (err < 0)
2597                 return err;
2598
2599         s = ((struct seq_file *)file->private_data)->private;
2600         s->family               = afinfo->family;
2601         s->last_pos             = 0;
2602         return 0;
2603 }
2604 EXPORT_SYMBOL(tcp_seq_open);
2605
2606 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2607 {
2608         int rc = 0;
2609         struct proc_dir_entry *p;
2610
2611         afinfo->seq_ops.start           = tcp_seq_start;
2612         afinfo->seq_ops.next            = tcp_seq_next;
2613         afinfo->seq_ops.stop            = tcp_seq_stop;
2614
2615         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2616                              afinfo->seq_fops, afinfo);
2617         if (!p)
2618                 rc = -ENOMEM;
2619         return rc;
2620 }
2621 EXPORT_SYMBOL(tcp_proc_register);
2622
2623 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2624 {
2625         proc_net_remove(net, afinfo->name);
2626 }
2627 EXPORT_SYMBOL(tcp_proc_unregister);
2628
2629 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2630                          struct seq_file *f, int i, kuid_t uid, int *len)
2631 {
2632         const struct inet_request_sock *ireq = inet_rsk(req);
2633         long delta = req->expires - jiffies;
2634
2635         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2636                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2637                 i,
2638                 ireq->loc_addr,
2639                 ntohs(inet_sk(sk)->inet_sport),
2640                 ireq->rmt_addr,
2641                 ntohs(ireq->rmt_port),
2642                 TCP_SYN_RECV,
2643                 0, 0, /* could print option size, but that is af dependent. */
2644                 1,    /* timers active (only the expire timer) */
2645                 jiffies_delta_to_clock_t(delta),
2646                 req->retrans,
2647                 from_kuid_munged(seq_user_ns(f), uid),
2648                 0,  /* non standard timer */
2649                 0, /* open_requests have no inode */
2650                 atomic_read(&sk->sk_refcnt),
2651                 req,
2652                 len);
2653 }
2654
2655 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2656 {
2657         int timer_active;
2658         unsigned long timer_expires;
2659         const struct tcp_sock *tp = tcp_sk(sk);
2660         const struct inet_connection_sock *icsk = inet_csk(sk);
2661         const struct inet_sock *inet = inet_sk(sk);
2662         struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2663         __be32 dest = inet->inet_daddr;
2664         __be32 src = inet->inet_rcv_saddr;
2665         __u16 destp = ntohs(inet->inet_dport);
2666         __u16 srcp = ntohs(inet->inet_sport);
2667         int rx_queue;
2668
2669         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2670                 timer_active    = 1;
2671                 timer_expires   = icsk->icsk_timeout;
2672         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2673                 timer_active    = 4;
2674                 timer_expires   = icsk->icsk_timeout;
2675         } else if (timer_pending(&sk->sk_timer)) {
2676                 timer_active    = 2;
2677                 timer_expires   = sk->sk_timer.expires;
2678         } else {
2679                 timer_active    = 0;
2680                 timer_expires = jiffies;
2681         }
2682
2683         if (sk->sk_state == TCP_LISTEN)
2684                 rx_queue = sk->sk_ack_backlog;
2685         else
2686                 /*
2687                  * because we dont lock socket, we might find a transient negative value
2688                  */
2689                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2690
2691         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2692                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2693                 i, src, srcp, dest, destp, sk->sk_state,
2694                 tp->write_seq - tp->snd_una,
2695                 rx_queue,
2696                 timer_active,
2697                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2698                 icsk->icsk_retransmits,
2699                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2700                 icsk->icsk_probes_out,
2701                 sock_i_ino(sk),
2702                 atomic_read(&sk->sk_refcnt), sk,
2703                 jiffies_to_clock_t(icsk->icsk_rto),
2704                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2705                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2706                 tp->snd_cwnd,
2707                 sk->sk_state == TCP_LISTEN ?
2708                     (fastopenq ? fastopenq->max_qlen : 0) :
2709                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2710                 len);
2711 }
2712
2713 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2714                                struct seq_file *f, int i, int *len)
2715 {
2716         __be32 dest, src;
2717         __u16 destp, srcp;
2718         long delta = tw->tw_ttd - jiffies;
2719
2720         dest  = tw->tw_daddr;
2721         src   = tw->tw_rcv_saddr;
2722         destp = ntohs(tw->tw_dport);
2723         srcp  = ntohs(tw->tw_sport);
2724
2725         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2726                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2727                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2728                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2729                 atomic_read(&tw->tw_refcnt), tw, len);
2730 }
2731
2732 #define TMPSZ 150
2733
2734 static int tcp4_seq_show(struct seq_file *seq, void *v)
2735 {
2736         struct tcp_iter_state *st;
2737         int len;
2738
2739         if (v == SEQ_START_TOKEN) {
2740                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2741                            "  sl  local_address rem_address   st tx_queue "
2742                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2743                            "inode");
2744                 goto out;
2745         }
2746         st = seq->private;
2747
2748         switch (st->state) {
2749         case TCP_SEQ_STATE_LISTENING:
2750         case TCP_SEQ_STATE_ESTABLISHED:
2751                 get_tcp4_sock(v, seq, st->num, &len);
2752                 break;
2753         case TCP_SEQ_STATE_OPENREQ:
2754                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2755                 break;
2756         case TCP_SEQ_STATE_TIME_WAIT:
2757                 get_timewait4_sock(v, seq, st->num, &len);
2758                 break;
2759         }
2760         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2761 out:
2762         return 0;
2763 }
2764
2765 static const struct file_operations tcp_afinfo_seq_fops = {
2766         .owner   = THIS_MODULE,
2767         .open    = tcp_seq_open,
2768         .read    = seq_read,
2769         .llseek  = seq_lseek,
2770         .release = seq_release_net
2771 };
2772
2773 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2774         .name           = "tcp",
2775         .family         = AF_INET,
2776         .seq_fops       = &tcp_afinfo_seq_fops,
2777         .seq_ops        = {
2778                 .show           = tcp4_seq_show,
2779         },
2780 };
2781
2782 static int __net_init tcp4_proc_init_net(struct net *net)
2783 {
2784         return tcp_proc_register(net, &tcp4_seq_afinfo);
2785 }
2786
2787 static void __net_exit tcp4_proc_exit_net(struct net *net)
2788 {
2789         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2790 }
2791
2792 static struct pernet_operations tcp4_net_ops = {
2793         .init = tcp4_proc_init_net,
2794         .exit = tcp4_proc_exit_net,
2795 };
2796
2797 int __init tcp4_proc_init(void)
2798 {
2799         return register_pernet_subsys(&tcp4_net_ops);
2800 }
2801
2802 void tcp4_proc_exit(void)
2803 {
2804         unregister_pernet_subsys(&tcp4_net_ops);
2805 }
2806 #endif /* CONFIG_PROC_FS */
2807
2808 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2809 {
2810         const struct iphdr *iph = skb_gro_network_header(skb);
2811
2812         switch (skb->ip_summed) {
2813         case CHECKSUM_COMPLETE:
2814                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2815                                   skb->csum)) {
2816                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2817                         break;
2818                 }
2819
2820                 /* fall through */
2821         case CHECKSUM_NONE:
2822                 NAPI_GRO_CB(skb)->flush = 1;
2823                 return NULL;
2824         }
2825
2826         return tcp_gro_receive(head, skb);
2827 }
2828
2829 int tcp4_gro_complete(struct sk_buff *skb)
2830 {
2831         const struct iphdr *iph = ip_hdr(skb);
2832         struct tcphdr *th = tcp_hdr(skb);
2833
2834         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2835                                   iph->saddr, iph->daddr, 0);
2836         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2837
2838         return tcp_gro_complete(skb);
2839 }
2840
2841 struct proto tcp_prot = {
2842         .name                   = "TCP",
2843         .owner                  = THIS_MODULE,
2844         .close                  = tcp_close,
2845         .connect                = tcp_v4_connect,
2846         .disconnect             = tcp_disconnect,
2847         .accept                 = inet_csk_accept,
2848         .ioctl                  = tcp_ioctl,
2849         .init                   = tcp_v4_init_sock,
2850         .destroy                = tcp_v4_destroy_sock,
2851         .shutdown               = tcp_shutdown,
2852         .setsockopt             = tcp_setsockopt,
2853         .getsockopt             = tcp_getsockopt,
2854         .recvmsg                = tcp_recvmsg,
2855         .sendmsg                = tcp_sendmsg,
2856         .sendpage               = tcp_sendpage,
2857         .backlog_rcv            = tcp_v4_do_rcv,
2858         .release_cb             = tcp_release_cb,
2859         .mtu_reduced            = tcp_v4_mtu_reduced,
2860         .hash                   = inet_hash,
2861         .unhash                 = inet_unhash,
2862         .get_port               = inet_csk_get_port,
2863         .enter_memory_pressure  = tcp_enter_memory_pressure,
2864         .sockets_allocated      = &tcp_sockets_allocated,
2865         .orphan_count           = &tcp_orphan_count,
2866         .memory_allocated       = &tcp_memory_allocated,
2867         .memory_pressure        = &tcp_memory_pressure,
2868         .sysctl_wmem            = sysctl_tcp_wmem,
2869         .sysctl_rmem            = sysctl_tcp_rmem,
2870         .max_header             = MAX_TCP_HEADER,
2871         .obj_size               = sizeof(struct tcp_sock),
2872         .slab_flags             = SLAB_DESTROY_BY_RCU,
2873         .twsk_prot              = &tcp_timewait_sock_ops,
2874         .rsk_prot               = &tcp_request_sock_ops,
2875         .h.hashinfo             = &tcp_hashinfo,
2876         .no_autobind            = true,
2877 #ifdef CONFIG_COMPAT
2878         .compat_setsockopt      = compat_tcp_setsockopt,
2879         .compat_getsockopt      = compat_tcp_getsockopt,
2880 #endif
2881 #ifdef CONFIG_MEMCG_KMEM
2882         .init_cgroup            = tcp_init_cgroup,
2883         .destroy_cgroup         = tcp_destroy_cgroup,
2884         .proto_cgroup           = tcp_proto_cgroup,
2885 #endif
2886 };
2887 EXPORT_SYMBOL(tcp_prot);
2888
2889 static int __net_init tcp_sk_init(struct net *net)
2890 {
2891         return 0;
2892 }
2893
2894 static void __net_exit tcp_sk_exit(struct net *net)
2895 {
2896 }
2897
2898 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2899 {
2900         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2901 }
2902
2903 static struct pernet_operations __net_initdata tcp_sk_ops = {
2904        .init       = tcp_sk_init,
2905        .exit       = tcp_sk_exit,
2906        .exit_batch = tcp_sk_exit_batch,
2907 };
2908
2909 void __init tcp_v4_init(void)
2910 {
2911         inet_hashinfo_init(&tcp_hashinfo);
2912         if (register_pernet_subsys(&tcp_sk_ops))
2913                 panic("Failed to create the TCP control socket.\n");
2914 }