net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53 #define pr_fmt(fmt) "TCP: " fmt
  54
  55 #include <linux/bottom_half.h>
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64 #include <linux/slab.h>
  65
  66 #include <net/net_namespace.h>
  67 #include <net/icmp.h>
  68 #include <net/inet_hashtables.h>
  69 #include <net/tcp.h>
  70 #include <net/transp_v6.h>
  71 #include <net/ipv6.h>
  72 #include <net/inet_common.h>
  73 #include <net/timewait_sock.h>
  74 #include <net/xfrm.h>
  75 #include <net/netdma.h>
  76 #include <net/secure_seq.h>
  77 #include <net/tcp_memcontrol.h>
  78
  79 #include <linux/inet.h>
  80 #include <linux/ipv6.h>
  81 #include <linux/stddef.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/seq_file.h>
  84
  85 #include <linux/crypto.h>
  86 #include <linux/scatterlist.h>
  87
  88 int sysctl_tcp_tw_reuse __read_mostly;
  89 int sysctl_tcp_low_latency __read_mostly;
  90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92
  93 #ifdef CONFIG_TCP_MD5SIG
  94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  95                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
  96 #endif
  97
  98 struct inet_hashinfo tcp_hashinfo;
  99 EXPORT_SYMBOL(tcp_hashinfo);
 100
 101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 102 {
 103         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 104                                           ip_hdr(skb)->saddr,
 105                                           tcp_hdr(skb)->dest,
 106                                           tcp_hdr(skb)->source);
 107 }
 108
 109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110 {
 111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 112         struct tcp_sock *tp = tcp_sk(sk);
 113
 114         /* With PAWS, it is safe from the viewpoint
 115            of data integrity. Even without PAWS it is safe provided sequence
 116            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 117
 118            Actually, the idea is close to VJ's one, only timestamp cache is
 119            held not per host, but per port pair and TW bucket is used as state
 120            holder.
 121
 122            If TW bucket has been already destroyed we fall back to VJ's scheme
 123            and use initial timestamp retrieved from peer table.
 124          */
 125         if (tcptw->tw_ts_recent_stamp &&
 126             (twp == NULL || (sysctl_tcp_tw_reuse &&
 127                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 128                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 129                 if (tp->write_seq == 0)
 130                         tp->write_seq = 1;
 131                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 132                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 133                 sock_hold(sktw);
 134                 return 1;
 135         }
 136
 137         return 0;
 138 }
 139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 140
 141 /* This will initiate an outgoing connection. */
 142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 143 {
 144         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 145         struct inet_sock *inet = inet_sk(sk);
 146         struct tcp_sock *tp = tcp_sk(sk);
 147         __be16 orig_sport, orig_dport;
 148         __be32 daddr, nexthop;
 149         struct flowi4 *fl4;
 150         struct rtable *rt;
 151         int err;
 152         struct ip_options_rcu *inet_opt;
 153
 154         if (addr_len < sizeof(struct sockaddr_in))
 155                 return -EINVAL;
 156
 157         if (usin->sin_family != AF_INET)
 158                 return -EAFNOSUPPORT;
 159
 160         nexthop = daddr = usin->sin_addr.s_addr;
 161         inet_opt = rcu_dereference_protected(inet->inet_opt,
 162                                              sock_owned_by_user(sk));
 163         if (inet_opt && inet_opt->opt.srr) {
 164                 if (!daddr)
 165                         return -EINVAL;
 166                 nexthop = inet_opt->opt.faddr;
 167         }
 168
 169         orig_sport = inet->inet_sport;
 170         orig_dport = usin->sin_port;
 171         fl4 = &inet->cork.fl.u.ip4;
 172         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 173                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 174                               IPPROTO_TCP,
 175                               orig_sport, orig_dport, sk, true);
 176         if (IS_ERR(rt)) {
 177                 err = PTR_ERR(rt);
 178                 if (err == -ENETUNREACH)
 179                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 180                 return err;
 181         }
 182
 183         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 184                 ip_rt_put(rt);
 185                 return -ENETUNREACH;
 186         }
 187
 188         if (!inet_opt || !inet_opt->opt.srr)
 189                 daddr = fl4->daddr;
 190
 191         if (!inet->inet_saddr)
 192                 inet->inet_saddr = fl4->saddr;
 193         inet->inet_rcv_saddr = inet->inet_saddr;
 194
 195         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 196                 /* Reset inherited state */
 197                 tp->rx_opt.ts_recent       = 0;
 198                 tp->rx_opt.ts_recent_stamp = 0;
 199                 tp->write_seq              = 0;
 200         }
 201
 202         if (tcp_death_row.sysctl_tw_recycle &&
 203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
 204                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
 205                 /*
 206                  * VJ's idea. We save last timestamp seen from
 207                  * the destination in peer table, when entering state
 208                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 209                  * when trying new connection.
 210                  */
 211                 if (peer) {
 212                         inet_peer_refcheck(peer);
 213                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 214                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 215                                 tp->rx_opt.ts_recent = peer->tcp_ts;
 216                         }
 217                 }
 218         }
 219
 220         inet->inet_dport = usin->sin_port;
 221         inet->inet_daddr = daddr;
 222
 223         inet_csk(sk)->icsk_ext_hdr_len = 0;
 224         if (inet_opt)
 225                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 226
 227         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 228
 229         /* Socket identity is still unknown (sport may be zero).
 230          * However we set state to SYN-SENT and not releasing socket
 231          * lock select source port, enter ourselves into the hash tables and
 232          * complete initialization after this.
 233          */
 234         tcp_set_state(sk, TCP_SYN_SENT);
 235         err = inet_hash_connect(&tcp_death_row, sk);
 236         if (err)
 237                 goto failure;
 238
 239         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 240                                inet->inet_sport, inet->inet_dport, sk);
 241         if (IS_ERR(rt)) {
 242                 err = PTR_ERR(rt);
 243                 rt = NULL;
 244                 goto failure;
 245         }
 246         /* OK, now commit destination to socket.  */
 247         sk->sk_gso_type = SKB_GSO_TCPV4;
 248         sk_setup_caps(sk, &rt->dst);
 249
 250         if (!tp->write_seq)
 251                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 252                                                            inet->inet_daddr,
 253                                                            inet->inet_sport,
 254                                                            usin->sin_port);
 255
 256         inet->inet_id = tp->write_seq ^ jiffies;
 257
 258         err = tcp_connect(sk);
 259         rt = NULL;
 260         if (err)
 261                 goto failure;
 262
 263         return 0;
 264
 265 failure:
 266         /*
 267          * This unhashes the socket and releases the local port,
 268          * if necessary.
 269          */
 270         tcp_set_state(sk, TCP_CLOSE);
 271         ip_rt_put(rt);
 272         sk->sk_route_caps = 0;
 273         inet->inet_dport = 0;
 274         return err;
 275 }
 276 EXPORT_SYMBOL(tcp_v4_connect);
 277
 278 /*
 279  * This routine does path mtu discovery as defined in RFC1191.
 280  */
 281 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 282 {
 283         struct dst_entry *dst;
 284         struct inet_sock *inet = inet_sk(sk);
 285
 286         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 287          * send out by Linux are always <576bytes so they should go through
 288          * unfragmented).
 289          */
 290         if (sk->sk_state == TCP_LISTEN)
 291                 return;
 292
 293         /* We don't check in the destentry if pmtu discovery is forbidden
 294          * on this route. We just assume that no packet_to_big packets
 295          * are send back when pmtu discovery is not active.
 296          * There is a small race when the user changes this flag in the
 297          * route, but I think that's acceptable.
 298          */
 299         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 300                 return;
 301
 302         dst->ops->update_pmtu(dst, mtu);
 303
 304         /* Something is about to be wrong... Remember soft error
 305          * for the case, if this connection will not able to recover.
 306          */
 307         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 308                 sk->sk_err_soft = EMSGSIZE;
 309
 310         mtu = dst_mtu(dst);
 311
 312         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 313             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 314                 tcp_sync_mss(sk, mtu);
 315
 316                 /* Resend the TCP packet because it's
 317                  * clear that the old packet has been
 318                  * dropped. This is the new "fast" path mtu
 319                  * discovery.
 320                  */
 321                 tcp_simple_retransmit(sk);
 322         } /* else let the usual retransmit timer handle it */
 323 }
 324
 325 /*
 326  * This routine is called by the ICMP module when it gets some
 327  * sort of error condition.  If err < 0 then the socket should
 328  * be closed and the error returned to the user.  If err > 0
 329  * it's just the icmp type << 8 | icmp code.  After adjustment
 330  * header points to the first 8 bytes of the tcp header.  We need
 331  * to find the appropriate port.
 332  *
 333  * The locking strategy used here is very "optimistic". When
 334  * someone else accesses the socket the ICMP is just dropped
 335  * and for some paths there is no check at all.
 336  * A more general error queue to queue errors for later handling
 337  * is probably better.
 338  *
 339  */
 340
 341 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 342 {
 343         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 344         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 345         struct inet_connection_sock *icsk;
 346         struct tcp_sock *tp;
 347         struct inet_sock *inet;
 348         const int type = icmp_hdr(icmp_skb)->type;
 349         const int code = icmp_hdr(icmp_skb)->code;
 350         struct sock *sk;
 351         struct sk_buff *skb;
 352         __u32 seq;
 353         __u32 remaining;
 354         int err;
 355         struct net *net = dev_net(icmp_skb->dev);
 356
 357         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 358                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 359                 return;
 360         }
 361
 362         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 363                         iph->saddr, th->source, inet_iif(icmp_skb));
 364         if (!sk) {
 365                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 366                 return;
 367         }
 368         if (sk->sk_state == TCP_TIME_WAIT) {
 369                 inet_twsk_put(inet_twsk(sk));
 370                 return;
 371         }
 372
 373         bh_lock_sock(sk);
 374         /* If too many ICMPs get dropped on busy
 375          * servers this needs to be solved differently.
 376          */
 377         if (sock_owned_by_user(sk))
 378                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 379
 380         if (sk->sk_state == TCP_CLOSE)
 381                 goto out;
 382
 383         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 384                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 385                 goto out;
 386         }
 387
 388         icsk = inet_csk(sk);
 389         tp = tcp_sk(sk);
 390         seq = ntohl(th->seq);
 391         if (sk->sk_state != TCP_LISTEN &&
 392             !between(seq, tp->snd_una, tp->snd_nxt)) {
 393                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 394                 goto out;
 395         }
 396
 397         switch (type) {
 398         case ICMP_SOURCE_QUENCH:
 399                 /* Just silently ignore these. */
 400                 goto out;
 401         case ICMP_PARAMETERPROB:
 402                 err = EPROTO;
 403                 break;
 404         case ICMP_DEST_UNREACH:
 405                 if (code > NR_ICMP_UNREACH)
 406                         goto out;
 407
 408                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 409                         if (!sock_owned_by_user(sk))
 410                                 do_pmtu_discovery(sk, iph, info);
 411                         goto out;
 412                 }
 413
 414                 err = icmp_err_convert[code].errno;
 415                 /* check if icmp_skb allows revert of backoff
 416                  * (see draft-zimmermann-tcp-lcd) */
 417                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 418                         break;
 419                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 420                     !icsk->icsk_backoff)
 421                         break;
 422
 423                 if (sock_owned_by_user(sk))
 424                         break;
 425
 426                 icsk->icsk_backoff--;
 427                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
 428                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 429                 tcp_bound_rto(sk);
 430
 431                 skb = tcp_write_queue_head(sk);
 432                 BUG_ON(!skb);
 433
 434                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 435                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 436
 437                 if (remaining) {
 438                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 439                                                   remaining, TCP_RTO_MAX);
 440                 } else {
 441                         /* RTO revert clocked out retransmission.
 442                          * Will retransmit now */
 443                         tcp_retransmit_timer(sk);
 444                 }
 445
 446                 break;
 447         case ICMP_TIME_EXCEEDED:
 448                 err = EHOSTUNREACH;
 449                 break;
 450         default:
 451                 goto out;
 452         }
 453
 454         switch (sk->sk_state) {
 455                 struct request_sock *req, **prev;
 456         case TCP_LISTEN:
 457                 if (sock_owned_by_user(sk))
 458                         goto out;
 459
 460                 req = inet_csk_search_req(sk, &prev, th->dest,
 461                                           iph->daddr, iph->saddr);
 462                 if (!req)
 463                         goto out;
 464
 465                 /* ICMPs are not backlogged, hence we cannot get
 466                    an established socket here.
 467                  */
 468                 WARN_ON(req->sk);
 469
 470                 if (seq != tcp_rsk(req)->snt_isn) {
 471                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 472                         goto out;
 473                 }
 474
 475                 /*
 476                  * Still in SYN_RECV, just remove it silently.
 477                  * There is no good way to pass the error to the newly
 478                  * created socket, and POSIX does not want network
 479                  * errors returned from accept().
 480                  */
 481                 inet_csk_reqsk_queue_drop(sk, req, prev);
 482                 goto out;
 483
 484         case TCP_SYN_SENT:
 485         case TCP_SYN_RECV:  /* Cannot happen.
 486                                It can f.e. if SYNs crossed.
 487                              */
 488                 if (!sock_owned_by_user(sk)) {
 489                         sk->sk_err = err;
 490
 491                         sk->sk_error_report(sk);
 492
 493                         tcp_done(sk);
 494                 } else {
 495                         sk->sk_err_soft = err;
 496                 }
 497                 goto out;
 498         }
 499
 500         /* If we've already connected we will keep trying
 501          * until we time out, or the user gives up.
 502          *
 503          * rfc1122 4.2.3.9 allows to consider as hard errors
 504          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 505          * but it is obsoleted by pmtu discovery).
 506          *
 507          * Note, that in modern internet, where routing is unreliable
 508          * and in each dark corner broken firewalls sit, sending random
 509          * errors ordered by their masters even this two messages finally lose
 510          * their original sense (even Linux sends invalid PORT_UNREACHs)
 511          *
 512          * Now we are in compliance with RFCs.
 513          *                                                      --ANK (980905)
 514          */
 515
 516         inet = inet_sk(sk);
 517         if (!sock_owned_by_user(sk) && inet->recverr) {
 518                 sk->sk_err = err;
 519                 sk->sk_error_report(sk);
 520         } else  { /* Only an error on timeout */
 521                 sk->sk_err_soft = err;
 522         }
 523
 524 out:
 525         bh_unlock_sock(sk);
 526         sock_put(sk);
 527 }
 528
 529 static void __tcp_v4_send_check(struct sk_buff *skb,
 530                                 __be32 saddr, __be32 daddr)
 531 {
 532         struct tcphdr *th = tcp_hdr(skb);
 533
 534         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 535                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 536                 skb->csum_start = skb_transport_header(skb) - skb->head;
 537                 skb->csum_offset = offsetof(struct tcphdr, check);
 538         } else {
 539                 th->check = tcp_v4_check(skb->len, saddr, daddr,
 540                                          csum_partial(th,
 541                                                       th->doff << 2,
 542                                                       skb->csum));
 543         }
 544 }
 545
 546 /* This routine computes an IPv4 TCP checksum. */
 547 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 548 {
 549         const struct inet_sock *inet = inet_sk(sk);
 550
 551         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 552 }
 553 EXPORT_SYMBOL(tcp_v4_send_check);
 554
 555 int tcp_v4_gso_send_check(struct sk_buff *skb)
 556 {
 557         const struct iphdr *iph;
 558         struct tcphdr *th;
 559
 560         if (!pskb_may_pull(skb, sizeof(*th)))
 561                 return -EINVAL;
 562
 563         iph = ip_hdr(skb);
 564         th = tcp_hdr(skb);
 565
 566         th->check = 0;
 567         skb->ip_summed = CHECKSUM_PARTIAL;
 568         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 569         return 0;
 570 }
 571
 572 /*
 573  *      This routine will send an RST to the other tcp.
 574  *
 575  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 576  *                    for reset.
 577  *      Answer: if a packet caused RST, it is not for a socket
 578  *              existing in our system, if it is matched to a socket,
 579  *              it is just duplicate segment or bug in other side's TCP.
 580  *              So that we build reply only basing on parameters
 581  *              arrived with segment.
 582  *      Exception: precedence violation. We do not implement it in any case.
 583  */
 584
 585 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 586 {
 587         const struct tcphdr *th = tcp_hdr(skb);
 588         struct {
 589                 struct tcphdr th;
 590 #ifdef CONFIG_TCP_MD5SIG
 591                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 592 #endif
 593         } rep;
 594         struct ip_reply_arg arg;
 595 #ifdef CONFIG_TCP_MD5SIG
 596         struct tcp_md5sig_key *key;
 597         const __u8 *hash_location = NULL;
 598         unsigned char newhash[16];
 599         int genhash;
 600         struct sock *sk1 = NULL;
 601 #endif
 602         struct net *net;
 603
 604         /* Never send a reset in response to a reset. */
 605         if (th->rst)
 606                 return;
 607
 608         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 609                 return;
 610
 611         /* Swap the send and the receive. */
 612         memset(&rep, 0, sizeof(rep));
 613         rep.th.dest   = th->source;
 614         rep.th.source = th->dest;
 615         rep.th.doff   = sizeof(struct tcphdr) / 4;
 616         rep.th.rst    = 1;
 617
 618         if (th->ack) {
 619                 rep.th.seq = th->ack_seq;
 620         } else {
 621                 rep.th.ack = 1;
 622                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 623                                        skb->len - (th->doff << 2));
 624         }
 625
 626         memset(&arg, 0, sizeof(arg));
 627         arg.iov[0].iov_base = (unsigned char *)&rep;
 628         arg.iov[0].iov_len  = sizeof(rep.th);
 629
 630 #ifdef CONFIG_TCP_MD5SIG
 631         hash_location = tcp_parse_md5sig_option(th);
 632         if (!sk && hash_location) {
 633                 /*
 634                  * active side is lost. Try to find listening socket through
 635                  * source port, and then find md5 key through listening socket.
 636                  * we are not loose security here:
 637                  * Incoming packet is checked with md5 hash with finding key,
 638                  * no RST generated if md5 hash doesn't match.
 639                  */
 640                 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 641                                              &tcp_hashinfo, ip_hdr(skb)->daddr,
 642                                              ntohs(th->source), inet_iif(skb));
 643                 /* don't send rst if it can't find key */
 644                 if (!sk1)
 645                         return;
 646                 rcu_read_lock();
 647                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 648                                         &ip_hdr(skb)->saddr, AF_INET);
 649                 if (!key)
 650                         goto release_sk1;
 651
 652                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 653                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
 654                         goto release_sk1;
 655         } else {
 656                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 657                                              &ip_hdr(skb)->saddr,
 658                                              AF_INET) : NULL;
 659         }
 660
 661         if (key) {
 662                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 663                                    (TCPOPT_NOP << 16) |
 664                                    (TCPOPT_MD5SIG << 8) |
 665                                    TCPOLEN_MD5SIG);
 666                 /* Update length and the length the header thinks exists */
 667                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 668                 rep.th.doff = arg.iov[0].iov_len / 4;
 669
 670                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 671                                      key, ip_hdr(skb)->saddr,
 672                                      ip_hdr(skb)->daddr, &rep.th);
 673         }
 674 #endif
 675         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 676                                       ip_hdr(skb)->saddr, /* XXX */
 677                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 678         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 679         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 680         /* When socket is gone, all binding information is lost.
 681          * routing might fail in this case. using iif for oif to
 682          * make sure we can deliver it
 683          */
 684         arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
 685
 686         net = dev_net(skb_dst(skb)->dev);
 687         arg.tos = ip_hdr(skb)->tos;
 688         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 689                       &arg, arg.iov[0].iov_len);
 690
 691         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 692         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 693
 694 #ifdef CONFIG_TCP_MD5SIG
 695 release_sk1:
 696         if (sk1) {
 697                 rcu_read_unlock();
 698                 sock_put(sk1);
 699         }
 700 #endif
 701 }
 702
 703 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 704    outside socket context is ugly, certainly. What can I do?
 705  */
 706
 707 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 708                             u32 win, u32 ts, int oif,
 709                             struct tcp_md5sig_key *key,
 710                             int reply_flags, u8 tos)
 711 {
 712         const struct tcphdr *th = tcp_hdr(skb);
 713         struct {
 714                 struct tcphdr th;
 715                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 716 #ifdef CONFIG_TCP_MD5SIG
 717                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 718 #endif
 719                         ];
 720         } rep;
 721         struct ip_reply_arg arg;
 722         struct net *net = dev_net(skb_dst(skb)->dev);
 723
 724         memset(&rep.th, 0, sizeof(struct tcphdr));
 725         memset(&arg, 0, sizeof(arg));
 726
 727         arg.iov[0].iov_base = (unsigned char *)&rep;
 728         arg.iov[0].iov_len  = sizeof(rep.th);
 729         if (ts) {
 730                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 731                                    (TCPOPT_TIMESTAMP << 8) |
 732                                    TCPOLEN_TIMESTAMP);
 733                 rep.opt[1] = htonl(tcp_time_stamp);
 734                 rep.opt[2] = htonl(ts);
 735                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 736         }
 737
 738         /* Swap the send and the receive. */
 739         rep.th.dest    = th->source;
 740         rep.th.source  = th->dest;
 741         rep.th.doff    = arg.iov[0].iov_len / 4;
 742         rep.th.seq     = htonl(seq);
 743         rep.th.ack_seq = htonl(ack);
 744         rep.th.ack     = 1;
 745         rep.th.window  = htons(win);
 746
 747 #ifdef CONFIG_TCP_MD5SIG
 748         if (key) {
 749                 int offset = (ts) ? 3 : 0;
 750
 751                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 752                                           (TCPOPT_NOP << 16) |
 753                                           (TCPOPT_MD5SIG << 8) |
 754                                           TCPOLEN_MD5SIG);
 755                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 756                 rep.th.doff = arg.iov[0].iov_len/4;
 757
 758                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 759                                     key, ip_hdr(skb)->saddr,
 760                                     ip_hdr(skb)->daddr, &rep.th);
 761         }
 762 #endif
 763         arg.flags = reply_flags;
 764         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 765                                       ip_hdr(skb)->saddr, /* XXX */
 766                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 767         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 768         if (oif)
 769                 arg.bound_dev_if = oif;
 770         arg.tos = tos;
 771         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
 772                       &arg, arg.iov[0].iov_len);
 773
 774         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 775 }
 776
 777 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 778 {
 779         struct inet_timewait_sock *tw = inet_twsk(sk);
 780         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 781
 782         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 783                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 784                         tcptw->tw_ts_recent,
 785                         tw->tw_bound_dev_if,
 786                         tcp_twsk_md5_key(tcptw),
 787                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 788                         tw->tw_tos
 789                         );
 790
 791         inet_twsk_put(tw);
 792 }
 793
 794 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 795                                   struct request_sock *req)
 796 {
 797         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 798                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 799                         req->ts_recent,
 800                         0,
 801                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 802                                           AF_INET),
 803                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 804                         ip_hdr(skb)->tos);
 805 }
 806
 807 /*
 808  *      Send a SYN-ACK after having received a SYN.
 809  *      This still operates on a request_sock only, not on a big
 810  *      socket.
 811  */
 812 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 813                               struct request_sock *req,
 814                               struct request_values *rvp)
 815 {
 816         const struct inet_request_sock *ireq = inet_rsk(req);
 817         struct flowi4 fl4;
 818         int err = -1;
 819         struct sk_buff * skb;
 820
 821         /* First, grab a route. */
 822         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 823                 return -1;
 824
 825         skb = tcp_make_synack(sk, dst, req, rvp);
 826
 827         if (skb) {
 828                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 829
 830                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 831                                             ireq->rmt_addr,
 832                                             ireq->opt);
 833                 err = net_xmit_eval(err);
 834         }
 835
 836         dst_release(dst);
 837         return err;
 838 }
 839
 840 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 841                               struct request_values *rvp)
 842 {
 843         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 844         return tcp_v4_send_synack(sk, NULL, req, rvp);
 845 }
 846
 847 /*
 848  *      IPv4 request_sock destructor.
 849  */
 850 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 851 {
 852         kfree(inet_rsk(req)->opt);
 853 }
 854
 855 /*
 856  * Return 1 if a syncookie should be sent
 857  */
 858 int tcp_syn_flood_action(struct sock *sk,
 859                          const struct sk_buff *skb,
 860                          const char *proto)
 861 {
 862         const char *msg = "Dropping request";
 863         int want_cookie = 0;
 864         struct listen_sock *lopt;
 865
 866
 867
 868 #ifdef CONFIG_SYN_COOKIES
 869         if (sysctl_tcp_syncookies) {
 870                 msg = "Sending cookies";
 871                 want_cookie = 1;
 872                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 873         } else
 874 #endif
 875                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 876
 877         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 878         if (!lopt->synflood_warned) {
 879                 lopt->synflood_warned = 1;
 880                 pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 881                         proto, ntohs(tcp_hdr(skb)->dest), msg);
 882         }
 883         return want_cookie;
 884 }
 885 EXPORT_SYMBOL(tcp_syn_flood_action);
 886
 887 /*
 888  * Save and compile IPv4 options into the request_sock if needed.
 889  */
 890 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
 891                                                   struct sk_buff *skb)
 892 {
 893         const struct ip_options *opt = &(IPCB(skb)->opt);
 894         struct ip_options_rcu *dopt = NULL;
 895
 896         if (opt && opt->optlen) {
 897                 int opt_size = sizeof(*dopt) + opt->optlen;
 898
 899                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 900                 if (dopt) {
 901                         if (ip_options_echo(&dopt->opt, skb)) {
 902                                 kfree(dopt);
 903                                 dopt = NULL;
 904                         }
 905                 }
 906         }
 907         return dopt;
 908 }
 909
 910 #ifdef CONFIG_TCP_MD5SIG
 911 /*
 912  * RFC2385 MD5 checksumming requires a mapping of
 913  * IP address->MD5 Key.
 914  * We need to maintain these in the sk structure.
 915  */
 916
 917 /* Find the Key structure for an address.  */
 918 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 919                                          const union tcp_md5_addr *addr,
 920                                          int family)
 921 {
 922         struct tcp_sock *tp = tcp_sk(sk);
 923         struct tcp_md5sig_key *key;
 924         struct hlist_node *pos;
 925         unsigned int size = sizeof(struct in_addr);
 926         struct tcp_md5sig_info *md5sig;
 927
 928         /* caller either holds rcu_read_lock() or socket lock */
 929         md5sig = rcu_dereference_check(tp->md5sig_info,
 930                                        sock_owned_by_user(sk) ||
 931                                        lockdep_is_held(&sk->sk_lock.slock));
 932         if (!md5sig)
 933                 return NULL;
 934 #if IS_ENABLED(CONFIG_IPV6)
 935         if (family == AF_INET6)
 936                 size = sizeof(struct in6_addr);
 937 #endif
 938         hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
 939                 if (key->family != family)
 940                         continue;
 941                 if (!memcmp(&key->addr, addr, size))
 942                         return key;
 943         }
 944         return NULL;
 945 }
 946 EXPORT_SYMBOL(tcp_md5_do_lookup);
 947
 948 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 949                                          struct sock *addr_sk)
 950 {
 951         union tcp_md5_addr *addr;
 952
 953         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 954         return tcp_md5_do_lookup(sk, addr, AF_INET);
 955 }
 956 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 957
 958 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 959                                                       struct request_sock *req)
 960 {
 961         union tcp_md5_addr *addr;
 962
 963         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
 964         return tcp_md5_do_lookup(sk, addr, AF_INET);
 965 }
 966
 967 /* This can be called on a newly created socket, from other files */
 968 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 969                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 970 {
 971         /* Add Key to the list */
 972         struct tcp_md5sig_key *key;
 973         struct tcp_sock *tp = tcp_sk(sk);
 974         struct tcp_md5sig_info *md5sig;
 975
 976         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
 977         if (key) {
 978                 /* Pre-existing entry - just update that one. */
 979                 memcpy(key->key, newkey, newkeylen);
 980                 key->keylen = newkeylen;
 981                 return 0;
 982         }
 983
 984         md5sig = rcu_dereference_protected(tp->md5sig_info,
 985                                            sock_owned_by_user(sk));
 986         if (!md5sig) {
 987                 md5sig = kmalloc(sizeof(*md5sig), gfp);
 988                 if (!md5sig)
 989                         return -ENOMEM;
 990
 991                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 992                 INIT_HLIST_HEAD(&md5sig->head);
 993                 rcu_assign_pointer(tp->md5sig_info, md5sig);
 994         }
 995
 996         key = sock_kmalloc(sk, sizeof(*key), gfp);
 997         if (!key)
 998                 return -ENOMEM;
 999         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1000                 sock_kfree_s(sk, key, sizeof(*key));
1001                 return -ENOMEM;
1002         }
1003
1004         memcpy(key->key, newkey, newkeylen);
1005         key->keylen = newkeylen;
1006         key->family = family;
1007         memcpy(&key->addr, addr,
1008                (family == AF_INET6) ? sizeof(struct in6_addr) :
1009                                       sizeof(struct in_addr));
1010         hlist_add_head_rcu(&key->node, &md5sig->head);
1011         return 0;
1012 }
1013 EXPORT_SYMBOL(tcp_md5_do_add);
1014
1015 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1016 {
1017         struct tcp_sock *tp = tcp_sk(sk);
1018         struct tcp_md5sig_key *key;
1019         struct tcp_md5sig_info *md5sig;
1020
1021         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1022         if (!key)
1023                 return -ENOENT;
1024         hlist_del_rcu(&key->node);
1025         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1026         kfree_rcu(key, rcu);
1027         md5sig = rcu_dereference_protected(tp->md5sig_info,
1028                                            sock_owned_by_user(sk));
1029         if (hlist_empty(&md5sig->head))
1030                 tcp_free_md5sig_pool();
1031         return 0;
1032 }
1033 EXPORT_SYMBOL(tcp_md5_do_del);
1034
1035 void tcp_clear_md5_list(struct sock *sk)
1036 {
1037         struct tcp_sock *tp = tcp_sk(sk);
1038         struct tcp_md5sig_key *key;
1039         struct hlist_node *pos, *n;
1040         struct tcp_md5sig_info *md5sig;
1041
1042         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1043
1044         if (!hlist_empty(&md5sig->head))
1045                 tcp_free_md5sig_pool();
1046         hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1047                 hlist_del_rcu(&key->node);
1048                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1049                 kfree_rcu(key, rcu);
1050         }
1051 }
1052
1053 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1054                                  int optlen)
1055 {
1056         struct tcp_md5sig cmd;
1057         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1058
1059         if (optlen < sizeof(cmd))
1060                 return -EINVAL;
1061
1062         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1063                 return -EFAULT;
1064
1065         if (sin->sin_family != AF_INET)
1066                 return -EINVAL;
1067
1068         if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1069                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1070                                       AF_INET);
1071
1072         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1073                 return -EINVAL;
1074
1075         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1076                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1077                               GFP_KERNEL);
1078 }
1079
1080 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1081                                         __be32 daddr, __be32 saddr, int nbytes)
1082 {
1083         struct tcp4_pseudohdr *bp;
1084         struct scatterlist sg;
1085
1086         bp = &hp->md5_blk.ip4;
1087
1088         /*
1089          * 1. the TCP pseudo-header (in the order: source IP address,
1090          * destination IP address, zero-padded protocol number, and
1091          * segment length)
1092          */
1093         bp->saddr = saddr;
1094         bp->daddr = daddr;
1095         bp->pad = 0;
1096         bp->protocol = IPPROTO_TCP;
1097         bp->len = cpu_to_be16(nbytes);
1098
1099         sg_init_one(&sg, bp, sizeof(*bp));
1100         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1101 }
1102
1103 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1104                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1105 {
1106         struct tcp_md5sig_pool *hp;
1107         struct hash_desc *desc;
1108
1109         hp = tcp_get_md5sig_pool();
1110         if (!hp)
1111                 goto clear_hash_noput;
1112         desc = &hp->md5_desc;
1113
1114         if (crypto_hash_init(desc))
1115                 goto clear_hash;
1116         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1117                 goto clear_hash;
1118         if (tcp_md5_hash_header(hp, th))
1119                 goto clear_hash;
1120         if (tcp_md5_hash_key(hp, key))
1121                 goto clear_hash;
1122         if (crypto_hash_final(desc, md5_hash))
1123                 goto clear_hash;
1124
1125         tcp_put_md5sig_pool();
1126         return 0;
1127
1128 clear_hash:
1129         tcp_put_md5sig_pool();
1130 clear_hash_noput:
1131         memset(md5_hash, 0, 16);
1132         return 1;
1133 }
1134
1135 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1136                         const struct sock *sk, const struct request_sock *req,
1137                         const struct sk_buff *skb)
1138 {
1139         struct tcp_md5sig_pool *hp;
1140         struct hash_desc *desc;
1141         const struct tcphdr *th = tcp_hdr(skb);
1142         __be32 saddr, daddr;
1143
1144         if (sk) {
1145                 saddr = inet_sk(sk)->inet_saddr;
1146                 daddr = inet_sk(sk)->inet_daddr;
1147         } else if (req) {
1148                 saddr = inet_rsk(req)->loc_addr;
1149                 daddr = inet_rsk(req)->rmt_addr;
1150         } else {
1151                 const struct iphdr *iph = ip_hdr(skb);
1152                 saddr = iph->saddr;
1153                 daddr = iph->daddr;
1154         }
1155
1156         hp = tcp_get_md5sig_pool();
1157         if (!hp)
1158                 goto clear_hash_noput;
1159         desc = &hp->md5_desc;
1160
1161         if (crypto_hash_init(desc))
1162                 goto clear_hash;
1163
1164         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1165                 goto clear_hash;
1166         if (tcp_md5_hash_header(hp, th))
1167                 goto clear_hash;
1168         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1169                 goto clear_hash;
1170         if (tcp_md5_hash_key(hp, key))
1171                 goto clear_hash;
1172         if (crypto_hash_final(desc, md5_hash))
1173                 goto clear_hash;
1174
1175         tcp_put_md5sig_pool();
1176         return 0;
1177
1178 clear_hash:
1179         tcp_put_md5sig_pool();
1180 clear_hash_noput:
1181         memset(md5_hash, 0, 16);
1182         return 1;
1183 }
1184 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1185
1186 static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1187 {
1188         /*
1189          * This gets called for each TCP segment that arrives
1190          * so we want to be efficient.
1191          * We have 3 drop cases:
1192          * o No MD5 hash and one expected.
1193          * o MD5 hash and we're not expecting one.
1194          * o MD5 hash and its wrong.
1195          */
1196         const __u8 *hash_location = NULL;
1197         struct tcp_md5sig_key *hash_expected;
1198         const struct iphdr *iph = ip_hdr(skb);
1199         const struct tcphdr *th = tcp_hdr(skb);
1200         int genhash;
1201         unsigned char newhash[16];
1202
1203         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1204                                           AF_INET);
1205         hash_location = tcp_parse_md5sig_option(th);
1206
1207         /* We've parsed the options - do we have a hash? */
1208         if (!hash_expected && !hash_location)
1209                 return 0;
1210
1211         if (hash_expected && !hash_location) {
1212                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1213                 return 1;
1214         }
1215
1216         if (!hash_expected && hash_location) {
1217                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1218                 return 1;
1219         }
1220
1221         /* Okay, so this is hash_expected and hash_location -
1222          * so we need to calculate the checksum.
1223          */
1224         genhash = tcp_v4_md5_hash_skb(newhash,
1225                                       hash_expected,
1226                                       NULL, NULL, skb);
1227
1228         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1229                 if (net_ratelimit()) {
1230                         pr_info("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1231                                 &iph->saddr, ntohs(th->source),
1232                                 &iph->daddr, ntohs(th->dest),
1233                                 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1234                 }
1235                 return 1;
1236         }
1237         return 0;
1238 }
1239
1240 #endif
1241
1242 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1243         .family         =       PF_INET,
1244         .obj_size       =       sizeof(struct tcp_request_sock),
1245         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1246         .send_ack       =       tcp_v4_reqsk_send_ack,
1247         .destructor     =       tcp_v4_reqsk_destructor,
1248         .send_reset     =       tcp_v4_send_reset,
1249         .syn_ack_timeout =      tcp_syn_ack_timeout,
1250 };
1251
1252 #ifdef CONFIG_TCP_MD5SIG
1253 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1254         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1255         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1256 };
1257 #endif
1258
1259 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1260 {
1261         struct tcp_extend_values tmp_ext;
1262         struct tcp_options_received tmp_opt;
1263         const u8 *hash_location;
1264         struct request_sock *req;
1265         struct inet_request_sock *ireq;
1266         struct tcp_sock *tp = tcp_sk(sk);
1267         struct dst_entry *dst = NULL;
1268         __be32 saddr = ip_hdr(skb)->saddr;
1269         __be32 daddr = ip_hdr(skb)->daddr;
1270         __u32 isn = TCP_SKB_CB(skb)->when;
1271         int want_cookie = 0;
1272
1273         /* Never answer to SYNs send to broadcast or multicast */
1274         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1275                 goto drop;
1276
1277         /* TW buckets are converted to open requests without
1278          * limitations, they conserve resources and peer is
1279          * evidently real one.
1280          */
1281         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1282                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1283                 if (!want_cookie)
1284                         goto drop;
1285         }
1286
1287         /* Accept backlog is full. If we have already queued enough
1288          * of warm entries in syn queue, drop request. It is better than
1289          * clogging syn queue with openreqs with exponentially increasing
1290          * timeout.
1291          */
1292         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1293                 goto drop;
1294
1295         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1296         if (!req)
1297                 goto drop;
1298
1299 #ifdef CONFIG_TCP_MD5SIG
1300         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1301 #endif
1302
1303         tcp_clear_options(&tmp_opt);
1304         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1305         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1306         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1307
1308         if (tmp_opt.cookie_plus > 0 &&
1309             tmp_opt.saw_tstamp &&
1310             !tp->rx_opt.cookie_out_never &&
1311             (sysctl_tcp_cookie_size > 0 ||
1312              (tp->cookie_values != NULL &&
1313               tp->cookie_values->cookie_desired > 0))) {
1314                 u8 *c;
1315                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1316                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1317
1318                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1319                         goto drop_and_release;
1320
1321                 /* Secret recipe starts with IP addresses */
1322                 *mess++ ^= (__force u32)daddr;
1323                 *mess++ ^= (__force u32)saddr;
1324
1325                 /* plus variable length Initiator Cookie */
1326                 c = (u8 *)mess;
1327                 while (l-- > 0)
1328                         *c++ ^= *hash_location++;
1329
1330                 want_cookie = 0;        /* not our kind of cookie */
1331                 tmp_ext.cookie_out_never = 0; /* false */
1332                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1333         } else if (!tp->rx_opt.cookie_in_always) {
1334                 /* redundant indications, but ensure initialization. */
1335                 tmp_ext.cookie_out_never = 1; /* true */
1336                 tmp_ext.cookie_plus = 0;
1337         } else {
1338                 goto drop_and_release;
1339         }
1340         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1341
1342         if (want_cookie && !tmp_opt.saw_tstamp)
1343                 tcp_clear_options(&tmp_opt);
1344
1345         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1346         tcp_openreq_init(req, &tmp_opt, skb);
1347
1348         ireq = inet_rsk(req);
1349         ireq->loc_addr = daddr;
1350         ireq->rmt_addr = saddr;
1351         ireq->no_srccheck = inet_sk(sk)->transparent;
1352         ireq->opt = tcp_v4_save_options(sk, skb);
1353
1354         if (security_inet_conn_request(sk, skb, req))
1355                 goto drop_and_free;
1356
1357         if (!want_cookie || tmp_opt.tstamp_ok)
1358                 TCP_ECN_create_request(req, tcp_hdr(skb));
1359
1360         if (want_cookie) {
1361                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1362                 req->cookie_ts = tmp_opt.tstamp_ok;
1363         } else if (!isn) {
1364                 struct inet_peer *peer = NULL;
1365                 struct flowi4 fl4;
1366
1367                 /* VJ's idea. We save last timestamp seen
1368                  * from the destination in peer table, when entering
1369                  * state TIME-WAIT, and check against it before
1370                  * accepting new connection request.
1371                  *
1372                  * If "isn" is not zero, this request hit alive
1373                  * timewait bucket, so that all the necessary checks
1374                  * are made in the function processing timewait state.
1375                  */
1376                 if (tmp_opt.saw_tstamp &&
1377                     tcp_death_row.sysctl_tw_recycle &&
1378                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1379                     fl4.daddr == saddr &&
1380                     (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1381                         inet_peer_refcheck(peer);
1382                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1383                             (s32)(peer->tcp_ts - req->ts_recent) >
1384                                                         TCP_PAWS_WINDOW) {
1385                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1386                                 goto drop_and_release;
1387                         }
1388                 }
1389                 /* Kill the following clause, if you dislike this way. */
1390                 else if (!sysctl_tcp_syncookies &&
1391                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1392                           (sysctl_max_syn_backlog >> 2)) &&
1393                          (!peer || !peer->tcp_ts_stamp) &&
1394                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1395                         /* Without syncookies last quarter of
1396                          * backlog is filled with destinations,
1397                          * proven to be alive.
1398                          * It means that we continue to communicate
1399                          * to destinations, already remembered
1400                          * to the moment of synflood.
1401                          */
1402                         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1403                                        &saddr, ntohs(tcp_hdr(skb)->source));
1404                         goto drop_and_release;
1405                 }
1406
1407                 isn = tcp_v4_init_sequence(skb);
1408         }
1409         tcp_rsk(req)->snt_isn = isn;
1410         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1411
1412         if (tcp_v4_send_synack(sk, dst, req,
1413                                (struct request_values *)&tmp_ext) ||
1414             want_cookie)
1415                 goto drop_and_free;
1416
1417         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1418         return 0;
1419
1420 drop_and_release:
1421         dst_release(dst);
1422 drop_and_free:
1423         reqsk_free(req);
1424 drop:
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(tcp_v4_conn_request);
1428
1429
1430 /*
1431  * The three way handshake has completed - we got a valid synack -
1432  * now create the new socket.
1433  */
1434 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1435                                   struct request_sock *req,
1436                                   struct dst_entry *dst)
1437 {
1438         struct inet_request_sock *ireq;
1439         struct inet_sock *newinet;
1440         struct tcp_sock *newtp;
1441         struct sock *newsk;
1442 #ifdef CONFIG_TCP_MD5SIG
1443         struct tcp_md5sig_key *key;
1444 #endif
1445         struct ip_options_rcu *inet_opt;
1446
1447         if (sk_acceptq_is_full(sk))
1448                 goto exit_overflow;
1449
1450         newsk = tcp_create_openreq_child(sk, req, skb);
1451         if (!newsk)
1452                 goto exit_nonewsk;
1453
1454         newsk->sk_gso_type = SKB_GSO_TCPV4;
1455
1456         newtp                 = tcp_sk(newsk);
1457         newinet               = inet_sk(newsk);
1458         ireq                  = inet_rsk(req);
1459         newinet->inet_daddr   = ireq->rmt_addr;
1460         newinet->inet_rcv_saddr = ireq->loc_addr;
1461         newinet->inet_saddr           = ireq->loc_addr;
1462         inet_opt              = ireq->opt;
1463         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1464         ireq->opt             = NULL;
1465         newinet->mc_index     = inet_iif(skb);
1466         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1467         newinet->rcv_tos      = ip_hdr(skb)->tos;
1468         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1469         if (inet_opt)
1470                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1471         newinet->inet_id = newtp->write_seq ^ jiffies;
1472
1473         if (!dst) {
1474                 dst = inet_csk_route_child_sock(sk, newsk, req);
1475                 if (!dst)
1476                         goto put_and_exit;
1477         } else {
1478                 /* syncookie case : see end of cookie_v4_check() */
1479         }
1480         sk_setup_caps(newsk, dst);
1481
1482         tcp_mtup_init(newsk);
1483         tcp_sync_mss(newsk, dst_mtu(dst));
1484         newtp->advmss = dst_metric_advmss(dst);
1485         if (tcp_sk(sk)->rx_opt.user_mss &&
1486             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1487                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1488
1489         tcp_initialize_rcv_mss(newsk);
1490         if (tcp_rsk(req)->snt_synack)
1491                 tcp_valid_rtt_meas(newsk,
1492                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1493         newtp->total_retrans = req->retrans;
1494
1495 #ifdef CONFIG_TCP_MD5SIG
1496         /* Copy over the MD5 key from the original socket */
1497         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1498                                 AF_INET);
1499         if (key != NULL) {
1500                 /*
1501                  * We're using one, so create a matching key
1502                  * on the newsk structure. If we fail to get
1503                  * memory, then we end up not copying the key
1504                  * across. Shucks.
1505                  */
1506                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1507                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1508                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1509         }
1510 #endif
1511
1512         if (__inet_inherit_port(sk, newsk) < 0)
1513                 goto put_and_exit;
1514         __inet_hash_nolisten(newsk, NULL);
1515
1516         return newsk;
1517
1518 exit_overflow:
1519         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1520 exit_nonewsk:
1521         dst_release(dst);
1522 exit:
1523         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1524         return NULL;
1525 put_and_exit:
1526         tcp_clear_xmit_timers(newsk);
1527         tcp_cleanup_congestion_control(newsk);
1528         bh_unlock_sock(newsk);
1529         sock_put(newsk);
1530         goto exit;
1531 }
1532 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1533
1534 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1535 {
1536         struct tcphdr *th = tcp_hdr(skb);
1537         const struct iphdr *iph = ip_hdr(skb);
1538         struct sock *nsk;
1539         struct request_sock **prev;
1540         /* Find possible connection requests. */
1541         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1542                                                        iph->saddr, iph->daddr);
1543         if (req)
1544                 return tcp_check_req(sk, skb, req, prev);
1545
1546         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1547                         th->source, iph->daddr, th->dest, inet_iif(skb));
1548
1549         if (nsk) {
1550                 if (nsk->sk_state != TCP_TIME_WAIT) {
1551                         bh_lock_sock(nsk);
1552                         return nsk;
1553                 }
1554                 inet_twsk_put(inet_twsk(nsk));
1555                 return NULL;
1556         }
1557
1558 #ifdef CONFIG_SYN_COOKIES
1559         if (!th->syn)
1560                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1561 #endif
1562         return sk;
1563 }
1564
1565 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1566 {
1567         const struct iphdr *iph = ip_hdr(skb);
1568
1569         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1570                 if (!tcp_v4_check(skb->len, iph->saddr,
1571                                   iph->daddr, skb->csum)) {
1572                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1573                         return 0;
1574                 }
1575         }
1576
1577         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1578                                        skb->len, IPPROTO_TCP, 0);
1579
1580         if (skb->len <= 76) {
1581                 return __skb_checksum_complete(skb);
1582         }
1583         return 0;
1584 }
1585
1586
1587 /* The socket must have it's spinlock held when we get
1588  * here.
1589  *
1590  * We have a potential double-lock case here, so even when
1591  * doing backlog processing we use the BH locking scheme.
1592  * This is because we cannot sleep with the original spinlock
1593  * held.
1594  */
1595 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1596 {
1597         struct sock *rsk;
1598 #ifdef CONFIG_TCP_MD5SIG
1599         /*
1600          * We really want to reject the packet as early as possible
1601          * if:
1602          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1603          *  o There is an MD5 option and we're not expecting one
1604          */
1605         if (tcp_v4_inbound_md5_hash(sk, skb))
1606                 goto discard;
1607 #endif
1608
1609         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1610                 sock_rps_save_rxhash(sk, skb);
1611                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1612                         rsk = sk;
1613                         goto reset;
1614                 }
1615                 return 0;
1616         }
1617
1618         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1619                 goto csum_err;
1620
1621         if (sk->sk_state == TCP_LISTEN) {
1622                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1623                 if (!nsk)
1624                         goto discard;
1625
1626                 if (nsk != sk) {
1627                         sock_rps_save_rxhash(nsk, skb);
1628                         if (tcp_child_process(sk, nsk, skb)) {
1629                                 rsk = nsk;
1630                                 goto reset;
1631                         }
1632                         return 0;
1633                 }
1634         } else
1635                 sock_rps_save_rxhash(sk, skb);
1636
1637         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1638                 rsk = sk;
1639                 goto reset;
1640         }
1641         return 0;
1642
1643 reset:
1644         tcp_v4_send_reset(rsk, skb);
1645 discard:
1646         kfree_skb(skb);
1647         /* Be careful here. If this function gets more complicated and
1648          * gcc suffers from register pressure on the x86, sk (in %ebx)
1649          * might be destroyed here. This current version compiles correctly,
1650          * but you have been warned.
1651          */
1652         return 0;
1653
1654 csum_err:
1655         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1656         goto discard;
1657 }
1658 EXPORT_SYMBOL(tcp_v4_do_rcv);
1659
1660 /*
1661  *      From tcp_input.c
1662  */
1663
1664 int tcp_v4_rcv(struct sk_buff *skb)
1665 {
1666         const struct iphdr *iph;
1667         const struct tcphdr *th;
1668         struct sock *sk;
1669         int ret;
1670         struct net *net = dev_net(skb->dev);
1671
1672         if (skb->pkt_type != PACKET_HOST)
1673                 goto discard_it;
1674
1675         /* Count it even if it's bad */
1676         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1677
1678         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1679                 goto discard_it;
1680
1681         th = tcp_hdr(skb);
1682
1683         if (th->doff < sizeof(struct tcphdr) / 4)
1684                 goto bad_packet;
1685         if (!pskb_may_pull(skb, th->doff * 4))
1686                 goto discard_it;
1687
1688         /* An explanation is required here, I think.
1689          * Packet length and doff are validated by header prediction,
1690          * provided case of th->doff==0 is eliminated.
1691          * So, we defer the checks. */
1692         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1693                 goto bad_packet;
1694
1695         th = tcp_hdr(skb);
1696         iph = ip_hdr(skb);
1697         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1698         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1699                                     skb->len - th->doff * 4);
1700         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1701         TCP_SKB_CB(skb)->when    = 0;
1702         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1703         TCP_SKB_CB(skb)->sacked  = 0;
1704
1705         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1706         if (!sk)
1707                 goto no_tcp_socket;
1708
1709 process:
1710         if (sk->sk_state == TCP_TIME_WAIT)
1711                 goto do_time_wait;
1712
1713         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1714                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1715                 goto discard_and_relse;
1716         }
1717
1718         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1719                 goto discard_and_relse;
1720         nf_reset(skb);
1721
1722         if (sk_filter(sk, skb))
1723                 goto discard_and_relse;
1724
1725         skb->dev = NULL;
1726
1727         bh_lock_sock_nested(sk);
1728         ret = 0;
1729         if (!sock_owned_by_user(sk)) {
1730 #ifdef CONFIG_NET_DMA
1731                 struct tcp_sock *tp = tcp_sk(sk);
1732                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1733                         tp->ucopy.dma_chan = net_dma_find_channel();
1734                 if (tp->ucopy.dma_chan)
1735                         ret = tcp_v4_do_rcv(sk, skb);
1736                 else
1737 #endif
1738                 {
1739                         if (!tcp_prequeue(sk, skb))
1740                                 ret = tcp_v4_do_rcv(sk, skb);
1741                 }
1742         } else if (unlikely(sk_add_backlog(sk, skb))) {
1743                 bh_unlock_sock(sk);
1744                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1745                 goto discard_and_relse;
1746         }
1747         bh_unlock_sock(sk);
1748
1749         sock_put(sk);
1750
1751         return ret;
1752
1753 no_tcp_socket:
1754         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1755                 goto discard_it;
1756
1757         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1758 bad_packet:
1759                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1760         } else {
1761                 tcp_v4_send_reset(NULL, skb);
1762         }
1763
1764 discard_it:
1765         /* Discard frame. */
1766         kfree_skb(skb);
1767         return 0;
1768
1769 discard_and_relse:
1770         sock_put(sk);
1771         goto discard_it;
1772
1773 do_time_wait:
1774         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1775                 inet_twsk_put(inet_twsk(sk));
1776                 goto discard_it;
1777         }
1778
1779         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1780                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1781                 inet_twsk_put(inet_twsk(sk));
1782                 goto discard_it;
1783         }
1784         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1785         case TCP_TW_SYN: {
1786                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1787                                                         &tcp_hashinfo,
1788                                                         iph->daddr, th->dest,
1789                                                         inet_iif(skb));
1790                 if (sk2) {
1791                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1792                         inet_twsk_put(inet_twsk(sk));
1793                         sk = sk2;
1794                         goto process;
1795                 }
1796                 /* Fall through to ACK */
1797         }
1798         case TCP_TW_ACK:
1799                 tcp_v4_timewait_ack(sk, skb);
1800                 break;
1801         case TCP_TW_RST:
1802                 goto no_tcp_socket;
1803         case TCP_TW_SUCCESS:;
1804         }
1805         goto discard_it;
1806 }
1807
1808 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1809 {
1810         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1811         struct inet_sock *inet = inet_sk(sk);
1812         struct inet_peer *peer;
1813
1814         if (!rt ||
1815             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1816                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1817                 *release_it = true;
1818         } else {
1819                 if (!rt->peer)
1820                         rt_bind_peer(rt, inet->inet_daddr, 1);
1821                 peer = rt->peer;
1822                 *release_it = false;
1823         }
1824
1825         return peer;
1826 }
1827 EXPORT_SYMBOL(tcp_v4_get_peer);
1828
1829 void *tcp_v4_tw_get_peer(struct sock *sk)
1830 {
1831         const struct inet_timewait_sock *tw = inet_twsk(sk);
1832
1833         return inet_getpeer_v4(tw->tw_daddr, 1);
1834 }
1835 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1836
1837 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1838         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1839         .twsk_unique    = tcp_twsk_unique,
1840         .twsk_destructor= tcp_twsk_destructor,
1841         .twsk_getpeer   = tcp_v4_tw_get_peer,
1842 };
1843
1844 const struct inet_connection_sock_af_ops ipv4_specific = {
1845         .queue_xmit        = ip_queue_xmit,
1846         .send_check        = tcp_v4_send_check,
1847         .rebuild_header    = inet_sk_rebuild_header,
1848         .conn_request      = tcp_v4_conn_request,
1849         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1850         .get_peer          = tcp_v4_get_peer,
1851         .net_header_len    = sizeof(struct iphdr),
1852         .setsockopt        = ip_setsockopt,
1853         .getsockopt        = ip_getsockopt,
1854         .addr2sockaddr     = inet_csk_addr2sockaddr,
1855         .sockaddr_len      = sizeof(struct sockaddr_in),
1856         .bind_conflict     = inet_csk_bind_conflict,
1857 #ifdef CONFIG_COMPAT
1858         .compat_setsockopt = compat_ip_setsockopt,
1859         .compat_getsockopt = compat_ip_getsockopt,
1860 #endif
1861 };
1862 EXPORT_SYMBOL(ipv4_specific);
1863
1864 #ifdef CONFIG_TCP_MD5SIG
1865 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1866         .md5_lookup             = tcp_v4_md5_lookup,
1867         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1868         .md5_parse              = tcp_v4_parse_md5_keys,
1869 };
1870 #endif
1871
1872 /* NOTE: A lot of things set to zero explicitly by call to
1873  *       sk_alloc() so need not be done here.
1874  */
1875 static int tcp_v4_init_sock(struct sock *sk)
1876 {
1877         struct inet_connection_sock *icsk = inet_csk(sk);
1878         struct tcp_sock *tp = tcp_sk(sk);
1879
1880         skb_queue_head_init(&tp->out_of_order_queue);
1881         tcp_init_xmit_timers(sk);
1882         tcp_prequeue_init(tp);
1883
1884         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1885         tp->mdev = TCP_TIMEOUT_INIT;
1886
1887         /* So many TCP implementations out there (incorrectly) count the
1888          * initial SYN frame in their delayed-ACK and congestion control
1889          * algorithms that we must have the following bandaid to talk
1890          * efficiently to them.  -DaveM
1891          */
1892         tp->snd_cwnd = TCP_INIT_CWND;
1893
1894         /* See draft-stevens-tcpca-spec-01 for discussion of the
1895          * initialization of these values.
1896          */
1897         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1898         tp->snd_cwnd_clamp = ~0;
1899         tp->mss_cache = TCP_MSS_DEFAULT;
1900
1901         tp->reordering = sysctl_tcp_reordering;
1902         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1903
1904         sk->sk_state = TCP_CLOSE;
1905
1906         sk->sk_write_space = sk_stream_write_space;
1907         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1908
1909         icsk->icsk_af_ops = &ipv4_specific;
1910         icsk->icsk_sync_mss = tcp_sync_mss;
1911 #ifdef CONFIG_TCP_MD5SIG
1912         tp->af_specific = &tcp_sock_ipv4_specific;
1913 #endif
1914
1915         /* TCP Cookie Transactions */
1916         if (sysctl_tcp_cookie_size > 0) {
1917                 /* Default, cookies without s_data_payload. */
1918                 tp->cookie_values =
1919                         kzalloc(sizeof(*tp->cookie_values),
1920                                 sk->sk_allocation);
1921                 if (tp->cookie_values != NULL)
1922                         kref_init(&tp->cookie_values->kref);
1923         }
1924         /* Presumed zeroed, in order of appearance:
1925          *      cookie_in_always, cookie_out_never,
1926          *      s_data_constant, s_data_in, s_data_out
1927          */
1928         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1929         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1930
1931         local_bh_disable();
1932         sock_update_memcg(sk);
1933         sk_sockets_allocated_inc(sk);
1934         local_bh_enable();
1935
1936         return 0;
1937 }
1938
1939 void tcp_v4_destroy_sock(struct sock *sk)
1940 {
1941         struct tcp_sock *tp = tcp_sk(sk);
1942
1943         tcp_clear_xmit_timers(sk);
1944
1945         tcp_cleanup_congestion_control(sk);
1946
1947         /* Cleanup up the write buffer. */
1948         tcp_write_queue_purge(sk);
1949
1950         /* Cleans up our, hopefully empty, out_of_order_queue. */
1951         __skb_queue_purge(&tp->out_of_order_queue);
1952
1953 #ifdef CONFIG_TCP_MD5SIG
1954         /* Clean up the MD5 key list, if any */
1955         if (tp->md5sig_info) {
1956                 tcp_clear_md5_list(sk);
1957                 kfree_rcu(tp->md5sig_info, rcu);
1958                 tp->md5sig_info = NULL;
1959         }
1960 #endif
1961
1962 #ifdef CONFIG_NET_DMA
1963         /* Cleans up our sk_async_wait_queue */
1964         __skb_queue_purge(&sk->sk_async_wait_queue);
1965 #endif
1966
1967         /* Clean prequeue, it must be empty really */
1968         __skb_queue_purge(&tp->ucopy.prequeue);
1969
1970         /* Clean up a referenced TCP bind bucket. */
1971         if (inet_csk(sk)->icsk_bind_hash)
1972                 inet_put_port(sk);
1973
1974         /*
1975          * If sendmsg cached page exists, toss it.
1976          */
1977         if (sk->sk_sndmsg_page) {
1978                 __free_page(sk->sk_sndmsg_page);
1979                 sk->sk_sndmsg_page = NULL;
1980         }
1981
1982         /* TCP Cookie Transactions */
1983         if (tp->cookie_values != NULL) {
1984                 kref_put(&tp->cookie_values->kref,
1985                          tcp_cookie_values_release);
1986                 tp->cookie_values = NULL;
1987         }
1988
1989         sk_sockets_allocated_dec(sk);
1990         sock_release_memcg(sk);
1991 }
1992 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1993
1994 #ifdef CONFIG_PROC_FS
1995 /* Proc filesystem TCP sock list dumping. */
1996
1997 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1998 {
1999         return hlist_nulls_empty(head) ? NULL :
2000                 list_entry(head->first, struct inet_timewait_sock, tw_node);
2001 }
2002
2003 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2004 {
2005         return !is_a_nulls(tw->tw_node.next) ?
2006                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2007 }
2008
2009 /*
2010  * Get next listener socket follow cur.  If cur is NULL, get first socket
2011  * starting from bucket given in st->bucket; when st->bucket is zero the
2012  * very first socket in the hash table is returned.
2013  */
2014 static void *listening_get_next(struct seq_file *seq, void *cur)
2015 {
2016         struct inet_connection_sock *icsk;
2017         struct hlist_nulls_node *node;
2018         struct sock *sk = cur;
2019         struct inet_listen_hashbucket *ilb;
2020         struct tcp_iter_state *st = seq->private;
2021         struct net *net = seq_file_net(seq);
2022
2023         if (!sk) {
2024                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2025                 spin_lock_bh(&ilb->lock);
2026                 sk = sk_nulls_head(&ilb->head);
2027                 st->offset = 0;
2028                 goto get_sk;
2029         }
2030         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2031         ++st->num;
2032         ++st->offset;
2033
2034         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2035                 struct request_sock *req = cur;
2036
2037                 icsk = inet_csk(st->syn_wait_sk);
2038                 req = req->dl_next;
2039                 while (1) {
2040                         while (req) {
2041                                 if (req->rsk_ops->family == st->family) {
2042                                         cur = req;
2043                                         goto out;
2044                                 }
2045                                 req = req->dl_next;
2046                         }
2047                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2048                                 break;
2049 get_req:
2050                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2051                 }
2052                 sk        = sk_nulls_next(st->syn_wait_sk);
2053                 st->state = TCP_SEQ_STATE_LISTENING;
2054                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2055         } else {
2056                 icsk = inet_csk(sk);
2057                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2058                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2059                         goto start_req;
2060                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2061                 sk = sk_nulls_next(sk);
2062         }
2063 get_sk:
2064         sk_nulls_for_each_from(sk, node) {
2065                 if (!net_eq(sock_net(sk), net))
2066                         continue;
2067                 if (sk->sk_family == st->family) {
2068                         cur = sk;
2069                         goto out;
2070                 }
2071                 icsk = inet_csk(sk);
2072                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2073                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2074 start_req:
2075                         st->uid         = sock_i_uid(sk);
2076                         st->syn_wait_sk = sk;
2077                         st->state       = TCP_SEQ_STATE_OPENREQ;
2078                         st->sbucket     = 0;
2079                         goto get_req;
2080                 }
2081                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2082         }
2083         spin_unlock_bh(&ilb->lock);
2084         st->offset = 0;
2085         if (++st->bucket < INET_LHTABLE_SIZE) {
2086                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2087                 spin_lock_bh(&ilb->lock);
2088                 sk = sk_nulls_head(&ilb->head);
2089                 goto get_sk;
2090         }
2091         cur = NULL;
2092 out:
2093         return cur;
2094 }
2095
2096 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2097 {
2098         struct tcp_iter_state *st = seq->private;
2099         void *rc;
2100
2101         st->bucket = 0;
2102         st->offset = 0;
2103         rc = listening_get_next(seq, NULL);
2104
2105         while (rc && *pos) {
2106                 rc = listening_get_next(seq, rc);
2107                 --*pos;
2108         }
2109         return rc;
2110 }
2111
2112 static inline int empty_bucket(struct tcp_iter_state *st)
2113 {
2114         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2115                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2116 }
2117
2118 /*
2119  * Get first established socket starting from bucket given in st->bucket.
2120  * If st->bucket is zero, the very first socket in the hash is returned.
2121  */
2122 static void *established_get_first(struct seq_file *seq)
2123 {
2124         struct tcp_iter_state *st = seq->private;
2125         struct net *net = seq_file_net(seq);
2126         void *rc = NULL;
2127
2128         st->offset = 0;
2129         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2130                 struct sock *sk;
2131                 struct hlist_nulls_node *node;
2132                 struct inet_timewait_sock *tw;
2133                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2134
2135                 /* Lockless fast path for the common case of empty buckets */
2136                 if (empty_bucket(st))
2137                         continue;
2138
2139                 spin_lock_bh(lock);
2140                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2141                         if (sk->sk_family != st->family ||
2142                             !net_eq(sock_net(sk), net)) {
2143                                 continue;
2144                         }
2145                         rc = sk;
2146                         goto out;
2147                 }
2148                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2149                 inet_twsk_for_each(tw, node,
2150                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2151                         if (tw->tw_family != st->family ||
2152                             !net_eq(twsk_net(tw), net)) {
2153                                 continue;
2154                         }
2155                         rc = tw;
2156                         goto out;
2157                 }
2158                 spin_unlock_bh(lock);
2159                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2160         }
2161 out:
2162         return rc;
2163 }
2164
2165 static void *established_get_next(struct seq_file *seq, void *cur)
2166 {
2167         struct sock *sk = cur;
2168         struct inet_timewait_sock *tw;
2169         struct hlist_nulls_node *node;
2170         struct tcp_iter_state *st = seq->private;
2171         struct net *net = seq_file_net(seq);
2172
2173         ++st->num;
2174         ++st->offset;
2175
2176         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2177                 tw = cur;
2178                 tw = tw_next(tw);
2179 get_tw:
2180                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2181                         tw = tw_next(tw);
2182                 }
2183                 if (tw) {
2184                         cur = tw;
2185                         goto out;
2186                 }
2187                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2188                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2189
2190                 /* Look for next non empty bucket */
2191                 st->offset = 0;
2192                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2193                                 empty_bucket(st))
2194                         ;
2195                 if (st->bucket > tcp_hashinfo.ehash_mask)
2196                         return NULL;
2197
2198                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2199                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2200         } else
2201                 sk = sk_nulls_next(sk);
2202
2203         sk_nulls_for_each_from(sk, node) {
2204                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2205                         goto found;
2206         }
2207
2208         st->state = TCP_SEQ_STATE_TIME_WAIT;
2209         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2210         goto get_tw;
2211 found:
2212         cur = sk;
2213 out:
2214         return cur;
2215 }
2216
2217 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2218 {
2219         struct tcp_iter_state *st = seq->private;
2220         void *rc;
2221
2222         st->bucket = 0;
2223         rc = established_get_first(seq);
2224
2225         while (rc && pos) {
2226                 rc = established_get_next(seq, rc);
2227                 --pos;
2228         }
2229         return rc;
2230 }
2231
2232 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2233 {
2234         void *rc;
2235         struct tcp_iter_state *st = seq->private;
2236
2237         st->state = TCP_SEQ_STATE_LISTENING;
2238         rc        = listening_get_idx(seq, &pos);
2239
2240         if (!rc) {
2241                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2242                 rc        = established_get_idx(seq, pos);
2243         }
2244
2245         return rc;
2246 }
2247
2248 static void *tcp_seek_last_pos(struct seq_file *seq)
2249 {
2250         struct tcp_iter_state *st = seq->private;
2251         int offset = st->offset;
2252         int orig_num = st->num;
2253         void *rc = NULL;
2254
2255         switch (st->state) {
2256         case TCP_SEQ_STATE_OPENREQ:
2257         case TCP_SEQ_STATE_LISTENING:
2258                 if (st->bucket >= INET_LHTABLE_SIZE)
2259                         break;
2260                 st->state = TCP_SEQ_STATE_LISTENING;
2261                 rc = listening_get_next(seq, NULL);
2262                 while (offset-- && rc)
2263                         rc = listening_get_next(seq, rc);
2264                 if (rc)
2265                         break;
2266                 st->bucket = 0;
2267                 /* Fallthrough */
2268         case TCP_SEQ_STATE_ESTABLISHED:
2269         case TCP_SEQ_STATE_TIME_WAIT:
2270                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2271                 if (st->bucket > tcp_hashinfo.ehash_mask)
2272                         break;
2273                 rc = established_get_first(seq);
2274                 while (offset-- && rc)
2275                         rc = established_get_next(seq, rc);
2276         }
2277
2278         st->num = orig_num;
2279
2280         return rc;
2281 }
2282
2283 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2284 {
2285         struct tcp_iter_state *st = seq->private;
2286         void *rc;
2287
2288         if (*pos && *pos == st->last_pos) {
2289                 rc = tcp_seek_last_pos(seq);
2290                 if (rc)
2291                         goto out;
2292         }
2293
2294         st->state = TCP_SEQ_STATE_LISTENING;
2295         st->num = 0;
2296         st->bucket = 0;
2297         st->offset = 0;
2298         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2299
2300 out:
2301         st->last_pos = *pos;
2302         return rc;
2303 }
2304
2305 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2306 {
2307         struct tcp_iter_state *st = seq->private;
2308         void *rc = NULL;
2309
2310         if (v == SEQ_START_TOKEN) {
2311                 rc = tcp_get_idx(seq, 0);
2312                 goto out;
2313         }
2314
2315         switch (st->state) {
2316         case TCP_SEQ_STATE_OPENREQ:
2317         case TCP_SEQ_STATE_LISTENING:
2318                 rc = listening_get_next(seq, v);
2319                 if (!rc) {
2320                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2321                         st->bucket = 0;
2322                         st->offset = 0;
2323                         rc        = established_get_first(seq);
2324                 }
2325                 break;
2326         case TCP_SEQ_STATE_ESTABLISHED:
2327         case TCP_SEQ_STATE_TIME_WAIT:
2328                 rc = established_get_next(seq, v);
2329                 break;
2330         }
2331 out:
2332         ++*pos;
2333         st->last_pos = *pos;
2334         return rc;
2335 }
2336
2337 static void tcp_seq_stop(struct seq_file *seq, void *v)
2338 {
2339         struct tcp_iter_state *st = seq->private;
2340
2341         switch (st->state) {
2342         case TCP_SEQ_STATE_OPENREQ:
2343                 if (v) {
2344                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2345                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2346                 }
2347         case TCP_SEQ_STATE_LISTENING:
2348                 if (v != SEQ_START_TOKEN)
2349                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2350                 break;
2351         case TCP_SEQ_STATE_TIME_WAIT:
2352         case TCP_SEQ_STATE_ESTABLISHED:
2353                 if (v)
2354                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2355                 break;
2356         }
2357 }
2358
2359 int tcp_seq_open(struct inode *inode, struct file *file)
2360 {
2361         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2362         struct tcp_iter_state *s;
2363         int err;
2364
2365         err = seq_open_net(inode, file, &afinfo->seq_ops,
2366                           sizeof(struct tcp_iter_state));
2367         if (err < 0)
2368                 return err;
2369
2370         s = ((struct seq_file *)file->private_data)->private;
2371         s->family               = afinfo->family;
2372         s->last_pos             = 0;
2373         return 0;
2374 }
2375 EXPORT_SYMBOL(tcp_seq_open);
2376
2377 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2378 {
2379         int rc = 0;
2380         struct proc_dir_entry *p;
2381
2382         afinfo->seq_ops.start           = tcp_seq_start;
2383         afinfo->seq_ops.next            = tcp_seq_next;
2384         afinfo->seq_ops.stop            = tcp_seq_stop;
2385
2386         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2387                              afinfo->seq_fops, afinfo);
2388         if (!p)
2389                 rc = -ENOMEM;
2390         return rc;
2391 }
2392 EXPORT_SYMBOL(tcp_proc_register);
2393
2394 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2395 {
2396         proc_net_remove(net, afinfo->name);
2397 }
2398 EXPORT_SYMBOL(tcp_proc_unregister);
2399
2400 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2401                          struct seq_file *f, int i, int uid, int *len)
2402 {
2403         const struct inet_request_sock *ireq = inet_rsk(req);
2404         int ttd = req->expires - jiffies;
2405
2406         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2407                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2408                 i,
2409                 ireq->loc_addr,
2410                 ntohs(inet_sk(sk)->inet_sport),
2411                 ireq->rmt_addr,
2412                 ntohs(ireq->rmt_port),
2413                 TCP_SYN_RECV,
2414                 0, 0, /* could print option size, but that is af dependent. */
2415                 1,    /* timers active (only the expire timer) */
2416                 jiffies_to_clock_t(ttd),
2417                 req->retrans,
2418                 uid,
2419                 0,  /* non standard timer */
2420                 0, /* open_requests have no inode */
2421                 atomic_read(&sk->sk_refcnt),
2422                 req,
2423                 len);
2424 }
2425
2426 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2427 {
2428         int timer_active;
2429         unsigned long timer_expires;
2430         const struct tcp_sock *tp = tcp_sk(sk);
2431         const struct inet_connection_sock *icsk = inet_csk(sk);
2432         const struct inet_sock *inet = inet_sk(sk);
2433         __be32 dest = inet->inet_daddr;
2434         __be32 src = inet->inet_rcv_saddr;
2435         __u16 destp = ntohs(inet->inet_dport);
2436         __u16 srcp = ntohs(inet->inet_sport);
2437         int rx_queue;
2438
2439         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2440                 timer_active    = 1;
2441                 timer_expires   = icsk->icsk_timeout;
2442         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2443                 timer_active    = 4;
2444                 timer_expires   = icsk->icsk_timeout;
2445         } else if (timer_pending(&sk->sk_timer)) {
2446                 timer_active    = 2;
2447                 timer_expires   = sk->sk_timer.expires;
2448         } else {
2449                 timer_active    = 0;
2450                 timer_expires = jiffies;
2451         }
2452
2453         if (sk->sk_state == TCP_LISTEN)
2454                 rx_queue = sk->sk_ack_backlog;
2455         else
2456                 /*
2457                  * because we dont lock socket, we might find a transient negative value
2458                  */
2459                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2460
2461         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2462                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2463                 i, src, srcp, dest, destp, sk->sk_state,
2464                 tp->write_seq - tp->snd_una,
2465                 rx_queue,
2466                 timer_active,
2467                 jiffies_to_clock_t(timer_expires - jiffies),
2468                 icsk->icsk_retransmits,
2469                 sock_i_uid(sk),
2470                 icsk->icsk_probes_out,
2471                 sock_i_ino(sk),
2472                 atomic_read(&sk->sk_refcnt), sk,
2473                 jiffies_to_clock_t(icsk->icsk_rto),
2474                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2475                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2476                 tp->snd_cwnd,
2477                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2478                 len);
2479 }
2480
2481 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2482                                struct seq_file *f, int i, int *len)
2483 {
2484         __be32 dest, src;
2485         __u16 destp, srcp;
2486         int ttd = tw->tw_ttd - jiffies;
2487
2488         if (ttd < 0)
2489                 ttd = 0;
2490
2491         dest  = tw->tw_daddr;
2492         src   = tw->tw_rcv_saddr;
2493         destp = ntohs(tw->tw_dport);
2494         srcp  = ntohs(tw->tw_sport);
2495
2496         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2497                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2498                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2499                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2500                 atomic_read(&tw->tw_refcnt), tw, len);
2501 }
2502
2503 #define TMPSZ 150
2504
2505 static int tcp4_seq_show(struct seq_file *seq, void *v)
2506 {
2507         struct tcp_iter_state *st;
2508         int len;
2509
2510         if (v == SEQ_START_TOKEN) {
2511                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2512                            "  sl  local_address rem_address   st tx_queue "
2513                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2514                            "inode");
2515                 goto out;
2516         }
2517         st = seq->private;
2518
2519         switch (st->state) {
2520         case TCP_SEQ_STATE_LISTENING:
2521         case TCP_SEQ_STATE_ESTABLISHED:
2522                 get_tcp4_sock(v, seq, st->num, &len);
2523                 break;
2524         case TCP_SEQ_STATE_OPENREQ:
2525                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2526                 break;
2527         case TCP_SEQ_STATE_TIME_WAIT:
2528                 get_timewait4_sock(v, seq, st->num, &len);
2529                 break;
2530         }
2531         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2532 out:
2533         return 0;
2534 }
2535
2536 static const struct file_operations tcp_afinfo_seq_fops = {
2537         .owner   = THIS_MODULE,
2538         .open    = tcp_seq_open,
2539         .read    = seq_read,
2540         .llseek  = seq_lseek,
2541         .release = seq_release_net
2542 };
2543
2544 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2545         .name           = "tcp",
2546         .family         = AF_INET,
2547         .seq_fops       = &tcp_afinfo_seq_fops,
2548         .seq_ops        = {
2549                 .show           = tcp4_seq_show,
2550         },
2551 };
2552
2553 static int __net_init tcp4_proc_init_net(struct net *net)
2554 {
2555         return tcp_proc_register(net, &tcp4_seq_afinfo);
2556 }
2557
2558 static void __net_exit tcp4_proc_exit_net(struct net *net)
2559 {
2560         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2561 }
2562
2563 static struct pernet_operations tcp4_net_ops = {
2564         .init = tcp4_proc_init_net,
2565         .exit = tcp4_proc_exit_net,
2566 };
2567
2568 int __init tcp4_proc_init(void)
2569 {
2570         return register_pernet_subsys(&tcp4_net_ops);
2571 }
2572
2573 void tcp4_proc_exit(void)
2574 {
2575         unregister_pernet_subsys(&tcp4_net_ops);
2576 }
2577 #endif /* CONFIG_PROC_FS */
2578
2579 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2580 {
2581         const struct iphdr *iph = skb_gro_network_header(skb);
2582
2583         switch (skb->ip_summed) {
2584         case CHECKSUM_COMPLETE:
2585                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2586                                   skb->csum)) {
2587                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2588                         break;
2589                 }
2590
2591                 /* fall through */
2592         case CHECKSUM_NONE:
2593                 NAPI_GRO_CB(skb)->flush = 1;
2594                 return NULL;
2595         }
2596
2597         return tcp_gro_receive(head, skb);
2598 }
2599
2600 int tcp4_gro_complete(struct sk_buff *skb)
2601 {
2602         const struct iphdr *iph = ip_hdr(skb);
2603         struct tcphdr *th = tcp_hdr(skb);
2604
2605         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2606                                   iph->saddr, iph->daddr, 0);
2607         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2608
2609         return tcp_gro_complete(skb);
2610 }
2611
2612 struct proto tcp_prot = {
2613         .name                   = "TCP",
2614         .owner                  = THIS_MODULE,
2615         .close                  = tcp_close,
2616         .connect                = tcp_v4_connect,
2617         .disconnect             = tcp_disconnect,
2618         .accept                 = inet_csk_accept,
2619         .ioctl                  = tcp_ioctl,
2620         .init                   = tcp_v4_init_sock,
2621         .destroy                = tcp_v4_destroy_sock,
2622         .shutdown               = tcp_shutdown,
2623         .setsockopt             = tcp_setsockopt,
2624         .getsockopt             = tcp_getsockopt,
2625         .recvmsg                = tcp_recvmsg,
2626         .sendmsg                = tcp_sendmsg,
2627         .sendpage               = tcp_sendpage,
2628         .backlog_rcv            = tcp_v4_do_rcv,
2629         .hash                   = inet_hash,
2630         .unhash                 = inet_unhash,
2631         .get_port               = inet_csk_get_port,
2632         .enter_memory_pressure  = tcp_enter_memory_pressure,
2633         .sockets_allocated      = &tcp_sockets_allocated,
2634         .orphan_count           = &tcp_orphan_count,
2635         .memory_allocated       = &tcp_memory_allocated,
2636         .memory_pressure        = &tcp_memory_pressure,
2637         .sysctl_wmem            = sysctl_tcp_wmem,
2638         .sysctl_rmem            = sysctl_tcp_rmem,
2639         .max_header             = MAX_TCP_HEADER,
2640         .obj_size               = sizeof(struct tcp_sock),
2641         .slab_flags             = SLAB_DESTROY_BY_RCU,
2642         .twsk_prot              = &tcp_timewait_sock_ops,
2643         .rsk_prot               = &tcp_request_sock_ops,
2644         .h.hashinfo             = &tcp_hashinfo,
2645         .no_autobind            = true,
2646 #ifdef CONFIG_COMPAT
2647         .compat_setsockopt      = compat_tcp_setsockopt,
2648         .compat_getsockopt      = compat_tcp_getsockopt,
2649 #endif
2650 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2651         .init_cgroup            = tcp_init_cgroup,
2652         .destroy_cgroup         = tcp_destroy_cgroup,
2653         .proto_cgroup           = tcp_proto_cgroup,
2654 #endif
2655 };
2656 EXPORT_SYMBOL(tcp_prot);
2657
2658 static int __net_init tcp_sk_init(struct net *net)
2659 {
2660         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2661                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2662 }
2663
2664 static void __net_exit tcp_sk_exit(struct net *net)
2665 {
2666         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2667 }
2668
2669 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2670 {
2671         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2672 }
2673
2674 static struct pernet_operations __net_initdata tcp_sk_ops = {
2675        .init       = tcp_sk_init,
2676        .exit       = tcp_sk_exit,
2677        .exit_batch = tcp_sk_exit_batch,
2678 };
2679
2680 void __init tcp_v4_init(void)
2681 {
2682         inet_hashinfo_init(&tcp_hashinfo);
2683         if (register_pernet_subsys(&tcp_sk_ops))
2684                 panic("Failed to create the TCP control socket.\n");
2685 }