2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
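/* Editor's note: an illustrative, self-contained sketch of the rover-style
 * scan used by tcp_v4_get_port() below -- simplified, userspace-style C with
 * no hash-bucket locking; the helper name is hypothetical and is not part of
 * this file.
 */
#if 0
static int next_candidate_port(int *rover, int low, int high)
{
	/* Advance the rover and wrap it back into [low, high]. */
	if (++(*rover) > high || *rover < low)
		*rover = low;
	return *rover;
}
/* Usage sketch: try at most (high - low + 1) candidates, which is exactly
 * what the do/while loop in tcp_v4_get_port() bounds with "remaining". */
#endif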
107 /* Caller must disable local BH processing. */
108 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
110 struct inet_bind_hashbucket *head =
111 &tcp_bhash[inet_bhashfn(inet_sk(child)->num,
113 struct inet_bind_bucket *tb;
115 spin_lock(&head->lock);
116 tb = inet_sk(sk)->bind_hash;
117 sk_add_bind_node(child, &tb->owners);
118 inet_sk(child)->bind_hash = tb;
119 spin_unlock(&head->lock);
122 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
125 __tcp_inherit_port(sk, child);
129 void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
130 const unsigned short snum)
132 struct inet_sock *inet = inet_sk(sk);
134 sk_add_bind_node(sk, &tb->owners);
135 inet->bind_hash = tb;
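/* Editor's note: a bind "conflict" below means another socket already in this
 * port's owner list whose local address overlaps ours (either side wildcard,
 * or an exact match), whose bound device does not differ from ours, and for
 * which address reuse is not permitted (one of the two sockets lacks
 * sk_reuse, or the existing one is in TCP_LISTEN state).
 */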
138 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
140 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
142 struct hlist_node *node;
143 int reuse = sk->sk_reuse;
145 sk_for_each_bound(sk2, node, &tb->owners) {
147 !tcp_v6_ipv6only(sk2) &&
148 (!sk->sk_bound_dev_if ||
149 !sk2->sk_bound_dev_if ||
150 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
151 if (!reuse || !sk2->sk_reuse ||
152 sk2->sk_state == TCP_LISTEN) {
153 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
154 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
155 sk2_rcv_saddr == sk_rcv_saddr)
163 /* Obtain a reference to a local port for the given sock,
164 * if snum is zero it means select any available local port.
166 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
168 struct inet_bind_hashbucket *head;
169 struct hlist_node *node;
170 struct inet_bind_bucket *tb;
175 int low = sysctl_local_port_range[0];
176 int high = sysctl_local_port_range[1];
177 int remaining = (high - low) + 1;
180 spin_lock(&tcp_portalloc_lock);
181 if (tcp_port_rover < low)
184 rover = tcp_port_rover;
189 head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)];
190 spin_lock(&head->lock);
191 inet_bind_bucket_for_each(tb, node, &head->chain)
192 if (tb->port == rover)
196 spin_unlock(&head->lock);
197 } while (--remaining > 0);
198 tcp_port_rover = rover;
199 spin_unlock(&tcp_portalloc_lock);
201 /* Exhausted local port range during search? It is not
202 * possible for us to be holding one of the bind hash
203 * locks if this test triggers, because if 'remaining'
204 * drops to zero, we broke out of the do/while loop at
205 * the top level, not from the 'break;' statement.
208 if (unlikely(remaining <= 0))
211 /* OK, here is the one we will use. HEAD is
212 * non-NULL and we hold its mutex.
216 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
217 spin_lock(&head->lock);
218 inet_bind_bucket_for_each(tb, node, &head->chain)
219 if (tb->port == snum)
225 if (!hlist_empty(&tb->owners)) {
226 if (sk->sk_reuse > 1)
228 if (tb->fastreuse > 0 &&
229 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
233 if (tcp_bind_conflict(sk, tb))
239 if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL)
241 if (hlist_empty(&tb->owners)) {
242 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
246 } else if (tb->fastreuse &&
247 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
250 if (!inet_sk(sk)->bind_hash)
251 tcp_bind_hash(sk, tb, snum);
252 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
256 spin_unlock(&head->lock);
262 /* Get rid of any references to a local port held by the
265 static void __tcp_put_port(struct sock *sk)
267 struct inet_sock *inet = inet_sk(sk);
268 struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num,
270 struct inet_bind_bucket *tb;
272 spin_lock(&head->lock);
273 tb = inet->bind_hash;
274 __sk_del_bind_node(sk);
275 inet->bind_hash = NULL;
277 inet_bind_bucket_destroy(tcp_bucket_cachep, tb);
278 spin_unlock(&head->lock);
281 void tcp_put_port(struct sock *sk)
288 /* Sleeping here without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP.
289 * Look, when several writers sleep and a reader wakes them up, all but one
290 * immediately hit the write lock and grab all the CPUs. An exclusive sleep solves
291 * this, _but_ remember that it adds useless work on UP machines (a wake-up on each
292 * exclusive lock release). It really should be ifdefed.
295 void tcp_listen_wlock(void)
297 write_lock(&tcp_lhash_lock);
299 if (atomic_read(&tcp_lhash_users)) {
303 prepare_to_wait_exclusive(&tcp_lhash_wait,
304 &wait, TASK_UNINTERRUPTIBLE);
305 if (!atomic_read(&tcp_lhash_users))
307 write_unlock_bh(&tcp_lhash_lock);
309 write_lock_bh(&tcp_lhash_lock);
312 finish_wait(&tcp_lhash_wait, &wait);
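/* Editor's note: the above is the writer side only. The read side (the
 * tcp_listen_lock()/tcp_listen_unlock() helpers defined in the header) bumps
 * tcp_lhash_users under a brief read_lock of tcp_lhash_lock, and the last
 * reader wakes tcp_lhash_wait; tcp_listen_wlock() loops until that user count
 * drains to zero before keeping the write lock.
 */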
316 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
318 struct hlist_head *list;
321 BUG_TRAP(sk_unhashed(sk));
322 if (listen_possible && sk->sk_state == TCP_LISTEN) {
323 list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)];
324 lock = &tcp_lhash_lock;
327 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
328 list = &tcp_ehash[sk->sk_hashent].chain;
329 lock = &tcp_ehash[sk->sk_hashent].lock;
332 __sk_add_node(sk, list);
333 sock_prot_inc_use(sk->sk_prot);
335 if (listen_possible && sk->sk_state == TCP_LISTEN)
336 wake_up(&tcp_lhash_wait);
339 static void tcp_v4_hash(struct sock *sk)
341 if (sk->sk_state != TCP_CLOSE) {
343 __tcp_v4_hash(sk, 1);
348 void tcp_unhash(struct sock *sk)
355 if (sk->sk_state == TCP_LISTEN) {
358 lock = &tcp_lhash_lock;
360 struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
362 write_lock_bh(&head->lock);
365 if (__sk_del_node_init(sk))
366 sock_prot_dec_use(sk->sk_prot);
367 write_unlock_bh(lock);
370 if (sk->sk_state == TCP_LISTEN)
371 wake_up(&tcp_lhash_wait);
374 /* Don't inline this cruft. There are some nice properties to
375 * exploit here. The BSD API does not allow a listening TCP
376 * to specify the remote port nor the remote address for the
377 * connection. So always assume those are both wildcarded
378 * during the search since they can never be otherwise.
380 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
382 const unsigned short hnum,
385 struct sock *result = NULL, *sk;
386 struct hlist_node *node;
390 sk_for_each(sk, node, head) {
391 struct inet_sock *inet = inet_sk(sk);
393 if (inet->num == hnum && !ipv6_only_sock(sk)) {
394 __u32 rcv_saddr = inet->rcv_saddr;
396 score = (sk->sk_family == PF_INET ? 1 : 0);
398 if (rcv_saddr != daddr)
402 if (sk->sk_bound_dev_if) {
403 if (sk->sk_bound_dev_if != dif)
409 if (score > hiscore) {
418 /* Optimize the common listener case. */
419 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
420 const unsigned short hnum,
423 struct sock *sk = NULL;
424 struct hlist_head *head;
426 read_lock(&tcp_lhash_lock);
427 head = &tcp_listening_hash[inet_lhashfn(hnum)];
428 if (!hlist_empty(head)) {
429 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
431 if (inet->num == hnum && !sk->sk_node.next &&
432 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
433 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
434 !sk->sk_bound_dev_if)
436 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
442 read_unlock(&tcp_lhash_lock);
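/* Editor's note: an illustrative sketch (hypothetical helper, not kernel
 * code) of the scoring rule applied in __tcp_v4_lookup_listener() above: a
 * listener bound to the exact local address and/or to the incoming device
 * outranks a wildcard listener, and an AF_INET socket gets a one-point edge
 * over a v4-mapped AF_INET6 one.
 */
#if 0
static int listener_score(int family_is_inet, int exact_local_addr,
			  int bound_to_matching_dev)
{
	int score = family_is_inet ? 1 : 0;

	if (exact_local_addr)
		score += 2;
	if (bound_to_matching_dev)
		score += 2;
	return score;	/* the best-scoring listener wins the lookup */
}
#endif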
446 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
447 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
449 * Local BH must be disabled here.
452 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
458 struct inet_ehash_bucket *head;
459 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
460 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
462 struct hlist_node *node;
463 /* Optimize here for direct hit, only listening connections can
464 * have wildcards anyways.
466 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
467 head = &tcp_ehash[hash];
468 read_lock(&head->lock);
469 sk_for_each(sk, node, &head->chain) {
470 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
471 goto hit; /* You sunk my battleship! */
474 /* Must check for a TIME_WAIT'er before going to listener hash. */
475 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
476 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
481 read_unlock(&head->lock);
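/* Editor's note: as the two loops above show, the established table is laid
 * out in two halves of tcp_ehash_size buckets each -- live connections hash
 * into &tcp_ehash[hash] and their TIME_WAIT remnants into the twin bucket at
 * &tcp_ehash[hash + tcp_ehash_size] -- which is the split described in the
 * changelog at the top of this file.
 */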
488 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
489 u32 daddr, u16 hnum, int dif)
491 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
494 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
497 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
503 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
509 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
511 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
513 return secure_tcp_sequence_number(skb->nh.iph->daddr,
519 /* called with local bh disabled */
520 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
521 struct tcp_tw_bucket **twp)
523 struct inet_sock *inet = inet_sk(sk);
524 u32 daddr = inet->rcv_saddr;
525 u32 saddr = inet->daddr;
526 int dif = sk->sk_bound_dev_if;
527 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
528 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
529 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
530 struct inet_ehash_bucket *head = &tcp_ehash[hash];
532 struct hlist_node *node;
533 struct tcp_tw_bucket *tw;
535 write_lock(&head->lock);
537 /* Check TIME-WAIT sockets first. */
538 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
539 tw = (struct tcp_tw_bucket *)sk2;
541 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
542 struct tcp_sock *tp = tcp_sk(sk);
544 /* With PAWS, it is safe from the viewpoint
545 of data integrity. Even without PAWS it
546 is safe provided sequence spaces do not
547 overlap i.e. at data rates <= 80Mbit/sec.
549 Actually, the idea is close to VJ's,
550 only the timestamp cache is held not per host
551 but per port pair, and the TW bucket is used
554 If the TW bucket has already been destroyed we
555 fall back to VJ's scheme and use the initial
556 timestamp retrieved from the peer table.
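(Editor's note, a rough worked example of the 80Mbit/sec figure above,
assuming MSL = 120 sec: reuse is safe as long as the old connection's
half of the 2^32-byte sequence space cannot be consumed within 2*MSL,
i.e. below roughly 2^31 bytes / 240 sec ~= 8.9 Mbyte/sec ~= 72 Mbit/sec.)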
558 if (tw->tw_ts_recent_stamp &&
559 (!twp || (sysctl_tcp_tw_reuse &&
561 tw->tw_ts_recent_stamp > 1))) {
563 tw->tw_snd_nxt + 65535 + 2) == 0)
565 tp->rx_opt.ts_recent = tw->tw_ts_recent;
566 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
575 /* And established part... */
576 sk_for_each(sk2, node, &head->chain) {
577 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
582 /* Must record num and sport now. Otherwise we will see
583 * a socket with a funny identity in the hash table. */
585 inet->sport = htons(lport);
586 sk->sk_hashent = hash;
587 BUG_TRAP(sk_unhashed(sk));
588 __sk_add_node(sk, &head->chain);
589 sock_prot_inc_use(sk->sk_prot);
590 write_unlock(&head->lock);
594 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
596 /* Silly. Should hash-dance instead... */
597 tcp_tw_deschedule(tw);
598 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
606 write_unlock(&head->lock);
607 return -EADDRNOTAVAIL;
610 static inline u32 connect_port_offset(const struct sock *sk)
612 const struct inet_sock *inet = inet_sk(sk);
614 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
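/* Editor's note: the offset computed above seeds the ephemeral-port search in
 * tcp_v4_hash_connect() below, so that connections to different
 * (saddr, daddr, dport) tuples start their scan of the local port range at
 * different, hard-to-predict positions.
 */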
619 * Bind a port for a connect operation and hash it.
621 static inline int tcp_v4_hash_connect(struct sock *sk)
623 const unsigned short snum = inet_sk(sk)->num;
624 struct inet_bind_hashbucket *head;
625 struct inet_bind_bucket *tb;
629 int low = sysctl_local_port_range[0];
630 int high = sysctl_local_port_range[1];
631 int range = high - low;
635 u32 offset = hint + connect_port_offset(sk);
636 struct hlist_node *node;
637 struct tcp_tw_bucket *tw = NULL;
640 for (i = 1; i <= range; i++) {
641 port = low + (i + offset) % range;
642 head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)];
643 spin_lock(&head->lock);
645 /* Does not bother with rcv_saddr checks,
646 * because the established check is already
649 inet_bind_bucket_for_each(tb, node, &head->chain) {
650 if (tb->port == port) {
651 BUG_TRAP(!hlist_empty(&tb->owners));
652 if (tb->fastreuse >= 0)
654 if (!__tcp_v4_check_established(sk,
662 tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port);
664 spin_unlock(&head->lock);
671 spin_unlock(&head->lock);
675 return -EADDRNOTAVAIL;
680 /* Head lock still held and bh's disabled */
681 tcp_bind_hash(sk, tb, port);
682 if (sk_unhashed(sk)) {
683 inet_sk(sk)->sport = htons(port);
684 __tcp_v4_hash(sk, 0);
686 spin_unlock(&head->lock);
689 tcp_tw_deschedule(tw);
697 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
698 tb = inet_sk(sk)->bind_hash;
699 spin_lock_bh(&head->lock);
700 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
701 __tcp_v4_hash(sk, 0);
702 spin_unlock_bh(&head->lock);
705 spin_unlock(&head->lock);
706 /* No definite answer... Walk to established hash table */
707 ret = __tcp_v4_check_established(sk, snum, NULL);
714 /* This will initiate an outgoing connection. */
715 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
717 struct inet_sock *inet = inet_sk(sk);
718 struct tcp_sock *tp = tcp_sk(sk);
719 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
725 if (addr_len < sizeof(struct sockaddr_in))
728 if (usin->sin_family != AF_INET)
729 return -EAFNOSUPPORT;
731 nexthop = daddr = usin->sin_addr.s_addr;
732 if (inet->opt && inet->opt->srr) {
735 nexthop = inet->opt->faddr;
738 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
739 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
741 inet->sport, usin->sin_port, sk);
745 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
750 if (!inet->opt || !inet->opt->srr)
754 inet->saddr = rt->rt_src;
755 inet->rcv_saddr = inet->saddr;
757 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
758 /* Reset inherited state */
759 tp->rx_opt.ts_recent = 0;
760 tp->rx_opt.ts_recent_stamp = 0;
764 if (sysctl_tcp_tw_recycle &&
765 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
766 struct inet_peer *peer = rt_get_peer(rt);
768 /* VJ's idea. We save the last timestamp seen from
769 * the destination in the peer table when entering TIME-WAIT state,
770 * and initialize rx_opt.ts_recent from it when trying a new connection.
773 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
774 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
775 tp->rx_opt.ts_recent = peer->tcp_ts;
779 inet->dport = usin->sin_port;
782 tp->ext_header_len = 0;
784 tp->ext_header_len = inet->opt->optlen;
786 tp->rx_opt.mss_clamp = 536;
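/* Editor's note: 536 is the classic default MSS -- the 576-byte minimum
 * IPv4 datagram every host must accept, minus 20 bytes of IP header and
 * 20 bytes of TCP header. */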
788 /* Socket identity is still unknown (sport may be zero).
789 * However, we set the state to SYN-SENT and, without releasing the socket
790 * lock, select a source port, enter ourselves into the hash tables and
791 * complete initialization after this.
793 tcp_set_state(sk, TCP_SYN_SENT);
794 err = tcp_v4_hash_connect(sk);
798 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
802 /* OK, now commit destination to socket. */
803 sk_setup_caps(sk, &rt->u.dst);
806 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
811 inet->id = tp->write_seq ^ jiffies;
813 err = tcp_connect(sk);
821 /* This unhashes the socket and releases the local port, if necessary. */
822 tcp_set_state(sk, TCP_CLOSE);
824 sk->sk_route_caps = 0;
829 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
831 return ((struct rtable *)skb->dst)->rt_iif;
834 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
836 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
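/* Editor's note: the mask above relies on TCP_SYNQ_HSIZE being a power of
 * two, so "& (TCP_SYNQ_HSIZE - 1)" reduces the jhash value to a valid bucket
 * index without a division.
 */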
839 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
840 struct request_sock ***prevp,
842 __u32 raddr, __u32 laddr)
844 struct listen_sock *lopt = tp->accept_queue.listen_opt;
845 struct request_sock *req, **prev;
847 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
848 (req = *prev) != NULL;
849 prev = &req->dl_next) {
850 const struct inet_request_sock *ireq = inet_rsk(req);
852 if (ireq->rmt_port == rport &&
853 ireq->rmt_addr == raddr &&
854 ireq->loc_addr == laddr &&
855 TCP_INET_FAMILY(req->rsk_ops->family)) {
865 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
867 struct tcp_sock *tp = tcp_sk(sk);
868 struct listen_sock *lopt = tp->accept_queue.listen_opt;
869 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
871 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
877 * This routine does path mtu discovery as defined in RFC1191.
879 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
882 struct dst_entry *dst;
883 struct inet_sock *inet = inet_sk(sk);
884 struct tcp_sock *tp = tcp_sk(sk);
886 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
887 * sent out by Linux are always < 576 bytes, so they should go through
890 if (sk->sk_state == TCP_LISTEN)
893 /* We don't check in the dst entry whether pmtu discovery is forbidden
894 * on this route. We just assume that no packet-too-big packets
895 * are sent back when pmtu discovery is not active.
896 * There is a small race when the user changes this flag in the
897 * route, but I think that's acceptable.
899 if ((dst = __sk_dst_check(sk, 0)) == NULL)
902 dst->ops->update_pmtu(dst, mtu);
904 /* Something is about to go wrong... Remember the soft error
905 * for the case that this connection is not able to recover.
907 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
908 sk->sk_err_soft = EMSGSIZE;
912 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
913 tp->pmtu_cookie > mtu) {
914 tcp_sync_mss(sk, mtu);
916 /* Resend the TCP packet because it's
917 * clear that the old packet has been
918 * dropped. This is the new "fast" path mtu
921 tcp_simple_retransmit(sk);
922 } /* else let the usual retransmit timer handle it */
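/* Editor's note, a worked example of the path above (numbers are
 * illustrative only): an ICMP_FRAG_NEEDED quoting mtu = 1400 against a
 * cached pmtu_cookie of 1500 makes tcp_sync_mss() shrink the MSS to roughly
 * 1400 - 40 = 1360 bytes (ignoring IP and TCP options), after which
 * tcp_simple_retransmit() resends the segments that were too large.
 */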
926 * This routine is called by the ICMP module when it gets some
927 * sort of error condition. If err < 0 then the socket should
928 * be closed and the error returned to the user. If err > 0
929 * it's just the icmp type << 8 | icmp code. After adjustment
930 * header points to the first 8 bytes of the tcp header. We need
931 * to find the appropriate port.
933 * The locking strategy used here is very "optimistic". When
934 * someone else accesses the socket, the ICMP is just dropped,
935 * and for some paths there is no check at all.
936 * A more general error queue to queue errors for later handling
937 * is probably better.
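/* Editor's note, an example of the "type << 8 | code" encoding described
 * above: a Destination Unreachable / Port Unreachable report arrives as
 * (ICMP_DEST_UNREACH << 8) | ICMP_PORT_UNREACH, i.e. (3 << 8) | 3 = 0x0303.
 */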
941 void tcp_v4_err(struct sk_buff *skb, u32 info)
943 struct iphdr *iph = (struct iphdr *)skb->data;
944 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
946 struct inet_sock *inet;
947 int type = skb->h.icmph->type;
948 int code = skb->h.icmph->code;
953 if (skb->len < (iph->ihl << 2) + 8) {
954 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
958 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
959 th->source, tcp_v4_iif(skb));
961 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
964 if (sk->sk_state == TCP_TIME_WAIT) {
965 tcp_tw_put((struct tcp_tw_bucket *)sk);
970 /* If too many ICMPs get dropped on busy
971 * servers this needs to be solved differently.
973 if (sock_owned_by_user(sk))
974 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
976 if (sk->sk_state == TCP_CLOSE)
980 seq = ntohl(th->seq);
981 if (sk->sk_state != TCP_LISTEN &&
982 !between(seq, tp->snd_una, tp->snd_nxt)) {
983 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
988 case ICMP_SOURCE_QUENCH:
989 /* Just silently ignore these. */
991 case ICMP_PARAMETERPROB:
994 case ICMP_DEST_UNREACH:
995 if (code > NR_ICMP_UNREACH)
998 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
999 if (!sock_owned_by_user(sk))
1000 do_pmtu_discovery(sk, iph, info);
1004 err = icmp_err_convert[code].errno;
1006 case ICMP_TIME_EXCEEDED:
1013 switch (sk->sk_state) {
1014 struct request_sock *req, **prev;
1016 if (sock_owned_by_user(sk))
1019 req = tcp_v4_search_req(tp, &prev, th->dest,
1020 iph->daddr, iph->saddr);
1024 /* ICMPs are not backlogged, hence we cannot get
1025 an established socket here.
1029 if (seq != tcp_rsk(req)->snt_isn) {
1030 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1035 * Still in SYN_RECV, just remove it silently.
1036 * There is no good way to pass the error to the newly
1037 * created socket, and POSIX does not want network
1038 * errors returned from accept().
1040 tcp_synq_drop(sk, req, prev);
1044 case TCP_SYN_RECV: /* Cannot happen normally.
1045 It can, e.g., if SYNs crossed.
1047 if (!sock_owned_by_user(sk)) {
1048 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1051 sk->sk_error_report(sk);
1055 sk->sk_err_soft = err;
1060 /* If we've already connected we will keep trying
1061 * until we time out, or the user gives up.
1063 * rfc1122 4.2.3.9 allows us to treat as hard errors
1064 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1065 * but it is obsoleted by pmtu discovery).
1067 * Note that in the modern internet, where routing is unreliable
1068 * and broken firewalls sit in every dark corner sending random
1069 * errors as ordered by their masters, even these two messages have finally lost
1070 * their original sense (even Linux sends invalid PORT_UNREACHs)
1072 * Now we are in compliance with RFCs.
1077 if (!sock_owned_by_user(sk) && inet->recverr) {
1079 sk->sk_error_report(sk);
1080 } else { /* Only an error on timeout */
1081 sk->sk_err_soft = err;
1089 /* This routine computes an IPv4 TCP checksum. */
1090 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1091 struct sk_buff *skb)
1093 struct inet_sock *inet = inet_sk(sk);
1095 if (skb->ip_summed == CHECKSUM_HW) {
1096 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1097 skb->csum = offsetof(struct tcphdr, check);
1099 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1100 csum_partial((char *)th,
1107 * This routine will send an RST to the other tcp.
1109 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1111 * Answer: if a packet caused an RST, it is not for a socket
1112 * existing in our system; if it does match a socket,
1113 * it is just a duplicate segment or a bug in the other side's TCP.
1114 * So we build the reply based only on the parameters
1115 * that arrived with the segment.
1116 * Exception: precedence violation. We do not implement it in any case.
1119 static void tcp_v4_send_reset(struct sk_buff *skb)
1121 struct tcphdr *th = skb->h.th;
1123 struct ip_reply_arg arg;
1125 /* Never send a reset in response to a reset. */
1129 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1132 /* Swap the send and the receive. */
1133 memset(&rth, 0, sizeof(struct tcphdr));
1134 rth.dest = th->source;
1135 rth.source = th->dest;
1136 rth.doff = sizeof(struct tcphdr) / 4;
1140 rth.seq = th->ack_seq;
1143 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1144 skb->len - (th->doff << 2));
1147 memset(&arg, 0, sizeof arg);
1148 arg.iov[0].iov_base = (unsigned char *)&rth;
1149 arg.iov[0].iov_len = sizeof rth;
1150 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1151 skb->nh.iph->saddr, /*XXX*/
1152 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1153 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1155 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1157 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1158 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
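/* Editor's note: an illustrative, self-contained sketch (hypothetical helper,
 * not kernel code) of how the RST fields above are derived from the offending
 * segment, per RFC 793: echo the peer's ACK as our SEQ when the segment had
 * one, otherwise send SEQ 0 and ACK everything the segment occupied.
 */
#if 0
struct rst_fields {
	unsigned int seq;
	unsigned int ack_seq;
	int ack;
};

static struct rst_fields rst_for(unsigned int seq, unsigned int ack_seq,
				 int has_ack, int syn, int fin, int data_len)
{
	struct rst_fields r = { 0, 0, 0 };

	if (has_ack) {
		r.seq = ack_seq;	/* <SEQ=SEG.ACK><CTL=RST> */
	} else {
		r.ack = 1;		/* <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> */
		r.ack_seq = seq + syn + fin + data_len;
	}
	return r;
}
#endif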
1161 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1162 outside of socket context, is certainly ugly. What can I do?
1165 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1168 struct tcphdr *th = skb->h.th;
1173 struct ip_reply_arg arg;
1175 memset(&rep.th, 0, sizeof(struct tcphdr));
1176 memset(&arg, 0, sizeof arg);
1178 arg.iov[0].iov_base = (unsigned char *)&rep;
1179 arg.iov[0].iov_len = sizeof(rep.th);
1181 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1182 (TCPOPT_TIMESTAMP << 8) |
1184 rep.tsopt[1] = htonl(tcp_time_stamp);
1185 rep.tsopt[2] = htonl(ts);
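/* Editor's note: the three words above encode a 12-byte option block: two
 * NOP pad bytes, then the 10-byte TCP timestamp option carrying TSval (the
 * current tcp_time_stamp) and TSecr (the peer's echoed timestamp, "ts"). */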
1186 arg.iov[0].iov_len = sizeof(rep);
1189 /* Swap the send and the receive. */
1190 rep.th.dest = th->source;
1191 rep.th.source = th->dest;
1192 rep.th.doff = arg.iov[0].iov_len / 4;
1193 rep.th.seq = htonl(seq);
1194 rep.th.ack_seq = htonl(ack);
1196 rep.th.window = htons(win);
1198 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1199 skb->nh.iph->saddr, /*XXX*/
1200 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1201 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1203 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1205 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1208 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1210 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1212 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1213 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1218 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1220 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1224 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1225 struct request_sock *req)
1228 const struct inet_request_sock *ireq = inet_rsk(req);
1229 struct ip_options *opt = inet_rsk(req)->opt;
1230 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1232 { .daddr = ((opt && opt->srr) ?
1235 .saddr = ireq->loc_addr,
1236 .tos = RT_CONN_FLAGS(sk) } },
1237 .proto = IPPROTO_TCP,
1239 { .sport = inet_sk(sk)->sport,
1240 .dport = ireq->rmt_port } } };
1242 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1243 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1246 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1248 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1255 * Send a SYN-ACK after having received a SYN.
1256 * This still operates on a request_sock only, not on a big
1259 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1260 struct dst_entry *dst)
1262 const struct inet_request_sock *ireq = inet_rsk(req);
1264 struct sk_buff * skb;
1266 /* First, grab a route. */
1267 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1270 skb = tcp_make_synack(sk, dst, req);
1273 struct tcphdr *th = skb->h.th;
1275 th->check = tcp_v4_check(th, skb->len,
1278 csum_partial((char *)th, skb->len,
1281 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1284 if (err == NET_XMIT_CN)
1294 * IPv4 request_sock destructor.
1296 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1298 if (inet_rsk(req)->opt)
1299 kfree(inet_rsk(req)->opt);
1302 static inline void syn_flood_warning(struct sk_buff *skb)
1304 static unsigned long warntime;
1306 if (time_after(jiffies, (warntime + HZ * 60))) {
1309 "possible SYN flooding on port %d. Sending cookies.\n",
1310 ntohs(skb->h.th->dest));
1315 * Save and compile IPv4 options into the request_sock if needed.
1317 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1318 struct sk_buff *skb)
1320 struct ip_options *opt = &(IPCB(skb)->opt);
1321 struct ip_options *dopt = NULL;
1323 if (opt && opt->optlen) {
1324 int opt_size = optlength(opt);
1325 dopt = kmalloc(opt_size, GFP_ATOMIC);
1327 if (ip_options_echo(dopt, skb)) {
1336 struct request_sock_ops tcp_request_sock_ops = {
1338 .obj_size = sizeof(struct tcp_request_sock),
1339 .rtx_syn_ack = tcp_v4_send_synack,
1340 .send_ack = tcp_v4_reqsk_send_ack,
1341 .destructor = tcp_v4_reqsk_destructor,
1342 .send_reset = tcp_v4_send_reset,
1345 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1347 struct inet_request_sock *ireq;
1348 struct tcp_options_received tmp_opt;
1349 struct request_sock *req;
1350 __u32 saddr = skb->nh.iph->saddr;
1351 __u32 daddr = skb->nh.iph->daddr;
1352 __u32 isn = TCP_SKB_CB(skb)->when;
1353 struct dst_entry *dst = NULL;
1354 #ifdef CONFIG_SYN_COOKIES
1355 int want_cookie = 0;
1357 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1360 /* Never answer SYNs sent to broadcast or multicast */
1361 if (((struct rtable *)skb->dst)->rt_flags &
1362 (RTCF_BROADCAST | RTCF_MULTICAST))
1365 /* TW buckets are converted to open requests without
1366 * limitation; they conserve resources and the peer is
1367 * evidently a real one.
1369 if (tcp_synq_is_full(sk) && !isn) {
1370 #ifdef CONFIG_SYN_COOKIES
1371 if (sysctl_tcp_syncookies) {
1378 /* The accept backlog is full. If we have already queued enough
1379 * warm entries in the syn queue, drop the request. That is better than
1380 * clogging the syn queue with openreqs with exponentially increasing
1383 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1386 req = reqsk_alloc(&tcp_request_sock_ops);
1390 tcp_clear_options(&tmp_opt);
1391 tmp_opt.mss_clamp = 536;
1392 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1394 tcp_parse_options(skb, &tmp_opt, 0);
1397 tcp_clear_options(&tmp_opt);
1398 tmp_opt.saw_tstamp = 0;
1401 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1402 /* Some OSes (unknown ones, but I see them on web servers which
1403 * contain information interesting only to Windows
1404 * users) do not send their timestamp in the SYN. It is an easy case:
1405 * we simply do not advertise TS support.
1407 tmp_opt.saw_tstamp = 0;
1408 tmp_opt.tstamp_ok = 0;
1410 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1412 tcp_openreq_init(req, &tmp_opt, skb);
1414 ireq = inet_rsk(req);
1415 ireq->loc_addr = daddr;
1416 ireq->rmt_addr = saddr;
1417 ireq->opt = tcp_v4_save_options(sk, skb);
1419 TCP_ECN_create_request(req, skb->h.th);
1422 #ifdef CONFIG_SYN_COOKIES
1423 syn_flood_warning(skb);
1425 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1427 struct inet_peer *peer = NULL;
1429 /* VJ's idea. We save last timestamp seen
1430 * from the destination in peer table, when entering
1431 * state TIME-WAIT, and check against it before
1432 * accepting new connection request.
1434 * If "isn" is not zero, this request hit an alive
1435 * timewait bucket, so all the necessary checks
1436 * have already been made by the code processing the timewait state.
1438 if (tmp_opt.saw_tstamp &&
1439 sysctl_tcp_tw_recycle &&
1440 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1441 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1442 peer->v4daddr == saddr) {
1443 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1444 (s32)(peer->tcp_ts - req->ts_recent) >
1446 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1451 /* Kill the following clause, if you dislike this way. */
1452 else if (!sysctl_tcp_syncookies &&
1453 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1454 (sysctl_max_syn_backlog >> 2)) &&
1455 (!peer || !peer->tcp_ts_stamp) &&
1456 (!dst || !dst_metric(dst, RTAX_RTT))) {
1457 /* Without syncookies the last quarter of
1458 * the backlog is filled only with destinations
1459 * proven to be alive.
1460 * It means that we continue to communicate
1461 * with destinations already remembered
1462 * by the moment of the synflood.
1464 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1465 "request from %u.%u."
1468 ntohs(skb->h.th->source)));
1473 isn = tcp_v4_init_sequence(sk, skb);
1475 tcp_rsk(req)->snt_isn = isn;
1477 if (tcp_v4_send_synack(sk, req, dst))
1483 tcp_v4_synq_add(sk, req);
1490 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1496 * The three way handshake has completed - we got a valid synack -
1497 * now create the new socket.
1499 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1500 struct request_sock *req,
1501 struct dst_entry *dst)
1503 struct inet_request_sock *ireq;
1504 struct inet_sock *newinet;
1505 struct tcp_sock *newtp;
1508 if (sk_acceptq_is_full(sk))
1511 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1514 newsk = tcp_create_openreq_child(sk, req, skb);
1518 sk_setup_caps(newsk, dst);
1520 newtp = tcp_sk(newsk);
1521 newinet = inet_sk(newsk);
1522 ireq = inet_rsk(req);
1523 newinet->daddr = ireq->rmt_addr;
1524 newinet->rcv_saddr = ireq->loc_addr;
1525 newinet->saddr = ireq->loc_addr;
1526 newinet->opt = ireq->opt;
1528 newinet->mc_index = tcp_v4_iif(skb);
1529 newinet->mc_ttl = skb->nh.iph->ttl;
1530 newtp->ext_header_len = 0;
1532 newtp->ext_header_len = newinet->opt->optlen;
1533 newinet->id = newtp->write_seq ^ jiffies;
1535 tcp_sync_mss(newsk, dst_mtu(dst));
1536 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1537 tcp_initialize_rcv_mss(newsk);
1539 __tcp_v4_hash(newsk, 0);
1540 __tcp_inherit_port(sk, newsk);
1545 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1547 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1552 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1554 struct tcphdr *th = skb->h.th;
1555 struct iphdr *iph = skb->nh.iph;
1556 struct tcp_sock *tp = tcp_sk(sk);
1558 struct request_sock **prev;
1559 /* Find possible connection requests. */
1560 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1561 iph->saddr, iph->daddr);
1563 return tcp_check_req(sk, skb, req, prev);
1565 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1572 if (nsk->sk_state != TCP_TIME_WAIT) {
1576 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1580 #ifdef CONFIG_SYN_COOKIES
1581 if (!th->rst && !th->syn && th->ack)
1582 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1587 static int tcp_v4_checksum_init(struct sk_buff *skb)
1589 if (skb->ip_summed == CHECKSUM_HW) {
1590 skb->ip_summed = CHECKSUM_UNNECESSARY;
1591 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1592 skb->nh.iph->daddr, skb->csum))
1595 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1596 skb->ip_summed = CHECKSUM_NONE;
1598 if (skb->len <= 76) {
1599 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1601 skb_checksum(skb, 0, skb->len, 0)))
1603 skb->ip_summed = CHECKSUM_UNNECESSARY;
1605 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1607 skb->nh.iph->daddr, 0);
1613 /* The socket must have its spinlock held when we get
1616 * We have a potential double-lock case here, so even when
1617 * doing backlog processing we use the BH locking scheme.
1618 * This is because we cannot sleep with the original spinlock
1621 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1623 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1624 TCP_CHECK_TIMER(sk);
1625 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1627 TCP_CHECK_TIMER(sk);
1631 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1634 if (sk->sk_state == TCP_LISTEN) {
1635 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1640 if (tcp_child_process(sk, nsk, skb))
1646 TCP_CHECK_TIMER(sk);
1647 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1649 TCP_CHECK_TIMER(sk);
1653 tcp_v4_send_reset(skb);
1656 /* Be careful here. If this function gets more complicated and
1657 * gcc suffers from register pressure on the x86, sk (in %ebx)
1658 * might be destroyed here. This current version compiles correctly,
1659 * but you have been warned.
1664 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1672 int tcp_v4_rcv(struct sk_buff *skb)
1678 if (skb->pkt_type != PACKET_HOST)
1681 /* Count it even if it's bad */
1682 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1684 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1689 if (th->doff < sizeof(struct tcphdr) / 4)
1691 if (!pskb_may_pull(skb, th->doff * 4))
1694 /* An explanation is required here, I think.
1695 * Packet length and doff are validated by header prediction,
1696 * provided the case of th->doff==0 is eliminated.
1697 * So, we defer the checks. */
1698 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1699 tcp_v4_checksum_init(skb) < 0))
1703 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1704 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1705 skb->len - th->doff * 4);
1706 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1707 TCP_SKB_CB(skb)->when = 0;
1708 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1709 TCP_SKB_CB(skb)->sacked = 0;
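/* Editor's note, a quick example of the end_seq arithmetic above: a pure ACK
 * (no data, no SYN/FIN) has end_seq == seq, while a SYN or a FIN each consume
 * one extra unit of sequence space on top of the payload length. */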
1711 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1712 skb->nh.iph->daddr, ntohs(th->dest),
1719 if (sk->sk_state == TCP_TIME_WAIT)
1722 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1723 goto discard_and_relse;
1725 if (sk_filter(sk, skb, 0))
1726 goto discard_and_relse;
1732 if (!sock_owned_by_user(sk)) {
1733 if (!tcp_prequeue(sk, skb))
1734 ret = tcp_v4_do_rcv(sk, skb);
1736 sk_add_backlog(sk, skb);
1744 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1747 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1749 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1751 tcp_v4_send_reset(skb);
1755 /* Discard frame. */
1764 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1765 tcp_tw_put((struct tcp_tw_bucket *) sk);
1769 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1770 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1771 tcp_tw_put((struct tcp_tw_bucket *) sk);
1774 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1775 skb, th, skb->len)) {
1777 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1781 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1782 tcp_tw_put((struct tcp_tw_bucket *)sk);
1786 /* Fall through to ACK */
1789 tcp_v4_timewait_ack(sk, skb);
1793 case TCP_TW_SUCCESS:;
1798 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1800 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1801 struct inet_sock *inet = inet_sk(sk);
1803 sin->sin_family = AF_INET;
1804 sin->sin_addr.s_addr = inet->daddr;
1805 sin->sin_port = inet->dport;
1808 /* VJ's idea. Save the last timestamp seen from this destination
1809 * and hold it at least for the normal timewait interval, to use for duplicate
1810 * segment detection in subsequent connections before they enter synchronized
1814 int tcp_v4_remember_stamp(struct sock *sk)
1816 struct inet_sock *inet = inet_sk(sk);
1817 struct tcp_sock *tp = tcp_sk(sk);
1818 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1819 struct inet_peer *peer = NULL;
1822 if (!rt || rt->rt_dst != inet->daddr) {
1823 peer = inet_getpeer(inet->daddr, 1);
1827 rt_bind_peer(rt, 1);
1832 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1833 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1834 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1835 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1836 peer->tcp_ts = tp->rx_opt.ts_recent;
1846 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1848 struct inet_peer *peer = NULL;
1850 peer = inet_getpeer(tw->tw_daddr, 1);
1853 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1854 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1855 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1856 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1857 peer->tcp_ts = tw->tw_ts_recent;
1866 struct tcp_func ipv4_specific = {
1867 .queue_xmit = ip_queue_xmit,
1868 .send_check = tcp_v4_send_check,
1869 .rebuild_header = inet_sk_rebuild_header,
1870 .conn_request = tcp_v4_conn_request,
1871 .syn_recv_sock = tcp_v4_syn_recv_sock,
1872 .remember_stamp = tcp_v4_remember_stamp,
1873 .net_header_len = sizeof(struct iphdr),
1874 .setsockopt = ip_setsockopt,
1875 .getsockopt = ip_getsockopt,
1876 .addr2sockaddr = v4_addr2sockaddr,
1877 .sockaddr_len = sizeof(struct sockaddr_in),
1880 /* NOTE: A lot of things are set to zero explicitly by the call to
1881 * sk_alloc(), so they need not be done here.
1883 static int tcp_v4_init_sock(struct sock *sk)
1885 struct tcp_sock *tp = tcp_sk(sk);
1887 skb_queue_head_init(&tp->out_of_order_queue);
1888 tcp_init_xmit_timers(sk);
1889 tcp_prequeue_init(tp);
1891 tp->rto = TCP_TIMEOUT_INIT;
1892 tp->mdev = TCP_TIMEOUT_INIT;
1894 /* So many TCP implementations out there (incorrectly) count the
1895 * initial SYN frame in their delayed-ACK and congestion control
1896 * algorithms that we must have the following bandaid to talk
1897 * efficiently to them. -DaveM
1901 /* See draft-stevens-tcpca-spec-01 for discussion of the
1902 * initialization of these values.
1904 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1905 tp->snd_cwnd_clamp = ~0;
1906 tp->mss_cache = 536;
1908 tp->reordering = sysctl_tcp_reordering;
1909 tp->ca_ops = &tcp_init_congestion_ops;
1911 sk->sk_state = TCP_CLOSE;
1913 sk->sk_write_space = sk_stream_write_space;
1914 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1916 tp->af_specific = &ipv4_specific;
1918 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1919 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1921 atomic_inc(&tcp_sockets_allocated);
1926 int tcp_v4_destroy_sock(struct sock *sk)
1928 struct tcp_sock *tp = tcp_sk(sk);
1930 tcp_clear_xmit_timers(sk);
1932 tcp_cleanup_congestion_control(tp);
1934 /* Clean up the write buffer. */
1935 sk_stream_writequeue_purge(sk);
1937 /* Cleans up our, hopefully empty, out_of_order_queue. */
1938 __skb_queue_purge(&tp->out_of_order_queue);
1940 /* Clean the prequeue; it really must be empty */
1941 __skb_queue_purge(&tp->ucopy.prequeue);
1943 /* Clean up a referenced TCP bind bucket. */
1944 if (inet_sk(sk)->bind_hash)
1948 * If sendmsg cached page exists, toss it.
1950 if (sk->sk_sndmsg_page) {
1951 __free_page(sk->sk_sndmsg_page);
1952 sk->sk_sndmsg_page = NULL;
1955 atomic_dec(&tcp_sockets_allocated);
1960 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1962 #ifdef CONFIG_PROC_FS
1963 /* Proc filesystem TCP sock list dumping. */
1965 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1967 return hlist_empty(head) ? NULL :
1968 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1971 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1973 return tw->tw_node.next ?
1974 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1977 static void *listening_get_next(struct seq_file *seq, void *cur)
1979 struct tcp_sock *tp;
1980 struct hlist_node *node;
1981 struct sock *sk = cur;
1982 struct tcp_iter_state* st = seq->private;
1986 sk = sk_head(&tcp_listening_hash[0]);
1992 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1993 struct request_sock *req = cur;
1995 tp = tcp_sk(st->syn_wait_sk);
1999 if (req->rsk_ops->family == st->family) {
2005 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2008 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2010 sk = sk_next(st->syn_wait_sk);
2011 st->state = TCP_SEQ_STATE_LISTENING;
2012 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2015 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2016 if (reqsk_queue_len(&tp->accept_queue))
2018 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2022 sk_for_each_from(sk, node) {
2023 if (sk->sk_family == st->family) {
2028 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2029 if (reqsk_queue_len(&tp->accept_queue)) {
2031 st->uid = sock_i_uid(sk);
2032 st->syn_wait_sk = sk;
2033 st->state = TCP_SEQ_STATE_OPENREQ;
2037 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2039 if (++st->bucket < INET_LHTABLE_SIZE) {
2040 sk = sk_head(&tcp_listening_hash[st->bucket]);
2048 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2050 void *rc = listening_get_next(seq, NULL);
2052 while (rc && *pos) {
2053 rc = listening_get_next(seq, rc);
2059 static void *established_get_first(struct seq_file *seq)
2061 struct tcp_iter_state* st = seq->private;
2064 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2066 struct hlist_node *node;
2067 struct tcp_tw_bucket *tw;
2069 /* We can reschedule _before_ having picked the target: */
2070 cond_resched_softirq();
2072 read_lock(&tcp_ehash[st->bucket].lock);
2073 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2074 if (sk->sk_family != st->family) {
2080 st->state = TCP_SEQ_STATE_TIME_WAIT;
2081 tw_for_each(tw, node,
2082 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2083 if (tw->tw_family != st->family) {
2089 read_unlock(&tcp_ehash[st->bucket].lock);
2090 st->state = TCP_SEQ_STATE_ESTABLISHED;
2096 static void *established_get_next(struct seq_file *seq, void *cur)
2098 struct sock *sk = cur;
2099 struct tcp_tw_bucket *tw;
2100 struct hlist_node *node;
2101 struct tcp_iter_state* st = seq->private;
2105 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2109 while (tw && tw->tw_family != st->family) {
2116 read_unlock(&tcp_ehash[st->bucket].lock);
2117 st->state = TCP_SEQ_STATE_ESTABLISHED;
2119 /* We can reschedule between buckets: */
2120 cond_resched_softirq();
2122 if (++st->bucket < tcp_ehash_size) {
2123 read_lock(&tcp_ehash[st->bucket].lock);
2124 sk = sk_head(&tcp_ehash[st->bucket].chain);
2132 sk_for_each_from(sk, node) {
2133 if (sk->sk_family == st->family)
2137 st->state = TCP_SEQ_STATE_TIME_WAIT;
2138 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2146 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2148 void *rc = established_get_first(seq);
2151 rc = established_get_next(seq, rc);
2157 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2160 struct tcp_iter_state* st = seq->private;
2163 st->state = TCP_SEQ_STATE_LISTENING;
2164 rc = listening_get_idx(seq, &pos);
2167 tcp_listen_unlock();
2169 st->state = TCP_SEQ_STATE_ESTABLISHED;
2170 rc = established_get_idx(seq, pos);
2176 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2178 struct tcp_iter_state* st = seq->private;
2179 st->state = TCP_SEQ_STATE_LISTENING;
2181 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2184 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2187 struct tcp_iter_state* st;
2189 if (v == SEQ_START_TOKEN) {
2190 rc = tcp_get_idx(seq, 0);
2195 switch (st->state) {
2196 case TCP_SEQ_STATE_OPENREQ:
2197 case TCP_SEQ_STATE_LISTENING:
2198 rc = listening_get_next(seq, v);
2200 tcp_listen_unlock();
2202 st->state = TCP_SEQ_STATE_ESTABLISHED;
2203 rc = established_get_first(seq);
2206 case TCP_SEQ_STATE_ESTABLISHED:
2207 case TCP_SEQ_STATE_TIME_WAIT:
2208 rc = established_get_next(seq, v);
2216 static void tcp_seq_stop(struct seq_file *seq, void *v)
2218 struct tcp_iter_state* st = seq->private;
2220 switch (st->state) {
2221 case TCP_SEQ_STATE_OPENREQ:
2223 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2224 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2226 case TCP_SEQ_STATE_LISTENING:
2227 if (v != SEQ_START_TOKEN)
2228 tcp_listen_unlock();
2230 case TCP_SEQ_STATE_TIME_WAIT:
2231 case TCP_SEQ_STATE_ESTABLISHED:
2233 read_unlock(&tcp_ehash[st->bucket].lock);
2239 static int tcp_seq_open(struct inode *inode, struct file *file)
2241 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2242 struct seq_file *seq;
2243 struct tcp_iter_state *s;
2246 if (unlikely(afinfo == NULL))
2249 s = kmalloc(sizeof(*s), GFP_KERNEL);
2252 memset(s, 0, sizeof(*s));
2253 s->family = afinfo->family;
2254 s->seq_ops.start = tcp_seq_start;
2255 s->seq_ops.next = tcp_seq_next;
2256 s->seq_ops.show = afinfo->seq_show;
2257 s->seq_ops.stop = tcp_seq_stop;
2259 rc = seq_open(file, &s->seq_ops);
2262 seq = file->private_data;
2271 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2274 struct proc_dir_entry *p;
2278 afinfo->seq_fops->owner = afinfo->owner;
2279 afinfo->seq_fops->open = tcp_seq_open;
2280 afinfo->seq_fops->read = seq_read;
2281 afinfo->seq_fops->llseek = seq_lseek;
2282 afinfo->seq_fops->release = seq_release_private;
2284 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2292 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2296 proc_net_remove(afinfo->name);
2297 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2300 static void get_openreq4(struct sock *sk, struct request_sock *req,
2301 char *tmpbuf, int i, int uid)
2303 const struct inet_request_sock *ireq = inet_rsk(req);
2304 int ttd = req->expires - jiffies;
2306 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2307 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2310 ntohs(inet_sk(sk)->sport),
2312 ntohs(ireq->rmt_port),
2314 0, 0, /* could print option size, but that is af dependent. */
2315 1, /* timers active (only the expire timer) */
2316 jiffies_to_clock_t(ttd),
2319 0, /* non standard timer */
2320 0, /* open_requests have no inode */
2321 atomic_read(&sk->sk_refcnt),
2325 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2328 unsigned long timer_expires;
2329 struct tcp_sock *tp = tcp_sk(sp);
2330 struct inet_sock *inet = inet_sk(sp);
2331 unsigned int dest = inet->daddr;
2332 unsigned int src = inet->rcv_saddr;
2333 __u16 destp = ntohs(inet->dport);
2334 __u16 srcp = ntohs(inet->sport);
2336 if (tp->pending == TCP_TIME_RETRANS) {
2338 timer_expires = tp->timeout;
2339 } else if (tp->pending == TCP_TIME_PROBE0) {
2341 timer_expires = tp->timeout;
2342 } else if (timer_pending(&sp->sk_timer)) {
2344 timer_expires = sp->sk_timer.expires;
2347 timer_expires = jiffies;
2350 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2351 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2352 i, src, srcp, dest, destp, sp->sk_state,
2353 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2355 jiffies_to_clock_t(timer_expires - jiffies),
2360 atomic_read(&sp->sk_refcnt), sp,
2361 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2363 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2366 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2368 unsigned int dest, src;
2370 int ttd = tw->tw_ttd - jiffies;
2375 dest = tw->tw_daddr;
2376 src = tw->tw_rcv_saddr;
2377 destp = ntohs(tw->tw_dport);
2378 srcp = ntohs(tw->tw_sport);
2380 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2381 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2382 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2383 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2384 atomic_read(&tw->tw_refcnt), tw);
2389 static int tcp4_seq_show(struct seq_file *seq, void *v)
2391 struct tcp_iter_state* st;
2392 char tmpbuf[TMPSZ + 1];
2394 if (v == SEQ_START_TOKEN) {
2395 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2396 " sl local_address rem_address st tx_queue "
2397 "rx_queue tr tm->when retrnsmt uid timeout "
2403 switch (st->state) {
2404 case TCP_SEQ_STATE_LISTENING:
2405 case TCP_SEQ_STATE_ESTABLISHED:
2406 get_tcp4_sock(v, tmpbuf, st->num);
2408 case TCP_SEQ_STATE_OPENREQ:
2409 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2411 case TCP_SEQ_STATE_TIME_WAIT:
2412 get_timewait4_sock(v, tmpbuf, st->num);
2415 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
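/* Editor's note, an illustrative /proc/net/tcp line as produced above (the
 * values are made up; addresses and ports are hex, the address printed as a
 * raw 32-bit value, so on a little-endian box 127.0.0.1:22 in LISTEN (0x0A)
 * appears roughly as):
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000 ...
 */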
2420 static struct file_operations tcp4_seq_fops;
2421 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2422 .owner = THIS_MODULE,
2425 .seq_show = tcp4_seq_show,
2426 .seq_fops = &tcp4_seq_fops,
2429 int __init tcp4_proc_init(void)
2431 return tcp_proc_register(&tcp4_seq_afinfo);
2434 void tcp4_proc_exit(void)
2436 tcp_proc_unregister(&tcp4_seq_afinfo);
2438 #endif /* CONFIG_PROC_FS */
2440 struct proto tcp_prot = {
2442 .owner = THIS_MODULE,
2444 .connect = tcp_v4_connect,
2445 .disconnect = tcp_disconnect,
2446 .accept = tcp_accept,
2448 .init = tcp_v4_init_sock,
2449 .destroy = tcp_v4_destroy_sock,
2450 .shutdown = tcp_shutdown,
2451 .setsockopt = tcp_setsockopt,
2452 .getsockopt = tcp_getsockopt,
2453 .sendmsg = tcp_sendmsg,
2454 .recvmsg = tcp_recvmsg,
2455 .backlog_rcv = tcp_v4_do_rcv,
2456 .hash = tcp_v4_hash,
2457 .unhash = tcp_unhash,
2458 .get_port = tcp_v4_get_port,
2459 .enter_memory_pressure = tcp_enter_memory_pressure,
2460 .sockets_allocated = &tcp_sockets_allocated,
2461 .memory_allocated = &tcp_memory_allocated,
2462 .memory_pressure = &tcp_memory_pressure,
2463 .sysctl_mem = sysctl_tcp_mem,
2464 .sysctl_wmem = sysctl_tcp_wmem,
2465 .sysctl_rmem = sysctl_tcp_rmem,
2466 .max_header = MAX_TCP_HEADER,
2467 .obj_size = sizeof(struct tcp_sock),
2468 .rsk_prot = &tcp_request_sock_ops,
2473 void __init tcp_v4_init(struct net_proto_family *ops)
2475 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2477 panic("Failed to create the TCP control socket.\n");
2478 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2479 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2481 /* Unhash it so that IP input processing does not even
2482 * see it; we do not wish this socket to see incoming
2485 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2488 EXPORT_SYMBOL(ipv4_specific);
2489 EXPORT_SYMBOL(tcp_bind_hash);
2490 EXPORT_SYMBOL(inet_bind_bucket_create);
2491 EXPORT_SYMBOL(tcp_hashinfo);
2492 EXPORT_SYMBOL(tcp_inherit_port);
2493 EXPORT_SYMBOL(tcp_listen_wlock);
2494 EXPORT_SYMBOL(tcp_port_rover);
2495 EXPORT_SYMBOL(tcp_prot);
2496 EXPORT_SYMBOL(tcp_put_port);
2497 EXPORT_SYMBOL(tcp_unhash);
2498 EXPORT_SYMBOL(tcp_v4_conn_request);
2499 EXPORT_SYMBOL(tcp_v4_connect);
2500 EXPORT_SYMBOL(tcp_v4_do_rcv);
2501 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2502 EXPORT_SYMBOL(tcp_v4_send_check);
2503 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2505 #ifdef CONFIG_PROC_FS
2506 EXPORT_SYMBOL(tcp_proc_register);
2507 EXPORT_SYMBOL(tcp_proc_unregister);
2509 EXPORT_SYMBOL(sysctl_local_port_range);
2510 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2511 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);