net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
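/* Derive the initial sequence number for this connection from the address and
 * port 4-tuple of the incoming skb, via the keyed hash in secure_tcp_seq().
 */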
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
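/* Decide whether a TIME-WAIT socket occupying the same 4-tuple may be reused
 * for a new outgoing connection.  Controlled by the tcp_tw_reuse sysctl: a
 * value of 2 restricts reuse to loopback traffic (checked below).  Reuse also
 * requires that the old connection recorded a timestamp and, unless the caller
 * passed twp == NULL, that enough time has passed since the last timestamp
 * (see the condition below).
 */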
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112         const struct inet_timewait_sock *tw = inet_twsk(sktw);
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
145
146            Actually, the idea is close to VJ's: only the timestamp cache is
147            held not per host but per port pair, and the TW bucket is used as
148            the state holder.
149
150            If the TW bucket has already been destroyed we fall back to VJ's
151            scheme and use the initial timestamp retrieved from the peer table.
152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
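                        /* Start the new connection's sequence space past the
                         * old one: tw_snd_nxt plus one maximal (unscaled)
                         * window, the idea being that stale segments from the
                         * previous incarnation cannot be confused with the new
                         * one.  0 is avoided because write_seq == 0 means
                         * "not set".
                         */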
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and intended to
188          * prevent the BPF program called below from accessing bytes that are
189          * out of the bounds specified by the user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203         struct inet_timewait_death_row *tcp_death_row;
204         struct inet_sock *inet = inet_sk(sk);
205         struct tcp_sock *tp = tcp_sk(sk);
206         struct ip_options_rcu *inet_opt;
207         struct net *net = sock_net(sk);
208         __be16 orig_sport, orig_dport;
209         __be32 daddr, nexthop;
210         struct flowi4 *fl4;
211         struct rtable *rt;
212         int err;
213
214         if (addr_len < sizeof(struct sockaddr_in))
215                 return -EINVAL;
216
217         if (usin->sin_family != AF_INET)
218                 return -EAFNOSUPPORT;
219
220         nexthop = daddr = usin->sin_addr.s_addr;
221         inet_opt = rcu_dereference_protected(inet->inet_opt,
222                                              lockdep_sock_is_held(sk));
223         if (inet_opt && inet_opt->opt.srr) {
224                 if (!daddr)
225                         return -EINVAL;
226                 nexthop = inet_opt->opt.faddr;
227         }
228
229         orig_sport = inet->inet_sport;
230         orig_dport = usin->sin_port;
231         fl4 = &inet->cork.fl.u.ip4;
232         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
234                               orig_dport, sk);
235         if (IS_ERR(rt)) {
236                 err = PTR_ERR(rt);
237                 if (err == -ENETUNREACH)
238                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
239                 return err;
240         }
241
242         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243                 ip_rt_put(rt);
244                 return -ENETUNREACH;
245         }
246
247         if (!inet_opt || !inet_opt->opt.srr)
248                 daddr = fl4->daddr;
249
250         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
251
252         if (!inet->inet_saddr) {
253                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
254                 if (err) {
255                         ip_rt_put(rt);
256                         return err;
257                 }
258         } else {
259                 sk_rcv_saddr_set(sk, inet->inet_saddr);
260         }
261
262         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263                 /* Reset inherited state */
264                 tp->rx_opt.ts_recent       = 0;
265                 tp->rx_opt.ts_recent_stamp = 0;
266                 if (likely(!tp->repair))
267                         WRITE_ONCE(tp->write_seq, 0);
268         }
269
270         inet->inet_dport = usin->sin_port;
271         sk_daddr_set(sk, daddr);
272
273         inet_csk(sk)->icsk_ext_hdr_len = 0;
274         if (inet_opt)
275                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
276
277         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
278
279         /* Socket identity is still unknown (sport may be zero).
280          * However, we set the state to SYN-SENT and, without releasing the
281          * socket lock, select a source port, enter ourselves into the hash
282          * tables and complete initialization after this.
283          */
284         tcp_set_state(sk, TCP_SYN_SENT);
285         err = inet_hash_connect(tcp_death_row, sk);
286         if (err)
287                 goto failure;
288
289         sk_set_txhash(sk);
290
291         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292                                inet->inet_sport, inet->inet_dport, sk);
293         if (IS_ERR(rt)) {
294                 err = PTR_ERR(rt);
295                 rt = NULL;
296                 goto failure;
297         }
298         /* OK, now commit destination to socket.  */
299         sk->sk_gso_type = SKB_GSO_TCPV4;
300         sk_setup_caps(sk, &rt->dst);
301         rt = NULL;
302
303         if (likely(!tp->repair)) {
304                 if (!tp->write_seq)
305                         WRITE_ONCE(tp->write_seq,
306                                    secure_tcp_seq(inet->inet_saddr,
307                                                   inet->inet_daddr,
308                                                   inet->inet_sport,
309                                                   usin->sin_port));
310                 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
311                                                  inet->inet_daddr);
312         }
313
314         inet->inet_id = get_random_u16();
315
316         if (tcp_fastopen_defer_connect(sk, &err))
317                 return err;
318         if (err)
319                 goto failure;
320
321         err = tcp_connect(sk);
322
323         if (err)
324                 goto failure;
325
326         return 0;
327
328 failure:
329         /*
330          * This unhashes the socket and releases the local port,
331          * if necessary.
332          */
333         tcp_set_state(sk, TCP_CLOSE);
334         inet_bhash2_reset_saddr(sk);
335         ip_rt_put(rt);
336         sk->sk_route_caps = 0;
337         inet->inet_dport = 0;
338         return err;
339 }
340 EXPORT_SYMBOL(tcp_v4_connect);
341
342 /*
343  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
344  * It can be called through tcp_release_cb() if the socket was owned by the
345  * user at the time tcp_v4_err() was called to handle the ICMP message.
346  */
347 void tcp_v4_mtu_reduced(struct sock *sk)
348 {
349         struct inet_sock *inet = inet_sk(sk);
350         struct dst_entry *dst;
351         u32 mtu;
352
353         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
354                 return;
355         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
356         dst = inet_csk_update_pmtu(sk, mtu);
357         if (!dst)
358                 return;
359
360         /* Something is about to go wrong... Remember the soft error
361          * in case this connection is not able to recover.
362          */
363         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
364                 sk->sk_err_soft = EMSGSIZE;
365
366         mtu = dst_mtu(dst);
367
368         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
369             ip_sk_accept_pmtu(sk) &&
370             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
371                 tcp_sync_mss(sk, mtu);
372
373                 /* Resend the TCP packet because it's
374                  * clear that the old packet has been
375                  * dropped. This is the new "fast" path mtu
376                  * discovery.
377                  */
378                 tcp_simple_retransmit(sk);
379         } /* else let the usual retransmit timer handle it */
380 }
381 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
382
383 static void do_redirect(struct sk_buff *skb, struct sock *sk)
384 {
385         struct dst_entry *dst = __sk_dst_check(sk, 0);
386
387         if (dst)
388                 dst->ops->redirect(dst, sk, skb);
389 }
390
391
392 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
393 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
394 {
395         struct request_sock *req = inet_reqsk(sk);
396         struct net *net = sock_net(sk);
397
398         /* ICMPs are not backlogged, hence we cannot get
399          * an established socket here.
400          */
401         if (seq != tcp_rsk(req)->snt_isn) {
402                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
403         } else if (abort) {
404                 /*
405                  * Still in SYN_RECV, just remove it silently.
406                  * There is no good way to pass the error to the newly
407                  * created socket, and POSIX does not want network
408                  * errors returned from accept().
409                  */
410                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
411                 tcp_listendrop(req->rsk_listener);
412         }
413         reqsk_put(req);
414 }
415 EXPORT_SYMBOL(tcp_req_err);
416
417 /* TCP-LD (RFC 6069) logic */
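/* Undo one step of exponential RTO backoff when an ICMP unreachable arrives
 * for the earliest unacknowledged segment: decrement icsk_backoff, recompute
 * the RTO from srtt, and either re-arm the retransmission timer with the time
 * remaining or retransmit immediately if it has already expired.
 */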
418 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
419 {
420         struct inet_connection_sock *icsk = inet_csk(sk);
421         struct tcp_sock *tp = tcp_sk(sk);
422         struct sk_buff *skb;
423         s32 remaining;
424         u32 delta_us;
425
426         if (sock_owned_by_user(sk))
427                 return;
428
429         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
430             !icsk->icsk_backoff)
431                 return;
432
433         skb = tcp_rtx_queue_head(sk);
434         if (WARN_ON_ONCE(!skb))
435                 return;
436
437         icsk->icsk_backoff--;
438         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
439         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
440
441         tcp_mstamp_refresh(tp);
442         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
443         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
444
445         if (remaining > 0) {
446                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
447                                           remaining, TCP_RTO_MAX);
448         } else {
449                 /* RTO revert clocked out retransmission.
450                  * Will retransmit now.
451                  */
452                 tcp_retransmit_timer(sk);
453         }
454 }
455 EXPORT_SYMBOL(tcp_ld_RTO_revert);
456
457 /*
458  * This routine is called by the ICMP module when it gets some
459  * sort of error condition.  If err < 0 then the socket should
460  * be closed and the error returned to the user.  If err > 0
461  * it's just the icmp type << 8 | icmp code.  After adjustment
462  * header points to the first 8 bytes of the tcp header.  We need
463  * to find the appropriate port.
464  *
465  * The locking strategy used here is very "optimistic". When
466  * someone else accesses the socket the ICMP is just dropped
467  * and for some paths there is no check at all.
468  * A more general error queue to queue errors for later handling
469  * is probably better.
470  *
471  */
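/* Example of the convention above: a Destination Unreachable (type 3) with
 * code Port Unreachable (3) gives err == (3 << 8) | 3 == 0x0303; the code part
 * is then mapped to an errno via icmp_err_convert[] below.
 */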
472
473 int tcp_v4_err(struct sk_buff *skb, u32 info)
474 {
475         const struct iphdr *iph = (const struct iphdr *)skb->data;
476         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
477         struct tcp_sock *tp;
478         struct inet_sock *inet;
479         const int type = icmp_hdr(skb)->type;
480         const int code = icmp_hdr(skb)->code;
481         struct sock *sk;
482         struct request_sock *fastopen;
483         u32 seq, snd_una;
484         int err;
485         struct net *net = dev_net(skb->dev);
486
487         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
488                                        iph->daddr, th->dest, iph->saddr,
489                                        ntohs(th->source), inet_iif(skb), 0);
490         if (!sk) {
491                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
492                 return -ENOENT;
493         }
494         if (sk->sk_state == TCP_TIME_WAIT) {
495                 inet_twsk_put(inet_twsk(sk));
496                 return 0;
497         }
498         seq = ntohl(th->seq);
499         if (sk->sk_state == TCP_NEW_SYN_RECV) {
500                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
501                                      type == ICMP_TIME_EXCEEDED ||
502                                      (type == ICMP_DEST_UNREACH &&
503                                       (code == ICMP_NET_UNREACH ||
504                                        code == ICMP_HOST_UNREACH)));
505                 return 0;
506         }
507
508         bh_lock_sock(sk);
509         /* If too many ICMPs get dropped on busy
510          * servers this needs to be solved differently.
511          * We do take care of the PMTU discovery (RFC1191) special case:
512          * we can receive locally generated ICMP messages while the socket is held.
513          */
514         if (sock_owned_by_user(sk)) {
515                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
516                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
517         }
518         if (sk->sk_state == TCP_CLOSE)
519                 goto out;
520
521         if (static_branch_unlikely(&ip4_min_ttl)) {
522                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
523                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
524                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
525                         goto out;
526                 }
527         }
528
529         tp = tcp_sk(sk);
530         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
531         fastopen = rcu_dereference(tp->fastopen_rsk);
532         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
533         if (sk->sk_state != TCP_LISTEN &&
534             !between(seq, snd_una, tp->snd_nxt)) {
535                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
536                 goto out;
537         }
538
539         switch (type) {
540         case ICMP_REDIRECT:
541                 if (!sock_owned_by_user(sk))
542                         do_redirect(skb, sk);
543                 goto out;
544         case ICMP_SOURCE_QUENCH:
545                 /* Just silently ignore these. */
546                 goto out;
547         case ICMP_PARAMETERPROB:
548                 err = EPROTO;
549                 break;
550         case ICMP_DEST_UNREACH:
551                 if (code > NR_ICMP_UNREACH)
552                         goto out;
553
554                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
555                         /* We are not interested in TCP_LISTEN and open_requests
556                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
557                          * they should go through unfragmented).
558                          */
559                         if (sk->sk_state == TCP_LISTEN)
560                                 goto out;
561
562                         WRITE_ONCE(tp->mtu_info, info);
563                         if (!sock_owned_by_user(sk)) {
564                                 tcp_v4_mtu_reduced(sk);
565                         } else {
566                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
567                                         sock_hold(sk);
568                         }
569                         goto out;
570                 }
571
572                 err = icmp_err_convert[code].errno;
573                 /* Check if this ICMP message allows reverting the backoff
574                  * (see RFC 6069).
575                  */
576                 if (!fastopen &&
577                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
578                         tcp_ld_RTO_revert(sk, seq);
579                 break;
580         case ICMP_TIME_EXCEEDED:
581                 err = EHOSTUNREACH;
582                 break;
583         default:
584                 goto out;
585         }
586
587         switch (sk->sk_state) {
588         case TCP_SYN_SENT:
589         case TCP_SYN_RECV:
590                 /* Only in fast or simultaneous open. If a fast open socket is
591                  * already accepted it is treated as a connected one below.
592                  */
593                 if (fastopen && !fastopen->sk)
594                         break;
595
596                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
597
598                 if (!sock_owned_by_user(sk)) {
599                         sk->sk_err = err;
600
601                         sk_error_report(sk);
602
603                         tcp_done(sk);
604                 } else {
605                         sk->sk_err_soft = err;
606                 }
607                 goto out;
608         }
609
610         /* If we've already connected we will keep trying
611          * until we time out, or the user gives up.
612          *
613          * RFC 1122 4.2.3.9 allows us to treat as hard errors
614          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
615          * but it is obsoleted by PMTU discovery).
616          *
617          * Note that on the modern internet, where routing is unreliable
618          * and broken firewalls sit in every dark corner sending random
619          * errors ordered by their masters, even these two messages finally lose
620          * their original sense (even Linux sends invalid PORT_UNREACHs).
621          *
622          * Now we are in compliance with the RFCs.
623          *                                                      --ANK (980905)
624          */
625
626         inet = inet_sk(sk);
627         if (!sock_owned_by_user(sk) && inet->recverr) {
628                 sk->sk_err = err;
629                 sk_error_report(sk);
630         } else  { /* Only an error on timeout */
631                 sk->sk_err_soft = err;
632         }
633
634 out:
635         bh_unlock_sock(sk);
636         sock_put(sk);
637         return 0;
638 }
639
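/* Prepare an offloaded TCP checksum: seed th->check with the pseudo-header
 * checksum and point csum_start/csum_offset at the checksum field so that the
 * device (or the software fallback) can finish the fold over the payload.
 */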
640 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
641 {
642         struct tcphdr *th = tcp_hdr(skb);
643
644         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
645         skb->csum_start = skb_transport_header(skb) - skb->head;
646         skb->csum_offset = offsetof(struct tcphdr, check);
647 }
648
649 /* This routine computes an IPv4 TCP checksum. */
650 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
651 {
652         const struct inet_sock *inet = inet_sk(sk);
653
654         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
655 }
656 EXPORT_SYMBOL(tcp_v4_send_check);
657
658 /*
659  *      This routine will send an RST to the other tcp.
660  *
661  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
662  *                    for the reset?
663  *      Answer: if a packet caused an RST, it is not for a socket
664  *              existing in our system; if it is matched to a socket,
665  *              it is just a duplicate segment or a bug in the other side's TCP.
666  *              So we build the reply based only on parameters that
667  *              arrived with the segment.
668  *      Exception: precedence violation. We do not implement it in any case.
669  */
670
671 #ifdef CONFIG_TCP_MD5SIG
672 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
673 #else
674 #define OPTION_BYTES sizeof(__be32)
675 #endif
676
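/* Build and send the RST itself.  The reply mirrors the incoming ports, and
 * the sequence/ack fields follow the RFC 793 reset-generation rules: if the
 * offending segment carried an ACK, the RST reuses its ack_seq as the sequence
 * number; otherwise the RST acknowledges everything the segment occupied (SYN,
 * FIN and data).  An MD5 option is appended when a matching key is found
 * (under CONFIG_TCP_MD5SIG).
 */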
677 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
678 {
679         const struct tcphdr *th = tcp_hdr(skb);
680         struct {
681                 struct tcphdr th;
682                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
683         } rep;
684         struct ip_reply_arg arg;
685 #ifdef CONFIG_TCP_MD5SIG
686         struct tcp_md5sig_key *key = NULL;
687         const __u8 *hash_location = NULL;
688         unsigned char newhash[16];
689         int genhash;
690         struct sock *sk1 = NULL;
691 #endif
692         u64 transmit_time = 0;
693         struct sock *ctl_sk;
694         struct net *net;
695
696         /* Never send a reset in response to a reset. */
697         if (th->rst)
698                 return;
699
700         /* If sk is not NULL, it means we did a successful lookup and the
701          * incoming route had to be correct. prequeue might have dropped our dst.
702          */
703         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
704                 return;
705
706         /* Swap the send and the receive. */
707         memset(&rep, 0, sizeof(rep));
708         rep.th.dest   = th->source;
709         rep.th.source = th->dest;
710         rep.th.doff   = sizeof(struct tcphdr) / 4;
711         rep.th.rst    = 1;
712
713         if (th->ack) {
714                 rep.th.seq = th->ack_seq;
715         } else {
716                 rep.th.ack = 1;
717                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
718                                        skb->len - (th->doff << 2));
719         }
720
721         memset(&arg, 0, sizeof(arg));
722         arg.iov[0].iov_base = (unsigned char *)&rep;
723         arg.iov[0].iov_len  = sizeof(rep.th);
724
725         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
726 #ifdef CONFIG_TCP_MD5SIG
727         rcu_read_lock();
728         hash_location = tcp_parse_md5sig_option(th);
729         if (sk && sk_fullsock(sk)) {
730                 const union tcp_md5_addr *addr;
731                 int l3index;
732
733                 /* sdif set means the packet ingressed via a device
734                  * in an L3 domain and inet_iif is set to it.
735                  */
736                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
737                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
738                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
739         } else if (hash_location) {
740                 const union tcp_md5_addr *addr;
741                 int sdif = tcp_v4_sdif(skb);
742                 int dif = inet_iif(skb);
743                 int l3index;
744
745                 /*
746                  * The active side is lost. Try to find the listening socket
747                  * through the source port, then find the md5 key through it.
748                  * We do not lose security here:
749                  * the incoming packet is checked against the md5 hash of the
750                  * found key, and no RST is generated if the hash doesn't match.
751                  */
752                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
753                                              NULL, 0, ip_hdr(skb)->saddr,
754                                              th->source, ip_hdr(skb)->daddr,
755                                              ntohs(th->source), dif, sdif);
756                 /* don't send rst if it can't find key */
757                 if (!sk1)
758                         goto out;
759
760                 /* sdif set means the packet ingressed via a device
761                  * in an L3 domain and dif is set to it.
762                  */
763                 l3index = sdif ? dif : 0;
764                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
765                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
766                 if (!key)
767                         goto out;
768
769
770                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
771                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
772                         goto out;
773
774         }
775
776         if (key) {
777                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
778                                    (TCPOPT_NOP << 16) |
779                                    (TCPOPT_MD5SIG << 8) |
780                                    TCPOLEN_MD5SIG);
781                 /* Update length and the length the header thinks exists */
782                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
783                 rep.th.doff = arg.iov[0].iov_len / 4;
784
785                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
786                                      key, ip_hdr(skb)->saddr,
787                                      ip_hdr(skb)->daddr, &rep.th);
788         }
789 #endif
790         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
791         if (rep.opt[0] == 0) {
792                 __be32 mrst = mptcp_reset_option(skb);
793
794                 if (mrst) {
795                         rep.opt[0] = mrst;
796                         arg.iov[0].iov_len += sizeof(mrst);
797                         rep.th.doff = arg.iov[0].iov_len / 4;
798                 }
799         }
800
801         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
802                                       ip_hdr(skb)->saddr, /* XXX */
803                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
804         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
805         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
806
807         /* When the socket is gone, all binding information is lost and
808          * routing might fail. No choice here: if we choose to force the
809          * input interface, we will misroute in case of an asymmetric route.
810          */
811         if (sk) {
812                 arg.bound_dev_if = sk->sk_bound_dev_if;
813                 if (sk_fullsock(sk))
814                         trace_tcp_send_reset(sk, skb);
815         }
816
817         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
818                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
819
820         arg.tos = ip_hdr(skb)->tos;
821         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
822         local_bh_disable();
823         ctl_sk = this_cpu_read(ipv4_tcp_sk);
824         sock_net_set(ctl_sk, net);
825         if (sk) {
826                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
827                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
828                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
829                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
830                 transmit_time = tcp_transmit_time(sk);
831                 xfrm_sk_clone_policy(ctl_sk, sk);
832         }
833         ip_send_unicast_reply(ctl_sk,
834                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
835                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
836                               &arg, arg.iov[0].iov_len,
837                               transmit_time);
838
839         ctl_sk->sk_mark = 0;
840         xfrm_sk_free_policy(ctl_sk);
841         sock_net_set(ctl_sk, &init_net);
842         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
843         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
844         local_bh_enable();
845
846 #ifdef CONFIG_TCP_MD5SIG
847 out:
848         rcu_read_unlock();
849 #endif
850 }
851
852 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
853    outside socket context, is certainly ugly. What can I do?
854  */
855
856 static void tcp_v4_send_ack(const struct sock *sk,
857                             struct sk_buff *skb, u32 seq, u32 ack,
858                             u32 win, u32 tsval, u32 tsecr, int oif,
859                             struct tcp_md5sig_key *key,
860                             int reply_flags, u8 tos)
861 {
862         const struct tcphdr *th = tcp_hdr(skb);
863         struct {
864                 struct tcphdr th;
865                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
866 #ifdef CONFIG_TCP_MD5SIG
867                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
868 #endif
869                         ];
870         } rep;
871         struct net *net = sock_net(sk);
872         struct ip_reply_arg arg;
873         struct sock *ctl_sk;
874         u64 transmit_time;
875
876         memset(&rep.th, 0, sizeof(struct tcphdr));
877         memset(&arg, 0, sizeof(arg));
878
879         arg.iov[0].iov_base = (unsigned char *)&rep;
880         arg.iov[0].iov_len  = sizeof(rep.th);
881         if (tsecr) {
882                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
883                                    (TCPOPT_TIMESTAMP << 8) |
884                                    TCPOLEN_TIMESTAMP);
885                 rep.opt[1] = htonl(tsval);
886                 rep.opt[2] = htonl(tsecr);
887                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
888         }
889
890         /* Swap the send and the receive. */
891         rep.th.dest    = th->source;
892         rep.th.source  = th->dest;
893         rep.th.doff    = arg.iov[0].iov_len / 4;
894         rep.th.seq     = htonl(seq);
895         rep.th.ack_seq = htonl(ack);
896         rep.th.ack     = 1;
897         rep.th.window  = htons(win);
898
899 #ifdef CONFIG_TCP_MD5SIG
900         if (key) {
901                 int offset = (tsecr) ? 3 : 0;
902
903                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
904                                           (TCPOPT_NOP << 16) |
905                                           (TCPOPT_MD5SIG << 8) |
906                                           TCPOLEN_MD5SIG);
907                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
908                 rep.th.doff = arg.iov[0].iov_len/4;
909
910                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
911                                     key, ip_hdr(skb)->saddr,
912                                     ip_hdr(skb)->daddr, &rep.th);
913         }
914 #endif
915         arg.flags = reply_flags;
916         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
917                                       ip_hdr(skb)->saddr, /* XXX */
918                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
919         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
920         if (oif)
921                 arg.bound_dev_if = oif;
922         arg.tos = tos;
923         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
924         local_bh_disable();
925         ctl_sk = this_cpu_read(ipv4_tcp_sk);
926         sock_net_set(ctl_sk, net);
927         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
928                            inet_twsk(sk)->tw_mark : sk->sk_mark;
929         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
930                            inet_twsk(sk)->tw_priority : sk->sk_priority;
931         transmit_time = tcp_transmit_time(sk);
932         ip_send_unicast_reply(ctl_sk,
933                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
934                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
935                               &arg, arg.iov[0].iov_len,
936                               transmit_time);
937
938         ctl_sk->sk_mark = 0;
939         sock_net_set(ctl_sk, &init_net);
940         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
941         local_bh_enable();
942 }
943
944 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
945 {
946         struct inet_timewait_sock *tw = inet_twsk(sk);
947         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
948
949         tcp_v4_send_ack(sk, skb,
950                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
951                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
952                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
953                         tcptw->tw_ts_recent,
954                         tw->tw_bound_dev_if,
955                         tcp_twsk_md5_key(tcptw),
956                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
957                         tw->tw_tos
958                         );
959
960         inet_twsk_put(tw);
961 }
962
963 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
964                                   struct request_sock *req)
965 {
966         const union tcp_md5_addr *addr;
967         int l3index;
968
969         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
970          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
971          */
972         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
973                                              tcp_sk(sk)->snd_nxt;
974
975         /* RFC 7323 2.3
976          * The window field (SEG.WND) of every outgoing segment, with the
977          * exception of <SYN> segments, MUST be right-shifted by
978          * Rcv.Wind.Shift bits:
979          */
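        /* For example, with Rcv.Wind.Shift == 7 a 128000-byte receive window
         * is advertised as 128000 >> 7 == 1000.
         */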
980         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
981         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
982         tcp_v4_send_ack(sk, skb, seq,
983                         tcp_rsk(req)->rcv_nxt,
984                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
985                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
986                         req->ts_recent,
987                         0,
988                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
989                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
990                         ip_hdr(skb)->tos);
991 }
992
993 /*
994  *      Send a SYN-ACK after having received a SYN.
995  *      This still operates on a request_sock only, not on a big
996  *      socket.
997  */
998 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
999                               struct flowi *fl,
1000                               struct request_sock *req,
1001                               struct tcp_fastopen_cookie *foc,
1002                               enum tcp_synack_type synack_type,
1003                               struct sk_buff *syn_skb)
1004 {
1005         const struct inet_request_sock *ireq = inet_rsk(req);
1006         struct flowi4 fl4;
1007         int err = -1;
1008         struct sk_buff *skb;
1009         u8 tos;
1010
1011         /* First, grab a route. */
1012         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1013                 return -1;
1014
1015         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1016
1017         if (skb) {
1018                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1019
1020                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1021                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1022                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1023                                 inet_sk(sk)->tos;
1024
1025                 if (!INET_ECN_is_capable(tos) &&
1026                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1027                         tos |= INET_ECN_ECT_0;
1028
1029                 rcu_read_lock();
1030                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1031                                             ireq->ir_rmt_addr,
1032                                             rcu_dereference(ireq->ireq_opt),
1033                                             tos);
1034                 rcu_read_unlock();
1035                 err = net_xmit_eval(err);
1036         }
1037
1038         return err;
1039 }
1040
1041 /*
1042  *      IPv4 request_sock destructor.
1043  */
1044 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1045 {
1046         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1047 }
1048
1049 #ifdef CONFIG_TCP_MD5SIG
1050 /*
1051  * RFC2385 MD5 checksumming requires a mapping of
1052  * IP address->MD5 Key.
1053  * We need to maintain these in the sk structure.
1054  */
1055
1056 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1057 EXPORT_SYMBOL(tcp_md5_needed);
1058
1059 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1060 {
1061         if (!old)
1062                 return true;
1063
1064         /* l3index always overrides non-l3index */
1065         if (old->l3index && new->l3index == 0)
1066                 return false;
1067         if (old->l3index == 0 && new->l3index)
1068                 return true;
1069
1070         return old->prefixlen < new->prefixlen;
1071 }
1072
1073 /* Find the Key structure for an address.  */
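/* When several keys cover the same address, the most specific one wins:
 * keys bound to an L3 device (non-zero l3index) beat unbound keys, and a
 * longer prefix beats a shorter one (see better_md5_match() above).
 */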
1074 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1075                                            const union tcp_md5_addr *addr,
1076                                            int family)
1077 {
1078         const struct tcp_sock *tp = tcp_sk(sk);
1079         struct tcp_md5sig_key *key;
1080         const struct tcp_md5sig_info *md5sig;
1081         __be32 mask;
1082         struct tcp_md5sig_key *best_match = NULL;
1083         bool match;
1084
1085         /* caller either holds rcu_read_lock() or socket lock */
1086         md5sig = rcu_dereference_check(tp->md5sig_info,
1087                                        lockdep_sock_is_held(sk));
1088         if (!md5sig)
1089                 return NULL;
1090
1091         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1092                                  lockdep_sock_is_held(sk)) {
1093                 if (key->family != family)
1094                         continue;
1095                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1096                         continue;
1097                 if (family == AF_INET) {
1098                         mask = inet_make_mask(key->prefixlen);
1099                         match = (key->addr.a4.s_addr & mask) ==
1100                                 (addr->a4.s_addr & mask);
1101 #if IS_ENABLED(CONFIG_IPV6)
1102                 } else if (family == AF_INET6) {
1103                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1104                                                   key->prefixlen);
1105 #endif
1106                 } else {
1107                         match = false;
1108                 }
1109
1110                 if (match && better_md5_match(best_match, key))
1111                         best_match = key;
1112         }
1113         return best_match;
1114 }
1115 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1116
1117 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1118                                                       const union tcp_md5_addr *addr,
1119                                                       int family, u8 prefixlen,
1120                                                       int l3index, u8 flags)
1121 {
1122         const struct tcp_sock *tp = tcp_sk(sk);
1123         struct tcp_md5sig_key *key;
1124         unsigned int size = sizeof(struct in_addr);
1125         const struct tcp_md5sig_info *md5sig;
1126
1127         /* caller either holds rcu_read_lock() or socket lock */
1128         md5sig = rcu_dereference_check(tp->md5sig_info,
1129                                        lockdep_sock_is_held(sk));
1130         if (!md5sig)
1131                 return NULL;
1132 #if IS_ENABLED(CONFIG_IPV6)
1133         if (family == AF_INET6)
1134                 size = sizeof(struct in6_addr);
1135 #endif
1136         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1137                                  lockdep_sock_is_held(sk)) {
1138                 if (key->family != family)
1139                         continue;
1140                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1141                         continue;
1142                 if (key->l3index != l3index)
1143                         continue;
1144                 if (!memcmp(&key->addr, addr, size) &&
1145                     key->prefixlen == prefixlen)
1146                         return key;
1147         }
1148         return NULL;
1149 }
1150
1151 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1152                                          const struct sock *addr_sk)
1153 {
1154         const union tcp_md5_addr *addr;
1155         int l3index;
1156
1157         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1158                                                  addr_sk->sk_bound_dev_if);
1159         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1160         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1161 }
1162 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1163
1164 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1165 {
1166         struct tcp_sock *tp = tcp_sk(sk);
1167         struct tcp_md5sig_info *md5sig;
1168
1169         md5sig = kmalloc(sizeof(*md5sig), gfp);
1170         if (!md5sig)
1171                 return -ENOMEM;
1172
1173         sk_gso_disable(sk);
1174         INIT_HLIST_HEAD(&md5sig->head);
1175         rcu_assign_pointer(tp->md5sig_info, md5sig);
1176         return 0;
1177 }
1178
1179 /* This can be called on a newly created socket, from other files */
1180 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1181                             int family, u8 prefixlen, int l3index, u8 flags,
1182                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1183 {
1184         /* Add Key to the list */
1185         struct tcp_md5sig_key *key;
1186         struct tcp_sock *tp = tcp_sk(sk);
1187         struct tcp_md5sig_info *md5sig;
1188
1189         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1190         if (key) {
1191                 /* Pre-existing entry - just update that one.
1192                  * Note that the key might be used concurrently.
1193                  * data_race() is telling kcsan that we do not care about
1194                  * key mismatches, since changing the MD5 key on live flows
1195                  * can lead to packet drops.
1196                  */
1197                 data_race(memcpy(key->key, newkey, newkeylen));
1198
1199                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1200                  * Also note that a reader could catch the new key->keylen value
1201                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1202                  * at sock_kmalloc() time below these lines.
1203                  */
1204                 WRITE_ONCE(key->keylen, newkeylen);
1205
1206                 return 0;
1207         }
1208
1209         md5sig = rcu_dereference_protected(tp->md5sig_info,
1210                                            lockdep_sock_is_held(sk));
1211
1212         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1213         if (!key)
1214                 return -ENOMEM;
1215         if (!tcp_alloc_md5sig_pool()) {
1216                 sock_kfree_s(sk, key, sizeof(*key));
1217                 return -ENOMEM;
1218         }
1219
1220         memcpy(key->key, newkey, newkeylen);
1221         key->keylen = newkeylen;
1222         key->family = family;
1223         key->prefixlen = prefixlen;
1224         key->l3index = l3index;
1225         key->flags = flags;
1226         memcpy(&key->addr, addr,
1227                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1228                                                                  sizeof(struct in_addr));
1229         hlist_add_head_rcu(&key->node, &md5sig->head);
1230         return 0;
1231 }
1232
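/* Add (or update) an MD5 key on a socket from process context.  The first key
 * allocates the md5sig_info and takes a reference on the tcp_md5_needed static
 * branch; if that reference cannot be taken, the just-allocated info is freed
 * again and -EUSERS is returned.
 */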
1233 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1234                    int family, u8 prefixlen, int l3index, u8 flags,
1235                    const u8 *newkey, u8 newkeylen)
1236 {
1237         struct tcp_sock *tp = tcp_sk(sk);
1238
1239         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1240                 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1241                         return -ENOMEM;
1242
1243                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1244                         struct tcp_md5sig_info *md5sig;
1245
1246                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1247                         rcu_assign_pointer(tp->md5sig_info, NULL);
1248                         kfree_rcu(md5sig, rcu);
1249                         return -EUSERS;
1250                 }
1251         }
1252
1253         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1254                                 newkey, newkeylen, GFP_KERNEL);
1255 }
1256 EXPORT_SYMBOL(tcp_md5_do_add);
1257
1258 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1259                      int family, u8 prefixlen, int l3index,
1260                      struct tcp_md5sig_key *key)
1261 {
1262         struct tcp_sock *tp = tcp_sk(sk);
1263
1264         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1265                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1266                         return -ENOMEM;
1267
1268                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1269                         struct tcp_md5sig_info *md5sig;
1270
1271                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1272                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1273                         rcu_assign_pointer(tp->md5sig_info, NULL);
1274                         kfree_rcu(md5sig, rcu);
1275                         return -EUSERS;
1276                 }
1277         }
1278
1279         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1280                                 key->flags, key->key, key->keylen,
1281                                 sk_gfp_mask(sk, GFP_ATOMIC));
1282 }
1283 EXPORT_SYMBOL(tcp_md5_key_copy);
1284
1285 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1286                    u8 prefixlen, int l3index, u8 flags)
1287 {
1288         struct tcp_md5sig_key *key;
1289
1290         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1291         if (!key)
1292                 return -ENOENT;
1293         hlist_del_rcu(&key->node);
1294         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1295         kfree_rcu(key, rcu);
1296         return 0;
1297 }
1298 EXPORT_SYMBOL(tcp_md5_do_del);
1299
1300 static void tcp_clear_md5_list(struct sock *sk)
1301 {
1302         struct tcp_sock *tp = tcp_sk(sk);
1303         struct tcp_md5sig_key *key;
1304         struct hlist_node *n;
1305         struct tcp_md5sig_info *md5sig;
1306
1307         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1308
1309         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1310                 hlist_del_rcu(&key->node);
1311                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1312                 kfree_rcu(key, rcu);
1313         }
1314 }
1315
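/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler for IPv4 sockets: a zero
 * tcpm_keylen deletes the key for the given address (and the optional prefix /
 * ifindex taken from TCP_MD5SIG_EXT); otherwise a key of at most
 * TCP_MD5SIG_MAXKEYLEN bytes is installed via tcp_md5_do_add().
 */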
1316 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1317                                  sockptr_t optval, int optlen)
1318 {
1319         struct tcp_md5sig cmd;
1320         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1321         const union tcp_md5_addr *addr;
1322         u8 prefixlen = 32;
1323         int l3index = 0;
1324         u8 flags;
1325
1326         if (optlen < sizeof(cmd))
1327                 return -EINVAL;
1328
1329         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1330                 return -EFAULT;
1331
1332         if (sin->sin_family != AF_INET)
1333                 return -EINVAL;
1334
1335         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1336
1337         if (optname == TCP_MD5SIG_EXT &&
1338             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1339                 prefixlen = cmd.tcpm_prefixlen;
1340                 if (prefixlen > 32)
1341                         return -EINVAL;
1342         }
1343
1344         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1345             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1346                 struct net_device *dev;
1347
1348                 rcu_read_lock();
1349                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1350                 if (dev && netif_is_l3_master(dev))
1351                         l3index = dev->ifindex;
1352
1353                 rcu_read_unlock();
1354
1355                 /* ok to reference set/not set outside of rcu;
1356                  * right now device MUST be an L3 master
1357                  */
1358                 if (!dev || !l3index)
1359                         return -EINVAL;
1360         }
1361
1362         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1363
1364         if (!cmd.tcpm_keylen)
1365                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1366
1367         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1368                 return -EINVAL;
1369
1370         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1371                               cmd.tcpm_key, cmd.tcpm_keylen);
1372 }
1373
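/* Feed the IPv4 pseudo-header followed by the TCP header (with its checksum
 * field zeroed) into the pending MD5 digest.
 */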
1374 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1375                                    __be32 daddr, __be32 saddr,
1376                                    const struct tcphdr *th, int nbytes)
1377 {
1378         struct tcp4_pseudohdr *bp;
1379         struct scatterlist sg;
1380         struct tcphdr *_th;
1381
1382         bp = hp->scratch;
1383         bp->saddr = saddr;
1384         bp->daddr = daddr;
1385         bp->pad = 0;
1386         bp->protocol = IPPROTO_TCP;
1387         bp->len = cpu_to_be16(nbytes);
1388
1389         _th = (struct tcphdr *)(bp + 1);
1390         memcpy(_th, th, sizeof(*th));
1391         _th->check = 0;
1392
1393         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1394         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1395                                 sizeof(*bp) + sizeof(*th));
1396         return crypto_ahash_update(hp->md5_req);
1397 }
1398
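/* Compute the MD5 signature over the pseudo-header, the TCP header and the
 * key only (no payload).  On failure, md5_hash is zeroed and 1 is returned.
 */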
1399 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1400                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1401 {
1402         struct tcp_md5sig_pool *hp;
1403         struct ahash_request *req;
1404
1405         hp = tcp_get_md5sig_pool();
1406         if (!hp)
1407                 goto clear_hash_noput;
1408         req = hp->md5_req;
1409
1410         if (crypto_ahash_init(req))
1411                 goto clear_hash;
1412         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1413                 goto clear_hash;
1414         if (tcp_md5_hash_key(hp, key))
1415                 goto clear_hash;
1416         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1417         if (crypto_ahash_final(req))
1418                 goto clear_hash;
1419
1420         tcp_put_md5sig_pool();
1421         return 0;
1422
1423 clear_hash:
1424         tcp_put_md5sig_pool();
1425 clear_hash_noput:
1426         memset(md5_hash, 0, 16);
1427         return 1;
1428 }
1429
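/* Like tcp_v4_md5_hash_hdr(), but also hashes the TCP payload of @skb.  The
 * addresses come from the socket when one is given (established/request
 * sockets), otherwise from the IP header of @skb.
 */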
1430 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1431                         const struct sock *sk,
1432                         const struct sk_buff *skb)
1433 {
1434         struct tcp_md5sig_pool *hp;
1435         struct ahash_request *req;
1436         const struct tcphdr *th = tcp_hdr(skb);
1437         __be32 saddr, daddr;
1438
1439         if (sk) { /* valid for establish/request sockets */
1440                 saddr = sk->sk_rcv_saddr;
1441                 daddr = sk->sk_daddr;
1442         } else {
1443                 const struct iphdr *iph = ip_hdr(skb);
1444                 saddr = iph->saddr;
1445                 daddr = iph->daddr;
1446         }
1447
1448         hp = tcp_get_md5sig_pool();
1449         if (!hp)
1450                 goto clear_hash_noput;
1451         req = hp->md5_req;
1452
1453         if (crypto_ahash_init(req))
1454                 goto clear_hash;
1455
1456         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1457                 goto clear_hash;
1458         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1459                 goto clear_hash;
1460         if (tcp_md5_hash_key(hp, key))
1461                 goto clear_hash;
1462         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1463         if (crypto_ahash_final(req))
1464                 goto clear_hash;
1465
1466         tcp_put_md5sig_pool();
1467         return 0;
1468
1469 clear_hash:
1470         tcp_put_md5sig_pool();
1471 clear_hash_noput:
1472         memset(md5_hash, 0, 16);
1473         return 1;
1474 }
1475 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1476
1477 #endif
1478
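/* Fill the IPv4-specific fields of a new request sock from the incoming SYN:
 * mirror the peer/local addresses and save any IP options carried by the SYN.
 */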
1479 static void tcp_v4_init_req(struct request_sock *req,
1480                             const struct sock *sk_listener,
1481                             struct sk_buff *skb)
1482 {
1483         struct inet_request_sock *ireq = inet_rsk(req);
1484         struct net *net = sock_net(sk_listener);
1485
1486         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1487         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1488         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1489 }
1490
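/* Initialise the request sock from the SYN, run the LSM hook and, if the
 * security check passes, return a route for replying; NULL otherwise.
 */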
1491 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1492                                           struct sk_buff *skb,
1493                                           struct flowi *fl,
1494                                           struct request_sock *req)
1495 {
1496         tcp_v4_init_req(req, sk, skb);
1497
1498         if (security_inet_conn_request(sk, skb, req))
1499                 return NULL;
1500
1501         return inet_csk_route_req(sk, &fl->u.ip4, req);
1502 }
1503
1504 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1505         .family         =       PF_INET,
1506         .obj_size       =       sizeof(struct tcp_request_sock),
1507         .rtx_syn_ack    =       tcp_rtx_synack,
1508         .send_ack       =       tcp_v4_reqsk_send_ack,
1509         .destructor     =       tcp_v4_reqsk_destructor,
1510         .send_reset     =       tcp_v4_send_reset,
1511         .syn_ack_timeout =      tcp_syn_ack_timeout,
1512 };
1513
1514 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1515         .mss_clamp      =       TCP_MSS_DEFAULT,
1516 #ifdef CONFIG_TCP_MD5SIG
1517         .req_md5_lookup =       tcp_v4_md5_lookup,
1518         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1519 #endif
1520 #ifdef CONFIG_SYN_COOKIES
1521         .cookie_init_seq =      cookie_v4_init_sequence,
1522 #endif
1523         .route_req      =       tcp_v4_route_req,
1524         .init_seq       =       tcp_v4_init_seq,
1525         .init_ts_off    =       tcp_v4_init_ts_off,
1526         .send_synack    =       tcp_v4_send_synack,
1527 };
1528
1529 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1530 {
1531         /* Never answer SYNs sent to broadcast or multicast */
1532         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1533                 goto drop;
1534
1535         return tcp_conn_request(&tcp_request_sock_ops,
1536                                 &tcp_request_sock_ipv4_ops, sk, skb);
1537
1538 drop:
1539         tcp_listendrop(sk);
1540         return 0;
1541 }
1542 EXPORT_SYMBOL(tcp_v4_conn_request);
1543
1544
1545 /*
1546  * The three-way handshake has completed - we got a valid synack -
1547  * now create the new socket.
1548  */
1549 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1550                                   struct request_sock *req,
1551                                   struct dst_entry *dst,
1552                                   struct request_sock *req_unhash,
1553                                   bool *own_req)
1554 {
1555         struct inet_request_sock *ireq;
1556         bool found_dup_sk = false;
1557         struct inet_sock *newinet;
1558         struct tcp_sock *newtp;
1559         struct sock *newsk;
1560 #ifdef CONFIG_TCP_MD5SIG
1561         const union tcp_md5_addr *addr;
1562         struct tcp_md5sig_key *key;
1563         int l3index;
1564 #endif
1565         struct ip_options_rcu *inet_opt;
1566
1567         if (sk_acceptq_is_full(sk))
1568                 goto exit_overflow;
1569
1570         newsk = tcp_create_openreq_child(sk, req, skb);
1571         if (!newsk)
1572                 goto exit_nonewsk;
1573
1574         newsk->sk_gso_type = SKB_GSO_TCPV4;
1575         inet_sk_rx_dst_set(newsk, skb);
1576
1577         newtp                 = tcp_sk(newsk);
1578         newinet               = inet_sk(newsk);
1579         ireq                  = inet_rsk(req);
1580         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1581         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1582         newsk->sk_bound_dev_if = ireq->ir_iif;
1583         newinet->inet_saddr   = ireq->ir_loc_addr;
1584         inet_opt              = rcu_dereference(ireq->ireq_opt);
1585         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1586         newinet->mc_index     = inet_iif(skb);
1587         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1588         newinet->rcv_tos      = ip_hdr(skb)->tos;
1589         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1590         if (inet_opt)
1591                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1592         newinet->inet_id = get_random_u16();
1593
1594         /* Set ToS of the new socket based upon the value of incoming SYN.
1595          * ECT bits are set later in tcp_init_transfer().
1596          */
1597         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1598                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1599
1600         if (!dst) {
1601                 dst = inet_csk_route_child_sock(sk, newsk, req);
1602                 if (!dst)
1603                         goto put_and_exit;
1604         } else {
1605                 /* syncookie case : see end of cookie_v4_check() */
1606         }
1607         sk_setup_caps(newsk, dst);
1608
1609         tcp_ca_openreq_child(newsk, dst);
1610
1611         tcp_sync_mss(newsk, dst_mtu(dst));
1612         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1613
1614         tcp_initialize_rcv_mss(newsk);
1615
1616 #ifdef CONFIG_TCP_MD5SIG
1617         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1618         /* Copy over the MD5 key from the original socket */
1619         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1620         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1621         if (key) {
1622                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1623                         goto put_and_exit;
1624                 sk_gso_disable(newsk);
1625         }
1626 #endif
1627
1628         if (__inet_inherit_port(sk, newsk) < 0)
1629                 goto put_and_exit;
1630         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1631                                        &found_dup_sk);
1632         if (likely(*own_req)) {
1633                 tcp_move_syn(newtp, req);
1634                 ireq->ireq_opt = NULL;
1635         } else {
1636                 newinet->inet_opt = NULL;
1637
1638                 if (!req_unhash && found_dup_sk) {
1639                         /* This code path should only be executed in the
1640                          * syncookie case
1641                          */
1642                         bh_unlock_sock(newsk);
1643                         sock_put(newsk);
1644                         newsk = NULL;
1645                 }
1646         }
1647         return newsk;
1648
1649 exit_overflow:
1650         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1651 exit_nonewsk:
1652         dst_release(dst);
1653 exit:
1654         tcp_listendrop(sk);
1655         return NULL;
1656 put_and_exit:
1657         newinet->inet_opt = NULL;
1658         inet_csk_prepare_forced_close(newsk);
1659         tcp_done(newsk);
1660         goto exit;
1661 }
1662 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1663
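/* For a non-SYN segment reaching a listener, let cookie_v4_check() turn a
 * valid SYN-cookie ACK into a child socket.  Without CONFIG_SYN_COOKIES this
 * is a no-op and the listener itself is returned.
 */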
1664 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1665 {
1666 #ifdef CONFIG_SYN_COOKIES
1667         const struct tcphdr *th = tcp_hdr(skb);
1668
1669         if (!th->syn)
1670                 sk = cookie_v4_check(sk, skb);
1671 #endif
1672         return sk;
1673 }
1674
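/* Derive the MSS and an initial sequence number (the cookie) that a SYN
 * would get.  Returns 0 when CONFIG_SYN_COOKIES is disabled or no MSS could
 * be derived.
 */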
1675 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1676                          struct tcphdr *th, u32 *cookie)
1677 {
1678         u16 mss = 0;
1679 #ifdef CONFIG_SYN_COOKIES
1680         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1681                                     &tcp_request_sock_ipv4_ops, sk, th);
1682         if (mss) {
1683                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1684                 tcp_synq_overflow(sk);
1685         }
1686 #endif
1687         return mss;
1688 }
1689
1690 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1691                                                            u32));
1692 /* The socket must have its spinlock held when we get
1693  * here, unless it is a TCP_LISTEN socket.
1694  *
1695  * We have a potential double-lock case here, so even when
1696  * doing backlog processing we use the BH locking scheme.
1697  * This is because we cannot sleep with the original spinlock
1698  * held.
1699  */
1700 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1701 {
1702         enum skb_drop_reason reason;
1703         struct sock *rsk;
1704
1705         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1706                 struct dst_entry *dst;
1707
1708                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1709                                                 lockdep_sock_is_held(sk));
1710
1711                 sock_rps_save_rxhash(sk, skb);
1712                 sk_mark_napi_id(sk, skb);
1713                 if (dst) {
1714                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1715                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1716                                              dst, 0)) {
1717                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1718                                 dst_release(dst);
1719                         }
1720                 }
1721                 tcp_rcv_established(sk, skb);
1722                 return 0;
1723         }
1724
1725         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1726         if (tcp_checksum_complete(skb))
1727                 goto csum_err;
1728
1729         if (sk->sk_state == TCP_LISTEN) {
1730                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1731
1732                 if (!nsk)
1733                         goto discard;
1734                 if (nsk != sk) {
1735                         if (tcp_child_process(sk, nsk, skb)) {
1736                                 rsk = nsk;
1737                                 goto reset;
1738                         }
1739                         return 0;
1740                 }
1741         } else
1742                 sock_rps_save_rxhash(sk, skb);
1743
1744         if (tcp_rcv_state_process(sk, skb)) {
1745                 rsk = sk;
1746                 goto reset;
1747         }
1748         return 0;
1749
1750 reset:
1751         tcp_v4_send_reset(rsk, skb);
1752 discard:
1753         kfree_skb_reason(skb, reason);
1754         /* Be careful here. If this function gets more complicated and
1755          * gcc suffers from register pressure on the x86, sk (in %ebx)
1756          * might be destroyed here. This current version compiles correctly,
1757          * but you have been warned.
1758          */
1759         return 0;
1760
1761 csum_err:
1762         reason = SKB_DROP_REASON_TCP_CSUM;
1763         trace_tcp_bad_csum(skb);
1764         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1765         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1766         goto discard;
1767 }
1768 EXPORT_SYMBOL(tcp_v4_do_rcv);
1769
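/* Early demux: look the segment up in the established hash before routing,
 * so that the socket and its cached rx dst can be attached to the skb.
 */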
1770 int tcp_v4_early_demux(struct sk_buff *skb)
1771 {
1772         struct net *net = dev_net(skb->dev);
1773         const struct iphdr *iph;
1774         const struct tcphdr *th;
1775         struct sock *sk;
1776
1777         if (skb->pkt_type != PACKET_HOST)
1778                 return 0;
1779
1780         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1781                 return 0;
1782
1783         iph = ip_hdr(skb);
1784         th = tcp_hdr(skb);
1785
1786         if (th->doff < sizeof(struct tcphdr) / 4)
1787                 return 0;
1788
1789         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1790                                        iph->saddr, th->source,
1791                                        iph->daddr, ntohs(th->dest),
1792                                        skb->skb_iif, inet_sdif(skb));
1793         if (sk) {
1794                 skb->sk = sk;
1795                 skb->destructor = sock_edemux;
1796                 if (sk_fullsock(sk)) {
1797                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1798
1799                         if (dst)
1800                                 dst = dst_check(dst, 0);
1801                         if (dst &&
1802                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1803                                 skb_dst_set_noref(skb, dst);
1804                 }
1805         }
1806         return 0;
1807 }
1808
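/* Queue @skb onto the backlog of a user-owned socket, coalescing it with the
 * backlog tail when possible.  Returns true when the segment could not be
 * queued (checksum error or backlog limit exceeded); the socket has already
 * been unlocked in that case.
 */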
1809 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1810                      enum skb_drop_reason *reason)
1811 {
1812         u32 limit, tail_gso_size, tail_gso_segs;
1813         struct skb_shared_info *shinfo;
1814         const struct tcphdr *th;
1815         struct tcphdr *thtail;
1816         struct sk_buff *tail;
1817         unsigned int hdrlen;
1818         bool fragstolen;
1819         u32 gso_segs;
1820         u32 gso_size;
1821         int delta;
1822
1823         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1824          * we can fix skb->truesize to its real value to avoid future drops.
1825          * This is valid because skb is not yet charged to the socket.
1826          * It has been noticed that pure SACK packets were sometimes dropped
1827          * (if cooked by drivers without a copybreak feature).
1828          */
1829         skb_condense(skb);
1830
1831         skb_dst_drop(skb);
1832
1833         if (unlikely(tcp_checksum_complete(skb))) {
1834                 bh_unlock_sock(sk);
1835                 trace_tcp_bad_csum(skb);
1836                 *reason = SKB_DROP_REASON_TCP_CSUM;
1837                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1838                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1839                 return true;
1840         }
1841
1842         /* Attempt coalescing to last skb in backlog, even if we are
1843          * above the limits.
1844          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1845          */
1846         th = (const struct tcphdr *)skb->data;
1847         hdrlen = th->doff * 4;
1848
1849         tail = sk->sk_backlog.tail;
1850         if (!tail)
1851                 goto no_coalesce;
1852         thtail = (struct tcphdr *)tail->data;
1853
1854         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1855             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1856             ((TCP_SKB_CB(tail)->tcp_flags |
1857               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1858             !((TCP_SKB_CB(tail)->tcp_flags &
1859               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1860             ((TCP_SKB_CB(tail)->tcp_flags ^
1861               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1862 #ifdef CONFIG_TLS_DEVICE
1863             tail->decrypted != skb->decrypted ||
1864 #endif
1865             thtail->doff != th->doff ||
1866             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1867                 goto no_coalesce;
1868
1869         __skb_pull(skb, hdrlen);
1870
1871         shinfo = skb_shinfo(skb);
1872         gso_size = shinfo->gso_size ?: skb->len;
1873         gso_segs = shinfo->gso_segs ?: 1;
1874
1875         shinfo = skb_shinfo(tail);
1876         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1877         tail_gso_segs = shinfo->gso_segs ?: 1;
1878
1879         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1880                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1881
1882                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1883                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1884                         thtail->window = th->window;
1885                 }
1886
1887                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1888                  * thtail->fin, so that the fast path in tcp_rcv_established()
1889                  * is not entered if we append a packet with a FIN.
1890                  * SYN, RST, URG are not present.
1891                  * ACK is set on both packets.
1892                  * PSH : we do not really care in TCP stack,
1893                  *       at least for 'GRO' packets.
1894                  */
1895                 thtail->fin |= th->fin;
1896                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1897
1898                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1899                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1900                         tail->tstamp = skb->tstamp;
1901                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1902                 }
1903
1904                 /* Not as strict as GRO. We only need to carry mss max value */
1905                 shinfo->gso_size = max(gso_size, tail_gso_size);
1906                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1907
1908                 sk->sk_backlog.len += delta;
1909                 __NET_INC_STATS(sock_net(sk),
1910                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1911                 kfree_skb_partial(skb, fragstolen);
1912                 return false;
1913         }
1914         __skb_push(skb, hdrlen);
1915
1916 no_coalesce:
1917         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1918
1919         /* Only socket owner can try to collapse/prune rx queues
1920          * to reduce memory overhead, so add a little headroom here.
1921          * Only a few socket backlogs are likely to be non-empty at the same time.
1922          */
1923         limit += 64 * 1024;
1924
1925         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1926                 bh_unlock_sock(sk);
1927                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1928                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1929                 return true;
1930         }
1931         return false;
1932 }
1933 EXPORT_SYMBOL(tcp_add_backlog);
1934
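/* Run the socket filter on @skb; the filter may trim the packet, but never
 * below the TCP header.
 */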
1935 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1936 {
1937         struct tcphdr *th = (struct tcphdr *)skb->data;
1938
1939         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1940 }
1941 EXPORT_SYMBOL(tcp_filter);
1942
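/* Undo tcp_v4_fill_cb(): move the IP control block back to its usual place
 * in the skb before the segment is handed to another lookup/processing path.
 */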
1943 static void tcp_v4_restore_cb(struct sk_buff *skb)
1944 {
1945         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1946                 sizeof(struct inet_skb_parm));
1947 }
1948
1949 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1950                            const struct tcphdr *th)
1951 {
1952         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1953          * barrier() makes sure the compiler won't play fool^Waliasing games.
1954          */
1955         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1956                 sizeof(struct inet_skb_parm));
1957         barrier();
1958
1959         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1960         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1961                                     skb->len - th->doff * 4);
1962         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1963         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1964         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1965         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1966         TCP_SKB_CB(skb)->sacked  = 0;
1967         TCP_SKB_CB(skb)->has_rxtstamp =
1968                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1969 }
1970
1971 /*
1972  *      From tcp_input.c
1973  */
1974
1975 int tcp_v4_rcv(struct sk_buff *skb)
1976 {
1977         struct net *net = dev_net(skb->dev);
1978         enum skb_drop_reason drop_reason;
1979         int sdif = inet_sdif(skb);
1980         int dif = inet_iif(skb);
1981         const struct iphdr *iph;
1982         const struct tcphdr *th;
1983         bool refcounted;
1984         struct sock *sk;
1985         int ret;
1986
1987         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1988         if (skb->pkt_type != PACKET_HOST)
1989                 goto discard_it;
1990
1991         /* Count it even if it's bad */
1992         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1993
1994         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1995                 goto discard_it;
1996
1997         th = (const struct tcphdr *)skb->data;
1998
1999         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2000                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2001                 goto bad_packet;
2002         }
2003         if (!pskb_may_pull(skb, th->doff * 4))
2004                 goto discard_it;
2005
2006         /* An explanation is required here, I think.
2007          * Packet length and doff are validated by header prediction,
2008          * provided the case of th->doff == 0 is eliminated.
2009          * So, we defer the checks. */
2010
2011         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2012                 goto csum_error;
2013
2014         th = (const struct tcphdr *)skb->data;
2015         iph = ip_hdr(skb);
2016 lookup:
2017         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2018                                skb, __tcp_hdrlen(th), th->source,
2019                                th->dest, sdif, &refcounted);
2020         if (!sk)
2021                 goto no_tcp_socket;
2022
2023 process:
2024         if (sk->sk_state == TCP_TIME_WAIT)
2025                 goto do_time_wait;
2026
2027         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2028                 struct request_sock *req = inet_reqsk(sk);
2029                 bool req_stolen = false;
2030                 struct sock *nsk;
2031
2032                 sk = req->rsk_listener;
2033                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2034                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2035                 else
2036                         drop_reason = tcp_inbound_md5_hash(sk, skb,
2037                                                    &iph->saddr, &iph->daddr,
2038                                                    AF_INET, dif, sdif);
2039                 if (unlikely(drop_reason)) {
2040                         sk_drops_add(sk, skb);
2041                         reqsk_put(req);
2042                         goto discard_it;
2043                 }
2044                 if (tcp_checksum_complete(skb)) {
2045                         reqsk_put(req);
2046                         goto csum_error;
2047                 }
2048                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2049                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2050                         if (!nsk) {
2051                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2052                                 goto lookup;
2053                         }
2054                         sk = nsk;
2055                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2056                          * reference before returning.
2057                          */
2058                 } else {
2059                         /* We own a reference on the listener, increase it again
2060                          * as we might lose it too soon.
2061                          */
2062                         sock_hold(sk);
2063                 }
2064                 refcounted = true;
2065                 nsk = NULL;
2066                 if (!tcp_filter(sk, skb)) {
2067                         th = (const struct tcphdr *)skb->data;
2068                         iph = ip_hdr(skb);
2069                         tcp_v4_fill_cb(skb, iph, th);
2070                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2071                 } else {
2072                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2073                 }
2074                 if (!nsk) {
2075                         reqsk_put(req);
2076                         if (req_stolen) {
2077                                 /* Another cpu got exclusive access to req
2078                                  * and created a full blown socket.
2079                                  * Try to feed this packet to this socket
2080                                  * instead of discarding it.
2081                                  */
2082                                 tcp_v4_restore_cb(skb);
2083                                 sock_put(sk);
2084                                 goto lookup;
2085                         }
2086                         goto discard_and_relse;
2087                 }
2088                 nf_reset_ct(skb);
2089                 if (nsk == sk) {
2090                         reqsk_put(req);
2091                         tcp_v4_restore_cb(skb);
2092                 } else if (tcp_child_process(sk, nsk, skb)) {
2093                         tcp_v4_send_reset(nsk, skb);
2094                         goto discard_and_relse;
2095                 } else {
2096                         sock_put(sk);
2097                         return 0;
2098                 }
2099         }
2100
2101         if (static_branch_unlikely(&ip4_min_ttl)) {
2102                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2103                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2104                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2105                         goto discard_and_relse;
2106                 }
2107         }
2108
2109         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2110                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2111                 goto discard_and_relse;
2112         }
2113
2114         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2115                                            &iph->daddr, AF_INET, dif, sdif);
2116         if (drop_reason)
2117                 goto discard_and_relse;
2118
2119         nf_reset_ct(skb);
2120
2121         if (tcp_filter(sk, skb)) {
2122                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2123                 goto discard_and_relse;
2124         }
2125         th = (const struct tcphdr *)skb->data;
2126         iph = ip_hdr(skb);
2127         tcp_v4_fill_cb(skb, iph, th);
2128
2129         skb->dev = NULL;
2130
2131         if (sk->sk_state == TCP_LISTEN) {
2132                 ret = tcp_v4_do_rcv(sk, skb);
2133                 goto put_and_return;
2134         }
2135
2136         sk_incoming_cpu_update(sk);
2137
2138         bh_lock_sock_nested(sk);
2139         tcp_segs_in(tcp_sk(sk), skb);
2140         ret = 0;
2141         if (!sock_owned_by_user(sk)) {
2142                 ret = tcp_v4_do_rcv(sk, skb);
2143         } else {
2144                 if (tcp_add_backlog(sk, skb, &drop_reason))
2145                         goto discard_and_relse;
2146         }
2147         bh_unlock_sock(sk);
2148
2149 put_and_return:
2150         if (refcounted)
2151                 sock_put(sk);
2152
2153         return ret;
2154
2155 no_tcp_socket:
2156         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2157         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2158                 goto discard_it;
2159
2160         tcp_v4_fill_cb(skb, iph, th);
2161
2162         if (tcp_checksum_complete(skb)) {
2163 csum_error:
2164                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2165                 trace_tcp_bad_csum(skb);
2166                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2167 bad_packet:
2168                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2169         } else {
2170                 tcp_v4_send_reset(NULL, skb);
2171         }
2172
2173 discard_it:
2174         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2175         /* Discard frame. */
2176         kfree_skb_reason(skb, drop_reason);
2177         return 0;
2178
2179 discard_and_relse:
2180         sk_drops_add(sk, skb);
2181         if (refcounted)
2182                 sock_put(sk);
2183         goto discard_it;
2184
2185 do_time_wait:
2186         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2187                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2188                 inet_twsk_put(inet_twsk(sk));
2189                 goto discard_it;
2190         }
2191
2192         tcp_v4_fill_cb(skb, iph, th);
2193
2194         if (tcp_checksum_complete(skb)) {
2195                 inet_twsk_put(inet_twsk(sk));
2196                 goto csum_error;
2197         }
2198         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2199         case TCP_TW_SYN: {
2200                 struct sock *sk2 = inet_lookup_listener(net,
2201                                                         net->ipv4.tcp_death_row.hashinfo,
2202                                                         skb, __tcp_hdrlen(th),
2203                                                         iph->saddr, th->source,
2204                                                         iph->daddr, th->dest,
2205                                                         inet_iif(skb),
2206                                                         sdif);
2207                 if (sk2) {
2208                         inet_twsk_deschedule_put(inet_twsk(sk));
2209                         sk = sk2;
2210                         tcp_v4_restore_cb(skb);
2211                         refcounted = false;
2212                         goto process;
2213                 }
2214         }
2215                 /* to ACK */
2216                 fallthrough;
2217         case TCP_TW_ACK:
2218                 tcp_v4_timewait_ack(sk, skb);
2219                 break;
2220         case TCP_TW_RST:
2221                 tcp_v4_send_reset(sk, skb);
2222                 inet_twsk_deschedule_put(inet_twsk(sk));
2223                 goto discard_it;
2224         case TCP_TW_SUCCESS:;
2225         }
2226         goto discard_it;
2227 }
2228
2229 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2230         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2231         .twsk_unique    = tcp_twsk_unique,
2232         .twsk_destructor= tcp_twsk_destructor,
2233 };
2234
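/* Cache the input route of @skb on the socket, together with the incoming
 * ifindex, so the established fast path can reuse it for later segments.
 */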
2235 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2236 {
2237         struct dst_entry *dst = skb_dst(skb);
2238
2239         if (dst && dst_hold_safe(dst)) {
2240                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2241                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2242         }
2243 }
2244 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2245
2246 const struct inet_connection_sock_af_ops ipv4_specific = {
2247         .queue_xmit        = ip_queue_xmit,
2248         .send_check        = tcp_v4_send_check,
2249         .rebuild_header    = inet_sk_rebuild_header,
2250         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2251         .conn_request      = tcp_v4_conn_request,
2252         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2253         .net_header_len    = sizeof(struct iphdr),
2254         .setsockopt        = ip_setsockopt,
2255         .getsockopt        = ip_getsockopt,
2256         .addr2sockaddr     = inet_csk_addr2sockaddr,
2257         .sockaddr_len      = sizeof(struct sockaddr_in),
2258         .mtu_reduced       = tcp_v4_mtu_reduced,
2259 };
2260 EXPORT_SYMBOL(ipv4_specific);
2261
2262 #ifdef CONFIG_TCP_MD5SIG
2263 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2264         .md5_lookup             = tcp_v4_md5_lookup,
2265         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2266         .md5_parse              = tcp_v4_parse_md5_keys,
2267 };
2268 #endif
2269
2270 /* NOTE: A lot of things are set to zero explicitly by the call to
2271  *       sk_alloc(), so they need not be done here.
2272  */
2273 static int tcp_v4_init_sock(struct sock *sk)
2274 {
2275         struct inet_connection_sock *icsk = inet_csk(sk);
2276
2277         tcp_init_sock(sk);
2278
2279         icsk->icsk_af_ops = &ipv4_specific;
2280
2281 #ifdef CONFIG_TCP_MD5SIG
2282         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2283 #endif
2284
2285         return 0;
2286 }
2287
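/* Per-socket teardown: stop timers, purge the write and out-of-order queues,
 * release MD5 keys, fastopen state and the bound port.
 */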
2288 void tcp_v4_destroy_sock(struct sock *sk)
2289 {
2290         struct tcp_sock *tp = tcp_sk(sk);
2291
2292         trace_tcp_destroy_sock(sk);
2293
2294         tcp_clear_xmit_timers(sk);
2295
2296         tcp_cleanup_congestion_control(sk);
2297
2298         tcp_cleanup_ulp(sk);
2299
2300         /* Clean up the write buffer. */
2301         tcp_write_queue_purge(sk);
2302
2303         /* Check if we want to disable active TFO */
2304         tcp_fastopen_active_disable_ofo_check(sk);
2305
2306         /* Cleans up our, hopefully empty, out_of_order_queue. */
2307         skb_rbtree_purge(&tp->out_of_order_queue);
2308
2309 #ifdef CONFIG_TCP_MD5SIG
2310         /* Clean up the MD5 key list, if any */
2311         if (tp->md5sig_info) {
2312                 tcp_clear_md5_list(sk);
2313                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2314                 tp->md5sig_info = NULL;
2315                 static_branch_slow_dec_deferred(&tcp_md5_needed);
2316         }
2317 #endif
2318
2319         /* Clean up a referenced TCP bind bucket. */
2320         if (inet_csk(sk)->icsk_bind_hash)
2321                 inet_put_port(sk);
2322
2323         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2324
2325         /* If socket is aborted during connect operation */
2326         tcp_free_fastopen_req(tp);
2327         tcp_fastopen_destroy_cipher(sk);
2328         tcp_saved_syn_free(tp);
2329
2330         sk_sockets_allocated_dec(sk);
2331 }
2332 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2333
2334 #ifdef CONFIG_PROC_FS
2335 /* Proc filesystem TCP sock list dumping. */
2336
2337 static unsigned short seq_file_family(const struct seq_file *seq);
2338
2339 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2340 {
2341         unsigned short family = seq_file_family(seq);
2342
2343         /* AF_UNSPEC is used as a match all */
2344         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2345                 net_eq(sock_net(sk), seq_file_net(seq)));
2346 }
2347
2348 /* Find a non-empty bucket (starting from st->bucket)
2349  * and return the first sk from it.
2350  */
2351 static void *listening_get_first(struct seq_file *seq)
2352 {
2353         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2354         struct tcp_iter_state *st = seq->private;
2355
2356         st->offset = 0;
2357         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2358                 struct inet_listen_hashbucket *ilb2;
2359                 struct hlist_nulls_node *node;
2360                 struct sock *sk;
2361
2362                 ilb2 = &hinfo->lhash2[st->bucket];
2363                 if (hlist_nulls_empty(&ilb2->nulls_head))
2364                         continue;
2365
2366                 spin_lock(&ilb2->lock);
2367                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2368                         if (seq_sk_match(seq, sk))
2369                                 return sk;
2370                 }
2371                 spin_unlock(&ilb2->lock);
2372         }
2373
2374         return NULL;
2375 }
2376
2377 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2378  * If "cur" is the last one in the st->bucket,
2379  * call listening_get_first() to return the first sk of the next
2380  * non empty bucket.
2381  * non-empty bucket.
2382 static void *listening_get_next(struct seq_file *seq, void *cur)
2383 {
2384         struct tcp_iter_state *st = seq->private;
2385         struct inet_listen_hashbucket *ilb2;
2386         struct hlist_nulls_node *node;
2387         struct inet_hashinfo *hinfo;
2388         struct sock *sk = cur;
2389
2390         ++st->num;
2391         ++st->offset;
2392
2393         sk = sk_nulls_next(sk);
2394         sk_nulls_for_each_from(sk, node) {
2395                 if (seq_sk_match(seq, sk))
2396                         return sk;
2397         }
2398
2399         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2400         ilb2 = &hinfo->lhash2[st->bucket];
2401         spin_unlock(&ilb2->lock);
2402         ++st->bucket;
2403         return listening_get_first(seq);
2404 }
2405
2406 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2407 {
2408         struct tcp_iter_state *st = seq->private;
2409         void *rc;
2410
2411         st->bucket = 0;
2412         st->offset = 0;
2413         rc = listening_get_first(seq);
2414
2415         while (rc && *pos) {
2416                 rc = listening_get_next(seq, rc);
2417                 --*pos;
2418         }
2419         return rc;
2420 }
2421
2422 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2423                                 const struct tcp_iter_state *st)
2424 {
2425         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2426 }
2427
2428 /*
2429  * Get first established socket starting from bucket given in st->bucket.
2430  * If st->bucket is zero, the very first socket in the hash is returned.
2431  */
2432 static void *established_get_first(struct seq_file *seq)
2433 {
2434         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2435         struct tcp_iter_state *st = seq->private;
2436
2437         st->offset = 0;
2438         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2439                 struct sock *sk;
2440                 struct hlist_nulls_node *node;
2441                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2442
2443                 /* Lockless fast path for the common case of empty buckets */
2444                 if (empty_bucket(hinfo, st))
2445                         continue;
2446
2447                 spin_lock_bh(lock);
2448                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2449                         if (seq_sk_match(seq, sk))
2450                                 return sk;
2451                 }
2452                 spin_unlock_bh(lock);
2453         }
2454
2455         return NULL;
2456 }
2457
2458 static void *established_get_next(struct seq_file *seq, void *cur)
2459 {
2460         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2461         struct tcp_iter_state *st = seq->private;
2462         struct hlist_nulls_node *node;
2463         struct sock *sk = cur;
2464
2465         ++st->num;
2466         ++st->offset;
2467
2468         sk = sk_nulls_next(sk);
2469
2470         sk_nulls_for_each_from(sk, node) {
2471                 if (seq_sk_match(seq, sk))
2472                         return sk;
2473         }
2474
2475         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2476         ++st->bucket;
2477         return established_get_first(seq);
2478 }
2479
2480 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2481 {
2482         struct tcp_iter_state *st = seq->private;
2483         void *rc;
2484
2485         st->bucket = 0;
2486         rc = established_get_first(seq);
2487
2488         while (rc && pos) {
2489                 rc = established_get_next(seq, rc);
2490                 --pos;
2491         }
2492         return rc;
2493 }
2494
2495 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2496 {
2497         void *rc;
2498         struct tcp_iter_state *st = seq->private;
2499
2500         st->state = TCP_SEQ_STATE_LISTENING;
2501         rc        = listening_get_idx(seq, &pos);
2502
2503         if (!rc) {
2504                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2505                 rc        = established_get_idx(seq, pos);
2506         }
2507
2508         return rc;
2509 }
2510
2511 static void *tcp_seek_last_pos(struct seq_file *seq)
2512 {
2513         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2514         struct tcp_iter_state *st = seq->private;
2515         int bucket = st->bucket;
2516         int offset = st->offset;
2517         int orig_num = st->num;
2518         void *rc = NULL;
2519
2520         switch (st->state) {
2521         case TCP_SEQ_STATE_LISTENING:
2522                 if (st->bucket > hinfo->lhash2_mask)
2523                         break;
2524                 rc = listening_get_first(seq);
2525                 while (offset-- && rc && bucket == st->bucket)
2526                         rc = listening_get_next(seq, rc);
2527                 if (rc)
2528                         break;
2529                 st->bucket = 0;
2530                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2531                 fallthrough;
2532         case TCP_SEQ_STATE_ESTABLISHED:
2533                 if (st->bucket > hinfo->ehash_mask)
2534                         break;
2535                 rc = established_get_first(seq);
2536                 while (offset-- && rc && bucket == st->bucket)
2537                         rc = established_get_next(seq, rc);
2538         }
2539
2540         st->num = orig_num;
2541
2542         return rc;
2543 }
2544
2545 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2546 {
2547         struct tcp_iter_state *st = seq->private;
2548         void *rc;
2549
2550         if (*pos && *pos == st->last_pos) {
2551                 rc = tcp_seek_last_pos(seq);
2552                 if (rc)
2553                         goto out;
2554         }
2555
2556         st->state = TCP_SEQ_STATE_LISTENING;
2557         st->num = 0;
2558         st->bucket = 0;
2559         st->offset = 0;
2560         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2561
2562 out:
2563         st->last_pos = *pos;
2564         return rc;
2565 }
2566 EXPORT_SYMBOL(tcp_seq_start);
2567
2568 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2569 {
2570         struct tcp_iter_state *st = seq->private;
2571         void *rc = NULL;
2572
2573         if (v == SEQ_START_TOKEN) {
2574                 rc = tcp_get_idx(seq, 0);
2575                 goto out;
2576         }
2577
2578         switch (st->state) {
2579         case TCP_SEQ_STATE_LISTENING:
2580                 rc = listening_get_next(seq, v);
2581                 if (!rc) {
2582                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2583                         st->bucket = 0;
2584                         st->offset = 0;
2585                         rc        = established_get_first(seq);
2586                 }
2587                 break;
2588         case TCP_SEQ_STATE_ESTABLISHED:
2589                 rc = established_get_next(seq, v);
2590                 break;
2591         }
2592 out:
2593         ++*pos;
2594         st->last_pos = *pos;
2595         return rc;
2596 }
2597 EXPORT_SYMBOL(tcp_seq_next);
2598
2599 void tcp_seq_stop(struct seq_file *seq, void *v)
2600 {
2601         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2602         struct tcp_iter_state *st = seq->private;
2603
2604         switch (st->state) {
2605         case TCP_SEQ_STATE_LISTENING:
2606                 if (v != SEQ_START_TOKEN)
2607                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2608                 break;
2609         case TCP_SEQ_STATE_ESTABLISHED:
2610                 if (v)
2611                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2612                 break;
2613         }
2614 }
2615 EXPORT_SYMBOL(tcp_seq_stop);
2616
2617 static void get_openreq4(const struct request_sock *req,
2618                          struct seq_file *f, int i)
2619 {
2620         const struct inet_request_sock *ireq = inet_rsk(req);
2621         long delta = req->rsk_timer.expires - jiffies;
2622
2623         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2624                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2625                 i,
2626                 ireq->ir_loc_addr,
2627                 ireq->ir_num,
2628                 ireq->ir_rmt_addr,
2629                 ntohs(ireq->ir_rmt_port),
2630                 TCP_SYN_RECV,
2631                 0, 0, /* could print option size, but that is af dependent. */
2632                 1,    /* timers active (only the expire timer) */
2633                 jiffies_delta_to_clock_t(delta),
2634                 req->num_timeout,
2635                 from_kuid_munged(seq_user_ns(f),
2636                                  sock_i_uid(req->rsk_listener)),
2637                 0,  /* non standard timer */
2638                 0, /* open_requests have no inode */
2639                 0,
2640                 req);
2641 }
2642
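/* Format one full (non-TIME_WAIT, non-request) socket as a /proc/net/tcp line. */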
2643 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2644 {
2645         int timer_active;
2646         unsigned long timer_expires;
2647         const struct tcp_sock *tp = tcp_sk(sk);
2648         const struct inet_connection_sock *icsk = inet_csk(sk);
2649         const struct inet_sock *inet = inet_sk(sk);
2650         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2651         __be32 dest = inet->inet_daddr;
2652         __be32 src = inet->inet_rcv_saddr;
2653         __u16 destp = ntohs(inet->inet_dport);
2654         __u16 srcp = ntohs(inet->inet_sport);
2655         int rx_queue;
2656         int state;
2657
2658         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2659             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2660             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2661                 timer_active    = 1;
2662                 timer_expires   = icsk->icsk_timeout;
2663         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2664                 timer_active    = 4;
2665                 timer_expires   = icsk->icsk_timeout;
2666         } else if (timer_pending(&sk->sk_timer)) {
2667                 timer_active    = 2;
2668                 timer_expires   = sk->sk_timer.expires;
2669         } else {
2670                 timer_active    = 0;
2671                 timer_expires = jiffies;
2672         }
2673
2674         state = inet_sk_state_load(sk);
2675         if (state == TCP_LISTEN)
2676                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2677         else
2678                 /* Because we don't lock the socket,
2679                  * we might find a transient negative value.
2680                  */
2681                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2682                                       READ_ONCE(tp->copied_seq), 0);
2683
2684         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2685                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2686                 i, src, srcp, dest, destp, state,
2687                 READ_ONCE(tp->write_seq) - tp->snd_una,
2688                 rx_queue,
2689                 timer_active,
2690                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2691                 icsk->icsk_retransmits,
2692                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2693                 icsk->icsk_probes_out,
2694                 sock_i_ino(sk),
2695                 refcount_read(&sk->sk_refcnt), sk,
2696                 jiffies_to_clock_t(icsk->icsk_rto),
2697                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2698                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2699                 tcp_snd_cwnd(tp),
2700                 state == TCP_LISTEN ?
2701                     fastopenq->max_qlen :
2702                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2703 }
2704
2705 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2706                                struct seq_file *f, int i)
2707 {
2708         long delta = tw->tw_timer.expires - jiffies;
2709         __be32 dest, src;
2710         __u16 destp, srcp;
2711
2712         dest  = tw->tw_daddr;
2713         src   = tw->tw_rcv_saddr;
2714         destp = ntohs(tw->tw_dport);
2715         srcp  = ntohs(tw->tw_sport);
2716
2717         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2718                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2719                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2720                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2721                 refcount_read(&tw->tw_refcnt), tw);
2722 }
2723
2724 #define TMPSZ 150
2725
2726 static int tcp4_seq_show(struct seq_file *seq, void *v)
2727 {
2728         struct tcp_iter_state *st;
2729         struct sock *sk = v;
2730
2731         seq_setwidth(seq, TMPSZ - 1);
2732         if (v == SEQ_START_TOKEN) {
2733                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2734                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2735                            "inode");
2736                 goto out;
2737         }
2738         st = seq->private;
2739
2740         if (sk->sk_state == TCP_TIME_WAIT)
2741                 get_timewait4_sock(v, seq, st->num);
2742         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2743                 get_openreq4(v, seq, st->num);
2744         else
2745                 get_tcp4_sock(v, seq, st->num);
2746 out:
2747         seq_pad(seq, '\n');
2748         return 0;
2749 }
2750
2751 #ifdef CONFIG_BPF_SYSCALL
2752 struct bpf_tcp_iter_state {
2753         struct tcp_iter_state state;
2754         unsigned int cur_sk;
2755         unsigned int end_sk;
2756         unsigned int max_sk;
2757         struct sock **batch;
2758         bool st_bucket_done;
2759 };
2760
2761 struct bpf_iter__tcp {
2762         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2763         __bpf_md_ptr(struct sock_common *, sk_common);
2764         uid_t uid __aligned(8);
2765 };
2766
2767 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2768                              struct sock_common *sk_common, uid_t uid)
2769 {
2770         struct bpf_iter__tcp ctx;
2771
2772         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2773         ctx.meta = meta;
2774         ctx.sk_common = sk_common;
2775         ctx.uid = uid;
2776         return bpf_iter_run_prog(prog, &ctx);
2777 }
2778
2779 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2780 {
2781         while (iter->cur_sk < iter->end_sk)
2782                 sock_put(iter->batch[iter->cur_sk++]);
2783 }
2784
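/* Grow the batch array to @new_batch_sz entries, dropping any socket
 * references still held in the old batch first.
 */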
2785 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2786                                       unsigned int new_batch_sz)
2787 {
2788         struct sock **new_batch;
2789
2790         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2791                              GFP_USER | __GFP_NOWARN);
2792         if (!new_batch)
2793                 return -ENOMEM;
2794
2795         bpf_iter_tcp_put_batch(iter);
2796         kvfree(iter->batch);
2797         iter->batch = new_batch;
2798         iter->max_sk = new_batch_sz;
2799
2800         return 0;
2801 }
2802
2803 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2804                                                  struct sock *start_sk)
2805 {
2806         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2807         struct bpf_tcp_iter_state *iter = seq->private;
2808         struct tcp_iter_state *st = &iter->state;
2809         struct hlist_nulls_node *node;
2810         unsigned int expected = 1;
2811         struct sock *sk;
2812
2813         sock_hold(start_sk);
2814         iter->batch[iter->end_sk++] = start_sk;
2815
2816         sk = sk_nulls_next(start_sk);
2817         sk_nulls_for_each_from(sk, node) {
2818                 if (seq_sk_match(seq, sk)) {
2819                         if (iter->end_sk < iter->max_sk) {
2820                                 sock_hold(sk);
2821                                 iter->batch[iter->end_sk++] = sk;
2822                         }
2823                         expected++;
2824                 }
2825         }
2826         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2827
2828         return expected;
2829 }
2830
2831 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2832                                                    struct sock *start_sk)
2833 {
2834         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2835         struct bpf_tcp_iter_state *iter = seq->private;
2836         struct tcp_iter_state *st = &iter->state;
2837         struct hlist_nulls_node *node;
2838         unsigned int expected = 1;
2839         struct sock *sk;
2840
2841         sock_hold(start_sk);
2842         iter->batch[iter->end_sk++] = start_sk;
2843
2844         sk = sk_nulls_next(start_sk);
2845         sk_nulls_for_each_from(sk, node) {
2846                 if (seq_sk_match(seq, sk)) {
2847                         if (iter->end_sk < iter->max_sk) {
2848                                 sock_hold(sk);
2849                                 iter->batch[iter->end_sk++] = sk;
2850                         }
2851                         expected++;
2852                 }
2853         }
2854         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2855
2856         return expected;
2857 }
2858
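/* Grab the whole current bucket into iter->batch so that seq_show() can use
 * lock_sock_fast() on each socket without the bucket spinlock held.  If the
 * batch turns out to be too small for the bucket, it is grown once to 3/2 of
 * the number of sockets seen and the bucket is walked again.
 */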
2859 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2860 {
2861         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2862         struct bpf_tcp_iter_state *iter = seq->private;
2863         struct tcp_iter_state *st = &iter->state;
2864         unsigned int expected;
2865         bool resized = false;
2866         struct sock *sk;
2867
2868         /* The st->bucket is done.  Directly advance to the next
2869          * bucket instead of having tcp_seek_last_pos() skip sockets
2870          * one by one in the current bucket only to find out that it
2871          * has to advance to the next bucket.
2872          */
2873         if (iter->st_bucket_done) {
2874                 st->offset = 0;
2875                 st->bucket++;
2876                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2877                     st->bucket > hinfo->lhash2_mask) {
2878                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2879                         st->bucket = 0;
2880                 }
2881         }
2882
2883 again:
2884         /* Get a new batch */
2885         iter->cur_sk = 0;
2886         iter->end_sk = 0;
2887         iter->st_bucket_done = false;
2888
2889         sk = tcp_seek_last_pos(seq);
2890         if (!sk)
2891                 return NULL; /* Done */
2892
2893         if (st->state == TCP_SEQ_STATE_LISTENING)
2894                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2895         else
2896                 expected = bpf_iter_tcp_established_batch(seq, sk);
2897
2898         if (iter->end_sk == expected) {
2899                 iter->st_bucket_done = true;
2900                 return sk;
2901         }
2902
2903         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2904                 resized = true;
2905                 goto again;
2906         }
2907
2908         return sk;
2909 }
2910
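/* The first call of a read() returns SEQ_START_TOKEN; later calls refill
 * the batch, continuing from the bucket/offset recorded when the previous
 * read() was stop()-ped.
 */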
2911 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2912 {
2913         /* bpf iter does not support lseek, so it always
2914          * continues from where it was stop()-ped.
2915          */
2916         if (*pos)
2917                 return bpf_iter_tcp_batch(seq);
2918
2919         return SEQ_START_TOKEN;
2920 }
2921
2922 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2923 {
2924         struct bpf_tcp_iter_state *iter = seq->private;
2925         struct tcp_iter_state *st = &iter->state;
2926         struct sock *sk;
2927
2928         /* Whenever seq_next() is called, seq_show() has already
2929          * finished with iter->cur_sk, so advance to the next sk in
2930          * the batch.
2931          */
2932         if (iter->cur_sk < iter->end_sk) {
2933                 /* Keep st->num consistent in tcp_iter_state.
2934                  * bpf_iter_tcp does not use st->num;
2935                  * meta.seq_num is used instead.
2936                  */
2937                 st->num++;
2938                 /* Move st->offset to the next sk in the bucket such that
2939                  * the future start() will resume at st->offset in
2940                  * st->bucket.  See tcp_seek_last_pos().
2941                  */
2942                 st->offset++;
2943                 sock_put(iter->batch[iter->cur_sk++]);
2944         }
2945
2946         if (iter->cur_sk < iter->end_sk)
2947                 sk = iter->batch[iter->cur_sk];
2948         else
2949                 sk = bpf_iter_tcp_batch(seq);
2950
2951         ++*pos;
2952         /* Keep st->last_pos consistent in tcp_iter_state.
2953          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2954          */
2955         st->last_pos = *pos;
2956         return sk;
2957 }
2958
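/* Run the attached bpf prog against one socket from the batch.  A full
 * socket is locked with lock_sock_fast() first; request and TIME_WAIT
 * sockets are shown unlocked, with uid taken from the listener or forced
 * to 0 respectively.
 */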
2959 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2960 {
2961         struct bpf_iter_meta meta;
2962         struct bpf_prog *prog;
2963         struct sock *sk = v;
2964         bool slow;
2965         uid_t uid;
2966         int ret;
2967
2968         if (v == SEQ_START_TOKEN)
2969                 return 0;
2970
2971         if (sk_fullsock(sk))
2972                 slow = lock_sock_fast(sk);
2973
2974         if (unlikely(sk_unhashed(sk))) {
2975                 ret = SEQ_SKIP;
2976                 goto unlock;
2977         }
2978
2979         if (sk->sk_state == TCP_TIME_WAIT) {
2980                 uid = 0;
2981         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2982                 const struct request_sock *req = v;
2983
2984                 uid = from_kuid_munged(seq_user_ns(seq),
2985                                        sock_i_uid(req->rsk_listener));
2986         } else {
2987                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2988         }
2989
2990         meta.seq = seq;
2991         prog = bpf_iter_get_info(&meta, false);
2992         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2993
2994 unlock:
2995         if (sk_fullsock(sk))
2996                 unlock_sock_fast(sk, slow);
2997         return ret;
2998
2999 }
3000
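/* Give the bpf prog one final call with a NULL sk_common once the whole
 * iteration is done, then drop any sockets still left in the batch if
 * stop() happened mid-bucket.
 */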
3001 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3002 {
3003         struct bpf_tcp_iter_state *iter = seq->private;
3004         struct bpf_iter_meta meta;
3005         struct bpf_prog *prog;
3006
3007         if (!v) {
3008                 meta.seq = seq;
3009                 prog = bpf_iter_get_info(&meta, true);
3010                 if (prog)
3011                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3012         }
3013
3014         if (iter->cur_sk < iter->end_sk) {
3015                 bpf_iter_tcp_put_batch(iter);
3016                 iter->st_bucket_done = false;
3017         }
3018 }
3019
3020 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3021         .show           = bpf_iter_tcp_seq_show,
3022         .start          = bpf_iter_tcp_seq_start,
3023         .next           = bpf_iter_tcp_seq_next,
3024         .stop           = bpf_iter_tcp_seq_stop,
3025 };
3026 #endif
3027 static unsigned short seq_file_family(const struct seq_file *seq)
3028 {
3029         const struct tcp_seq_afinfo *afinfo;
3030
3031 #ifdef CONFIG_BPF_SYSCALL
3032         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3033         if (seq->op == &bpf_iter_tcp_seq_ops)
3034                 return AF_UNSPEC;
3035 #endif
3036
3037         /* Iterated from proc fs */
3038         afinfo = pde_data(file_inode(seq->file));
3039         return afinfo->family;
3040 }
3041
3042 static const struct seq_operations tcp4_seq_ops = {
3043         .show           = tcp4_seq_show,
3044         .start          = tcp_seq_start,
3045         .next           = tcp_seq_next,
3046         .stop           = tcp_seq_stop,
3047 };
3048
3049 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3050         .family         = AF_INET,
3051 };
3052
3053 static int __net_init tcp4_proc_init_net(struct net *net)
3054 {
3055         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3056                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3057                 return -ENOMEM;
3058         return 0;
3059 }
3060
3061 static void __net_exit tcp4_proc_exit_net(struct net *net)
3062 {
3063         remove_proc_entry("tcp", net->proc_net);
3064 }
3065
3066 static struct pernet_operations tcp4_net_ops = {
3067         .init = tcp4_proc_init_net,
3068         .exit = tcp4_proc_exit_net,
3069 };
3070
3071 int __init tcp4_proc_init(void)
3072 {
3073         return register_pernet_subsys(&tcp4_net_ops);
3074 }
3075
3076 void tcp4_proc_exit(void)
3077 {
3078         unregister_pernet_subsys(&tcp4_net_ops);
3079 }
3080 #endif /* CONFIG_PROC_FS */
3081
3082 /* @wake is one when sk_stream_write_space() calls us.
3083  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3084  * This mimics the strategy used in sock_def_write_space().
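 *
 * For example, assuming tcp_notsent_lowat(tp) returns 131072 (128 KB, a
 * value picked purely for illustration) and @wake == 1, EPOLLOUT is
 * reported only while write_seq - snd_nxt stays below 65536 bytes.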
3085  */
3086 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3087 {
3088         const struct tcp_sock *tp = tcp_sk(sk);
3089         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3090                             READ_ONCE(tp->snd_nxt);
3091
3092         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3093 }
3094 EXPORT_SYMBOL(tcp_stream_memory_free);
3095
3096 struct proto tcp_prot = {
3097         .name                   = "TCP",
3098         .owner                  = THIS_MODULE,
3099         .close                  = tcp_close,
3100         .pre_connect            = tcp_v4_pre_connect,
3101         .connect                = tcp_v4_connect,
3102         .disconnect             = tcp_disconnect,
3103         .accept                 = inet_csk_accept,
3104         .ioctl                  = tcp_ioctl,
3105         .init                   = tcp_v4_init_sock,
3106         .destroy                = tcp_v4_destroy_sock,
3107         .shutdown               = tcp_shutdown,
3108         .setsockopt             = tcp_setsockopt,
3109         .getsockopt             = tcp_getsockopt,
3110         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3111         .keepalive              = tcp_set_keepalive,
3112         .recvmsg                = tcp_recvmsg,
3113         .sendmsg                = tcp_sendmsg,
3114         .sendpage               = tcp_sendpage,
3115         .backlog_rcv            = tcp_v4_do_rcv,
3116         .release_cb             = tcp_release_cb,
3117         .hash                   = inet_hash,
3118         .unhash                 = inet_unhash,
3119         .get_port               = inet_csk_get_port,
3120         .put_port               = inet_put_port,
3121 #ifdef CONFIG_BPF_SYSCALL
3122         .psock_update_sk_prot   = tcp_bpf_update_proto,
3123 #endif
3124         .enter_memory_pressure  = tcp_enter_memory_pressure,
3125         .leave_memory_pressure  = tcp_leave_memory_pressure,
3126         .stream_memory_free     = tcp_stream_memory_free,
3127         .sockets_allocated      = &tcp_sockets_allocated,
3128         .orphan_count           = &tcp_orphan_count,
3129
3130         .memory_allocated       = &tcp_memory_allocated,
3131         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3132
3133         .memory_pressure        = &tcp_memory_pressure,
3134         .sysctl_mem             = sysctl_tcp_mem,
3135         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3136         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3137         .max_header             = MAX_TCP_HEADER,
3138         .obj_size               = sizeof(struct tcp_sock),
3139         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3140         .twsk_prot              = &tcp_timewait_sock_ops,
3141         .rsk_prot               = &tcp_request_sock_ops,
3142         .h.hashinfo             = NULL,
3143         .no_autobind            = true,
3144         .diag_destroy           = tcp_abort,
3145 };
3146 EXPORT_SYMBOL(tcp_prot);
3147
3148 static void __net_exit tcp_sk_exit(struct net *net)
3149 {
3150         if (net->ipv4.tcp_congestion_control)
3151                 bpf_module_put(net->ipv4.tcp_congestion_control,
3152                                net->ipv4.tcp_congestion_control->owner);
3153 }
3154
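/* A child netns may get its own ehash, sized by the creating netns'
 * net.ipv4.tcp_child_ehash_entries sysctl rounded up to a power of two.
 * Otherwise, or if that allocation fails, the netns shares the global
 * tcp_hashinfo.  max_tw_buckets and max_syn_backlog are scaled to the
 * resulting ehash size.
 */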
3155 static void __net_init tcp_set_hashinfo(struct net *net)
3156 {
3157         struct inet_hashinfo *hinfo;
3158         unsigned int ehash_entries;
3159         struct net *old_net;
3160
3161         if (net_eq(net, &init_net))
3162                 goto fallback;
3163
3164         old_net = current->nsproxy->net_ns;
3165         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3166         if (!ehash_entries)
3167                 goto fallback;
3168
3169         ehash_entries = roundup_pow_of_two(ehash_entries);
3170         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3171         if (!hinfo) {
3172                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3173                         "for a netns, falling back to the global one\n",
3174                         ehash_entries);
3175 fallback:
3176                 hinfo = &tcp_hashinfo;
3177                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3178         }
3179
3180         net->ipv4.tcp_death_row.hashinfo = hinfo;
3181         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3182         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3183 }
3184
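/* Per-netns TCP initialisation: sysctl defaults, the death-row refcount,
 * the hash tables and the initial congestion control module.
 */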
3185 static int __net_init tcp_sk_init(struct net *net)
3186 {
3187         net->ipv4.sysctl_tcp_ecn = 2;
3188         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3189
3190         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3191         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3192         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3193         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3194         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3195
3196         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3197         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3198         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3199
3200         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3201         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3202         net->ipv4.sysctl_tcp_syncookies = 1;
3203         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3204         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3205         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3206         net->ipv4.sysctl_tcp_orphan_retries = 0;
3207         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3208         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3209         net->ipv4.sysctl_tcp_tw_reuse = 2;
3210         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3211
3212         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3213         tcp_set_hashinfo(net);
3214
3215         net->ipv4.sysctl_tcp_sack = 1;
3216         net->ipv4.sysctl_tcp_window_scaling = 1;
3217         net->ipv4.sysctl_tcp_timestamps = 1;
3218         net->ipv4.sysctl_tcp_early_retrans = 3;
3219         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3220         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3221         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3222         net->ipv4.sysctl_tcp_max_reordering = 300;
3223         net->ipv4.sysctl_tcp_dsack = 1;
3224         net->ipv4.sysctl_tcp_app_win = 31;
3225         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3226         net->ipv4.sysctl_tcp_frto = 2;
3227         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3228         /* This limits the percentage of the congestion window which we
3229          * will allow a single TSO frame to consume.  Building TSO frames
3230          * which are too large can cause TCP streams to be bursty.
3231          */
3232         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3233         /* Default TSQ limit of 16 TSO segments */
3234         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3235
3236         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3237         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3238
3239         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3240         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3241         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3242         net->ipv4.sysctl_tcp_autocorking = 1;
3243         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3244         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3245         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3246         if (net != &init_net) {
3247                 memcpy(net->ipv4.sysctl_tcp_rmem,
3248                        init_net.ipv4.sysctl_tcp_rmem,
3249                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3250                 memcpy(net->ipv4.sysctl_tcp_wmem,
3251                        init_net.ipv4.sysctl_tcp_wmem,
3252                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3253         }
3254         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3255         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3256         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3257         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3258         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3259         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3260
3261         /* Set default values for PLB */
3262         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3263         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3264         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3265         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3266         /* Default congestion threshold for PLB to mark a round is 50% */
3267         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3268
3269         /* Reno is always built in */
3270         if (!net_eq(net, &init_net) &&
3271             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3272                                init_net.ipv4.tcp_congestion_control->owner))
3273                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3274         else
3275                 net->ipv4.tcp_congestion_control = &tcp_reno;
3276
3277         return 0;
3278 }
3279
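/* Batched netns teardown: purge the remaining TIME_WAIT sockets first, then
 * free any per-netns ehash and the TCP fastopen context.
 */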
3280 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3281 {
3282         struct net *net;
3283
3284         tcp_twsk_purge(net_exit_list, AF_INET);
3285
3286         list_for_each_entry(net, net_exit_list, exit_list) {
3287                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3288                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3289                 tcp_fastopen_ctx_destroy(net);
3290         }
3291 }
3292
3293 static struct pernet_operations __net_initdata tcp_sk_ops = {
3294        .init       = tcp_sk_init,
3295        .exit       = tcp_sk_exit,
3296        .exit_batch = tcp_sk_exit_batch,
3297 };
3298
3299 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3300 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3301                      struct sock_common *sk_common, uid_t uid)
3302
3303 #define INIT_BATCH_SZ 16
3304
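/* Seq-file private-data init for the bpf tcp iterator: set up the per-netns
 * seq state and allocate an initial batch of INIT_BATCH_SZ socket pointers.
 * bpf_iter_fini_tcp() undoes both.
 */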
3305 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3306 {
3307         struct bpf_tcp_iter_state *iter = priv_data;
3308         int err;
3309
3310         err = bpf_iter_init_seq_net(priv_data, aux);
3311         if (err)
3312                 return err;
3313
3314         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3315         if (err) {
3316                 bpf_iter_fini_seq_net(priv_data);
3317                 return err;
3318         }
3319
3320         return 0;
3321 }
3322
3323 static void bpf_iter_fini_tcp(void *priv_data)
3324 {
3325         struct bpf_tcp_iter_state *iter = priv_data;
3326
3327         bpf_iter_fini_seq_net(priv_data);
3328         kvfree(iter->batch);
3329 }
3330
3331 static const struct bpf_iter_seq_info tcp_seq_info = {
3332         .seq_ops                = &bpf_iter_tcp_seq_ops,
3333         .init_seq_private       = bpf_iter_init_tcp,
3334         .fini_seq_private       = bpf_iter_fini_tcp,
3335         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3336 };
3337
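/* In addition to the helpers normally available to iterator programs, tcp
 * iterator programs may call bpf_setsockopt() and bpf_getsockopt() on the
 * sockets they visit.
 */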
3338 static const struct bpf_func_proto *
3339 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3340                             const struct bpf_prog *prog)
3341 {
3342         switch (func_id) {
3343         case BPF_FUNC_setsockopt:
3344                 return &bpf_sk_setsockopt_proto;
3345         case BPF_FUNC_getsockopt:
3346                 return &bpf_sk_getsockopt_proto;
3347         default:
3348                 return NULL;
3349         }
3350 }
3351
3352 static struct bpf_iter_reg tcp_reg_info = {
3353         .target                 = "tcp",
3354         .ctx_arg_info_size      = 1,
3355         .ctx_arg_info           = {
3356                 { offsetof(struct bpf_iter__tcp, sk_common),
3357                   PTR_TO_BTF_ID_OR_NULL },
3358         },
3359         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3360         .seq_info               = &tcp_seq_info,
3361 };
3362
3363 static void __init bpf_iter_register(void)
3364 {
3365         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3366         if (bpf_iter_reg_target(&tcp_reg_info))
3367                 pr_warn("Warning: could not register bpf iterator tcp\n");
3368 }
3369
3370 #endif
3371
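/* Boot-time initialisation: create one TCP control socket per possible CPU
 * (used for RSTs and ACKs that are not sent on behalf of a full socket) and
 * register the per-netns operations above.  Failure here is fatal.
 */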
3372 void __init tcp_v4_init(void)
3373 {
3374         int cpu, res;
3375
3376         for_each_possible_cpu(cpu) {
3377                 struct sock *sk;
3378
3379                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3380                                            IPPROTO_TCP, &init_net);
3381                 if (res)
3382                         panic("Failed to create the TCP control socket.\n");
3383                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3384
3385                 /* Enforce IP_DF and IPID==0 for the RSTs and ACKs
3386                  * sent in SYN-RECV and TIME-WAIT state.
3387                  */
3388                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3389
3390                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3391         }
3392         if (register_pernet_subsys(&tcp_sk_ops))
3393                 panic("Failed to create the TCP control socket.\n");
3394
3395 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3396         bpf_iter_register();
3397 #endif
3398 }