]> Git Repo - linux.git/blob - net/mptcp/subflow.c
vc_screen: move load of struct vc_data pointer in vcs_read() to avoid UAF
[linux.git] / net / mptcp / subflow.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2017 - 2019, Intel Corporation.
5  */
6
7 #define pr_fmt(fmt) "MPTCP: " fmt
8
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <linux/netdevice.h>
12 #include <crypto/algapi.h>
13 #include <crypto/sha2.h>
14 #include <net/sock.h>
15 #include <net/inet_common.h>
16 #include <net/inet_hashtables.h>
17 #include <net/protocol.h>
18 #include <net/tcp.h>
19 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
20 #include <net/ip6_route.h>
21 #include <net/transp_v6.h>
22 #endif
23 #include <net/mptcp.h>
24 #include <uapi/linux/mptcp.h>
25 #include "protocol.h"
26 #include "mib.h"
27
28 #include <trace/events/mptcp.h>
29
/* Forward declaration: needed because subflow_ulp_fallback() calls this
 * before its definition is visible.
 */
static void mptcp_subflow_ops_undo_override(struct sock *ssk);
31
32 static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
33                                   enum linux_mptcp_mib_field field)
34 {
35         MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
36 }
37
38 static void subflow_req_destructor(struct request_sock *req)
39 {
40         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
41
42         pr_debug("subflow_req=%p", subflow_req);
43
44         if (subflow_req->msk)
45                 sock_put((struct sock *)subflow_req->msk);
46
47         mptcp_token_destroy_request(req);
48 }
49
50 static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
51                                   void *hmac)
52 {
53         u8 msg[8];
54
55         put_unaligned_be32(nonce1, &msg[0]);
56         put_unaligned_be32(nonce2, &msg[4]);
57
58         mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
59 }
60
61 static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
62 {
63         return mptcp_is_fully_established((void *)msk) &&
64                 ((mptcp_pm_is_userspace(msk) &&
65                   mptcp_userspace_pm_active(msk)) ||
66                  READ_ONCE(msk->pm.accept_subflow));
67 }
68
69 /* validate received token and create truncated hmac and nonce for SYN-ACK */
70 static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
71 {
72         struct mptcp_sock *msk = subflow_req->msk;
73         u8 hmac[SHA256_DIGEST_SIZE];
74
75         get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
76
77         subflow_generate_hmac(msk->local_key, msk->remote_key,
78                               subflow_req->local_nonce,
79                               subflow_req->remote_nonce, hmac);
80
81         subflow_req->thmac = get_unaligned_be64(hmac);
82 }
83
84 static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
85 {
86         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
87         struct mptcp_sock *msk;
88         int local_id;
89
90         msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token);
91         if (!msk) {
92                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
93                 return NULL;
94         }
95
96         local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
97         if (local_id < 0) {
98                 sock_put((struct sock *)msk);
99                 return NULL;
100         }
101         subflow_req->local_id = local_id;
102
103         return msk;
104 }
105
106 static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
107 {
108         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
109
110         subflow_req->mp_capable = 0;
111         subflow_req->mp_join = 0;
112         subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener));
113         subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener));
114         subflow_req->msk = NULL;
115         mptcp_token_init_request(req);
116 }
117
118 static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
119 {
120         return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
121 }
122
123 static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
124 {
125         struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
126
127         if (mpext) {
128                 memset(mpext, 0, sizeof(*mpext));
129                 mpext->reset_reason = reason;
130         }
131 }
132
133 /* Init mptcp request socket.
134  *
135  * Returns an error code if a JOIN has failed and a TCP reset
136  * should be sent.
137  */
138 static int subflow_check_req(struct request_sock *req,
139                              const struct sock *sk_listener,
140                              struct sk_buff *skb)
141 {
142         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
143         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
144         struct mptcp_options_received mp_opt;
145         bool opt_mp_capable, opt_mp_join;
146
147         pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
148
149 #ifdef CONFIG_TCP_MD5SIG
150         /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
151          * TCP option space.
152          */
153         if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
154                 return -EINVAL;
155 #endif
156
157         mptcp_get_options(skb, &mp_opt);
158
159         opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
160         opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
161         if (opt_mp_capable) {
162                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
163
164                 if (opt_mp_join)
165                         return 0;
166         } else if (opt_mp_join) {
167                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
168         }
169
170         if (opt_mp_capable && listener->request_mptcp) {
171                 int err, retries = MPTCP_TOKEN_MAX_RETRIES;
172
173                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
174 again:
175                 do {
176                         get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
177                 } while (subflow_req->local_key == 0);
178
179                 if (unlikely(req->syncookie)) {
180                         mptcp_crypto_key_sha(subflow_req->local_key,
181                                              &subflow_req->token,
182                                              &subflow_req->idsn);
183                         if (mptcp_token_exists(subflow_req->token)) {
184                                 if (retries-- > 0)
185                                         goto again;
186                                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
187                         } else {
188                                 subflow_req->mp_capable = 1;
189                         }
190                         return 0;
191                 }
192
193                 err = mptcp_token_new_request(req);
194                 if (err == 0)
195                         subflow_req->mp_capable = 1;
196                 else if (retries-- > 0)
197                         goto again;
198                 else
199                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
200
201         } else if (opt_mp_join && listener->request_mptcp) {
202                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
203                 subflow_req->mp_join = 1;
204                 subflow_req->backup = mp_opt.backup;
205                 subflow_req->remote_id = mp_opt.join_id;
206                 subflow_req->token = mp_opt.token;
207                 subflow_req->remote_nonce = mp_opt.nonce;
208                 subflow_req->msk = subflow_token_join_request(req);
209
210                 /* Can't fall back to TCP in this case. */
211                 if (!subflow_req->msk) {
212                         subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
213                         return -EPERM;
214                 }
215
216                 if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
217                         pr_debug("syn inet_sport=%d %d",
218                                  ntohs(inet_sk(sk_listener)->inet_sport),
219                                  ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
220                         if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
221                                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
222                                 return -EPERM;
223                         }
224                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
225                 }
226
227                 subflow_req_create_thmac(subflow_req);
228
229                 if (unlikely(req->syncookie)) {
230                         if (mptcp_can_accept_new_subflow(subflow_req->msk))
231                                 subflow_init_req_cookie_join_save(subflow_req, skb);
232                         else
233                                 return -EPERM;
234                 }
235
236                 pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
237                          subflow_req->remote_nonce, subflow_req->msk);
238         }
239
240         return 0;
241 }
242
243 int mptcp_subflow_init_cookie_req(struct request_sock *req,
244                                   const struct sock *sk_listener,
245                                   struct sk_buff *skb)
246 {
247         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
248         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
249         struct mptcp_options_received mp_opt;
250         bool opt_mp_capable, opt_mp_join;
251         int err;
252
253         subflow_init_req(req, sk_listener);
254         mptcp_get_options(skb, &mp_opt);
255
256         opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
257         opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
258         if (opt_mp_capable && opt_mp_join)
259                 return -EINVAL;
260
261         if (opt_mp_capable && listener->request_mptcp) {
262                 if (mp_opt.sndr_key == 0)
263                         return -EINVAL;
264
265                 subflow_req->local_key = mp_opt.rcvr_key;
266                 err = mptcp_token_new_request(req);
267                 if (err)
268                         return err;
269
270                 subflow_req->mp_capable = 1;
271                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
272         } else if (opt_mp_join && listener->request_mptcp) {
273                 if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
274                         return -EINVAL;
275
276                 subflow_req->mp_join = 1;
277                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
278         }
279
280         return 0;
281 }
282 EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
283
284 static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
285                                               struct sk_buff *skb,
286                                               struct flowi *fl,
287                                               struct request_sock *req)
288 {
289         struct dst_entry *dst;
290         int err;
291
292         tcp_rsk(req)->is_mptcp = 1;
293         subflow_init_req(req, sk);
294
295         dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
296         if (!dst)
297                 return NULL;
298
299         err = subflow_check_req(req, sk, skb);
300         if (err == 0)
301                 return dst;
302
303         dst_release(dst);
304         if (!req->syncookie)
305                 tcp_request_sock_ops.send_reset(sk, skb);
306         return NULL;
307 }
308
309 static void subflow_prep_synack(const struct sock *sk, struct request_sock *req,
310                                 struct tcp_fastopen_cookie *foc,
311                                 enum tcp_synack_type synack_type)
312 {
313         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
314         struct inet_request_sock *ireq = inet_rsk(req);
315
316         /* clear tstamp_ok, as needed depending on cookie */
317         if (foc && foc->len > -1)
318                 ireq->tstamp_ok = 0;
319
320         if (synack_type == TCP_SYNACK_FASTOPEN)
321                 mptcp_fastopen_subflow_synack_set_params(subflow, req);
322 }
323
324 static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
325                                   struct flowi *fl,
326                                   struct request_sock *req,
327                                   struct tcp_fastopen_cookie *foc,
328                                   enum tcp_synack_type synack_type,
329                                   struct sk_buff *syn_skb)
330 {
331         subflow_prep_synack(sk, req, foc, synack_type);
332
333         return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc,
334                                                      synack_type, syn_skb);
335 }
336
337 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
338 static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
339                                   struct flowi *fl,
340                                   struct request_sock *req,
341                                   struct tcp_fastopen_cookie *foc,
342                                   enum tcp_synack_type synack_type,
343                                   struct sk_buff *syn_skb)
344 {
345         subflow_prep_synack(sk, req, foc, synack_type);
346
347         return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc,
348                                                      synack_type, syn_skb);
349 }
350
351 static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
352                                               struct sk_buff *skb,
353                                               struct flowi *fl,
354                                               struct request_sock *req)
355 {
356         struct dst_entry *dst;
357         int err;
358
359         tcp_rsk(req)->is_mptcp = 1;
360         subflow_init_req(req, sk);
361
362         dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
363         if (!dst)
364                 return NULL;
365
366         err = subflow_check_req(req, sk, skb);
367         if (err == 0)
368                 return dst;
369
370         dst_release(dst);
371         if (!req->syncookie)
372                 tcp6_request_sock_ops.send_reset(sk, skb);
373         return NULL;
374 }
375 #endif
376
377 /* validate received truncated hmac and create hmac for third ACK */
378 static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
379 {
380         u8 hmac[SHA256_DIGEST_SIZE];
381         u64 thmac;
382
383         subflow_generate_hmac(subflow->remote_key, subflow->local_key,
384                               subflow->remote_nonce, subflow->local_nonce,
385                               hmac);
386
387         thmac = get_unaligned_be64(hmac);
388         pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
389                  subflow, subflow->token, thmac, subflow->thmac);
390
391         return thmac == subflow->thmac;
392 }
393
394 void mptcp_subflow_reset(struct sock *ssk)
395 {
396         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
397         struct sock *sk = subflow->conn;
398
399         /* must hold: tcp_done() could drop last reference on parent */
400         sock_hold(sk);
401
402         tcp_set_state(ssk, TCP_CLOSE);
403         tcp_send_active_reset(ssk, GFP_ATOMIC);
404         tcp_done(ssk);
405         if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
406             schedule_work(&mptcp_sk(sk)->work))
407                 return; /* worker will put sk for us */
408
409         sock_put(sk);
410 }
411
412 static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
413 {
414         return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
415 }
416
417 void __mptcp_set_connected(struct sock *sk)
418 {
419         if (sk->sk_state == TCP_SYN_SENT) {
420                 inet_sk_state_store(sk, TCP_ESTABLISHED);
421                 sk->sk_state_change(sk);
422         }
423 }
424
425 static void mptcp_set_connected(struct sock *sk)
426 {
427         mptcp_data_lock(sk);
428         if (!sock_owned_by_user(sk))
429                 __mptcp_set_connected(sk);
430         else
431                 __set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags);
432         mptcp_data_unlock(sk);
433 }
434
435 static void subflow_set_remote_key(struct mptcp_sock *msk,
436                                    struct mptcp_subflow_context *subflow,
437                                    const struct mptcp_options_received *mp_opt)
438 {
439         /* active MPC subflow will reach here multiple times:
440          * at subflow_finish_connect() time and at 4th ack time
441          */
442         if (subflow->remote_key_valid)
443                 return;
444
445         subflow->remote_key_valid = 1;
446         subflow->remote_key = mp_opt->sndr_key;
447         mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn);
448         subflow->iasn++;
449
450         WRITE_ONCE(msk->remote_key, subflow->remote_key);
451         WRITE_ONCE(msk->ack_seq, subflow->iasn);
452         WRITE_ONCE(msk->can_ack, true);
453         atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
454 }
455
456 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
457 {
458         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
459         struct mptcp_options_received mp_opt;
460         struct sock *parent = subflow->conn;
461         struct mptcp_sock *msk;
462
463         subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
464
465         /* be sure no special action on any packet other than syn-ack */
466         if (subflow->conn_finished)
467                 return;
468
469         msk = mptcp_sk(parent);
470         mptcp_propagate_sndbuf(parent, sk);
471         subflow->rel_write_seq = 1;
472         subflow->conn_finished = 1;
473         subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
474         pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
475
476         mptcp_get_options(skb, &mp_opt);
477         if (subflow->request_mptcp) {
478                 if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
479                         MPTCP_INC_STATS(sock_net(sk),
480                                         MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
481                         mptcp_do_fallback(sk);
482                         pr_fallback(msk);
483                         goto fallback;
484                 }
485
486                 if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
487                         WRITE_ONCE(msk->csum_enabled, true);
488                 if (mp_opt.deny_join_id0)
489                         WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
490                 subflow->mp_capable = 1;
491                 subflow_set_remote_key(msk, subflow, &mp_opt);
492                 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
493                 mptcp_finish_connect(sk);
494                 mptcp_set_connected(parent);
495         } else if (subflow->request_join) {
496                 u8 hmac[SHA256_DIGEST_SIZE];
497
498                 if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) {
499                         subflow->reset_reason = MPTCP_RST_EMPTCP;
500                         goto do_reset;
501                 }
502
503                 subflow->backup = mp_opt.backup;
504                 subflow->thmac = mp_opt.thmac;
505                 subflow->remote_nonce = mp_opt.nonce;
506                 subflow->remote_id = mp_opt.join_id;
507                 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
508                          subflow, subflow->thmac, subflow->remote_nonce,
509                          subflow->backup);
510
511                 if (!subflow_thmac_valid(subflow)) {
512                         MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
513                         subflow->reset_reason = MPTCP_RST_EMPTCP;
514                         goto do_reset;
515                 }
516
517                 if (!mptcp_finish_join(sk))
518                         goto do_reset;
519
520                 subflow_generate_hmac(subflow->local_key, subflow->remote_key,
521                                       subflow->local_nonce,
522                                       subflow->remote_nonce,
523                                       hmac);
524                 memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
525
526                 subflow->mp_join = 1;
527                 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
528
529                 if (subflow_use_different_dport(msk, sk)) {
530                         pr_debug("synack inet_dport=%d %d",
531                                  ntohs(inet_sk(sk)->inet_dport),
532                                  ntohs(inet_sk(parent)->inet_dport));
533                         MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
534                 }
535         } else if (mptcp_check_fallback(sk)) {
536 fallback:
537                 mptcp_rcv_space_init(msk, sk);
538                 mptcp_set_connected(parent);
539         }
540         return;
541
542 do_reset:
543         subflow->reset_transient = 0;
544         mptcp_subflow_reset(sk);
545 }
546
/* Store @local_id on @subflow and flag it as valid. */
static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
{
	subflow->local_id = local_id;
	subflow->local_id_valid = 1;
}
552
553 static int subflow_chk_local_id(struct sock *sk)
554 {
555         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
556         struct mptcp_sock *msk = mptcp_sk(subflow->conn);
557         int err;
558
559         if (likely(subflow->local_id_valid))
560                 return 0;
561
562         err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
563         if (err < 0)
564                 return err;
565
566         subflow_set_local_id(subflow, err);
567         return 0;
568 }
569
/* IPv4 rebuild_header hook: ensure the subflow has a local id before
 * delegating to inet_sk_rebuild_header(). Returns the local-id error, if
 * any, otherwise the result of the inet helper.
 */
static int subflow_rebuild_header(struct sock *sk)
{
	int ret = subflow_chk_local_id(sk);

	if (likely(ret >= 0))
		ret = inet_sk_rebuild_header(sk);

	return ret;
}
579
580 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 rebuild_header hook: ensure the subflow has a local id before
 * delegating to inet6_sk_rebuild_header(). Returns the local-id error, if
 * any, otherwise the result of the inet6 helper.
 */
static int subflow_v6_rebuild_header(struct sock *sk)
{
	int ret = subflow_chk_local_id(sk);

	if (likely(ret >= 0))
		ret = inet6_sk_rebuild_header(sk);

	return ret;
}
590 #endif
591
/* IPv4 request-sock op tables used for MPTCP subflow listeners: passed to
 * tcp_conn_request() and selected by mptcp_subflow_reqsk_alloc().
 * NOTE(review): not initialized here; presumably filled in by init code
 * elsewhere in this file, given __ro_after_init - confirm.
 */
static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;
594
595 static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
596 {
597         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
598
599         pr_debug("subflow=%p", subflow);
600
601         /* Never answer to SYNs sent to broadcast or multicast */
602         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
603                 goto drop;
604
605         return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops,
606                                 &subflow_request_sock_ipv4_ops,
607                                 sk, skb);
608 drop:
609         tcp_listendrop(sk);
610         return 0;
611 }
612
/* IPv4 request-sock destructor: release the MPTCP state first, then run
 * the plain TCP request-sock destructor.
 */
static void subflow_v4_req_destructor(struct request_sock *req)
{
	subflow_req_destructor(req);
	tcp_request_sock_ops.destructor(req);
}
618
619 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 counterparts of the subflow request-sock op tables, plus the
 * per-family af_ops and proto overrides.
 * NOTE(review): none are initialized here; presumably set up by init code
 * elsewhere in this file - confirm.
 */
static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
static struct proto tcpv6_prot_override;
625
626 static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
627 {
628         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
629
630         pr_debug("subflow=%p", subflow);
631
632         if (skb->protocol == htons(ETH_P_IP))
633                 return subflow_v4_conn_request(sk, skb);
634
635         if (!ipv6_unicast_destination(skb))
636                 goto drop;
637
638         if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
639                 __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
640                 return 0;
641         }
642
643         return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops,
644                                 &subflow_request_sock_ipv6_ops, sk, skb);
645
646 drop:
647         tcp_listendrop(sk);
648         return 0; /* don't send reset */
649 }
650
/* IPv6 request-sock destructor: release the MPTCP state first, then run
 * the plain TCPv6 request-sock destructor.
 */
static void subflow_v6_req_destructor(struct request_sock *req)
{
	subflow_req_destructor(req);
	tcp6_request_sock_ops.destructor(req);
}
656 #endif
657
658 struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops,
659                                                struct sock *sk_listener,
660                                                bool attach_listener)
661 {
662         if (ops->family == AF_INET)
663                 ops = &mptcp_subflow_v4_request_sock_ops;
664 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
665         else if (ops->family == AF_INET6)
666                 ops = &mptcp_subflow_v6_request_sock_ops;
667 #endif
668
669         return inet_reqsk_alloc(ops, sk_listener, attach_listener);
670 }
671 EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc);
672
673 /* validate hmac received in third ACK */
674 static bool subflow_hmac_valid(const struct request_sock *req,
675                                const struct mptcp_options_received *mp_opt)
676 {
677         const struct mptcp_subflow_request_sock *subflow_req;
678         u8 hmac[SHA256_DIGEST_SIZE];
679         struct mptcp_sock *msk;
680
681         subflow_req = mptcp_subflow_rsk(req);
682         msk = subflow_req->msk;
683         if (!msk)
684                 return false;
685
686         subflow_generate_hmac(msk->remote_key, msk->local_key,
687                               subflow_req->remote_nonce,
688                               subflow_req->local_nonce, hmac);
689
690         return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
691 }
692
/* Close @sk right away: store TCP_CLOSE as its state and release it via
 * sk_common_release().
 */
static void mptcp_force_close(struct sock *sk)
{
	/* the msk is not yet exposed to user-space */
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}
699
700 static void subflow_ulp_fallback(struct sock *sk,
701                                  struct mptcp_subflow_context *old_ctx)
702 {
703         struct inet_connection_sock *icsk = inet_csk(sk);
704
705         mptcp_subflow_tcp_fallback(sk, old_ctx);
706         icsk->icsk_ulp_ops = NULL;
707         rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
708         tcp_sk(sk)->is_mptcp = 0;
709
710         mptcp_subflow_ops_undo_override(sk);
711 }
712
713 static void subflow_drop_ctx(struct sock *ssk)
714 {
715         struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
716
717         if (!ctx)
718                 return;
719
720         subflow_ulp_fallback(ssk, ctx);
721         if (ctx->conn)
722                 sock_put(ctx->conn);
723
724         kfree_rcu(ctx, rcu);
725 }
726
727 void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
728                                      const struct mptcp_options_received *mp_opt)
729 {
730         struct mptcp_sock *msk = mptcp_sk(subflow->conn);
731
732         subflow_set_remote_key(msk, subflow, mp_opt);
733         subflow->fully_established = 1;
734         WRITE_ONCE(msk->fully_established, true);
735
736         if (subflow->is_mptfo)
737                 mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
738 }
739
740 static struct sock *subflow_syn_recv_sock(const struct sock *sk,
741                                           struct sk_buff *skb,
742                                           struct request_sock *req,
743                                           struct dst_entry *dst,
744                                           struct request_sock *req_unhash,
745                                           bool *own_req)
746 {
747         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
748         struct mptcp_subflow_request_sock *subflow_req;
749         struct mptcp_options_received mp_opt;
750         bool fallback, fallback_is_fatal;
751         struct sock *new_msk = NULL;
752         struct sock *child;
753
754         pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
755
756         /* After child creation we must look for MPC even when options
757          * are not parsed
758          */
759         mp_opt.suboptions = 0;
760
761         /* hopefully temporary handling for MP_JOIN+syncookie */
762         subflow_req = mptcp_subflow_rsk(req);
763         fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
764         fallback = !tcp_rsk(req)->is_mptcp;
765         if (fallback)
766                 goto create_child;
767
768         /* if the sk is MP_CAPABLE, we try to fetch the client key */
769         if (subflow_req->mp_capable) {
770                 /* we can receive and accept an in-window, out-of-order pkt,
771                  * which may not carry the MP_CAPABLE opt even on mptcp enabled
772                  * paths: always try to extract the peer key, and fallback
773                  * for packets missing it.
774                  * Even OoO DSS packets coming legitly after dropped or
775                  * reordered MPC will cause fallback, but we don't have other
776                  * options.
777                  */
778                 mptcp_get_options(skb, &mp_opt);
779                 if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
780                         fallback = true;
781                         goto create_child;
782                 }
783
784                 new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
785                 if (!new_msk)
786                         fallback = true;
787         } else if (subflow_req->mp_join) {
788                 mptcp_get_options(skb, &mp_opt);
789                 if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) ||
790                     !subflow_hmac_valid(req, &mp_opt) ||
791                     !mptcp_can_accept_new_subflow(subflow_req->msk)) {
792                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
793                         fallback = true;
794                 }
795         }
796
797 create_child:
798         child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
799                                                      req_unhash, own_req);
800
801         if (child && *own_req) {
802                 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
803
804                 tcp_rsk(req)->drop_req = false;
805
806                 /* we need to fallback on ctx allocation failure and on pre-reqs
807                  * checking above. In the latter scenario we additionally need
808                  * to reset the context to non MPTCP status.
809                  */
810                 if (!ctx || fallback) {
811                         if (fallback_is_fatal) {
812                                 subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
813                                 goto dispose_child;
814                         }
815
816                         if (new_msk)
817                                 mptcp_copy_inaddrs(new_msk, child);
818                         subflow_drop_ctx(child);
819                         goto out;
820                 }
821
822                 /* ssk inherits options of listener sk */
823                 ctx->setsockopt_seq = listener->setsockopt_seq;
824
825                 if (ctx->mp_capable) {
826                         /* this can't race with mptcp_close(), as the msk is
827                          * not yet exposed to user-space
828                          */
829                         inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
830
831                         /* record the newly created socket as the first msk
832                          * subflow, but don't link it yet into conn_list
833                          */
834                         WRITE_ONCE(mptcp_sk(new_msk)->first, child);
835
836                         /* new mpc subflow takes ownership of the newly
837                          * created mptcp socket
838                          */
839                         mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
840                         mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
841                         mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
842                         ctx->conn = new_msk;
843                         new_msk = NULL;
844
845                         /* set msk addresses early to ensure mptcp_pm_get_local_id()
846                          * uses the correct data
847                          */
848                         mptcp_copy_inaddrs(ctx->conn, child);
849
850                         /* with OoO packets we can reach here without ingress
851                          * mpc option
852                          */
853                         if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)
854                                 mptcp_subflow_fully_established(ctx, &mp_opt);
855                 } else if (ctx->mp_join) {
856                         struct mptcp_sock *owner;
857
858                         owner = subflow_req->msk;
859                         if (!owner) {
860                                 subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
861                                 goto dispose_child;
862                         }
863
864                         /* move the msk reference ownership to the subflow */
865                         subflow_req->msk = NULL;
866                         ctx->conn = (struct sock *)owner;
867
868                         if (subflow_use_different_sport(owner, sk)) {
869                                 pr_debug("ack inet_sport=%d %d",
870                                          ntohs(inet_sk(sk)->inet_sport),
871                                          ntohs(inet_sk((struct sock *)owner)->inet_sport));
872                                 if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
873                                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
874                                         goto dispose_child;
875                                 }
876                                 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
877                         }
878
879                         if (!mptcp_finish_join(child))
880                                 goto dispose_child;
881
882                         SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
883                         tcp_rsk(req)->drop_req = true;
884                 }
885         }
886
887 out:
888         /* dispose of the left over mptcp master, if any */
889         if (unlikely(new_msk))
890                 mptcp_force_close(new_msk);
891
892         /* check for expected invariant - should never trigger, just help
893          * catching earlier subtle bugs
894          */
895         WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
896                      (!mptcp_subflow_ctx(child) ||
897                       !mptcp_subflow_ctx(child)->conn));
898         return child;
899
900 dispose_child:
901         subflow_drop_ctx(child);
902         tcp_rsk(req)->drop_req = true;
903         inet_csk_prepare_for_destroy_sock(child);
904         tcp_done(child);
905         req->rsk_ops->send_reset(sk, skb);
906
907         /* The last child reference will be released by the caller */
908         return child;
909 }
910
/* af_ops table handed out by subflow_default_af_ops() whenever the IPv6
 * specific table does not apply.
 * NOTE(review): where it gets populated is not visible in this part of the
 * file.
 */
911 static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
/* NOTE(review): presumably the struct proto installed on MPTCP subflows in
 * place of the plain TCP one; its users are outside this part of the file,
 * confirm before relying on this description.
 */
912 static struct proto tcp_prot_override;
913
/* Outcome of the DSS mapping lookup performed by get_mapping_status() on the
 * head of a subflow receive queue; consumed by subflow_check_data_avail().
 */
914 enum mapping_status {
            /* a mapping covers the head skb and, when checksums are in use,
             * the data checksum has been verified
             */
915         MAPPING_OK,
            /* missing, inconsistent or unsupported mapping: the caller takes
             * the fallback/reset path
             */
916         MAPPING_INVALID,
            /* nothing to process right now: empty queue, a 0-length skb was
             * dropped, or not enough data queued to verify the checksum
             */
917         MAPPING_EMPTY,
            /* DATA_FIN carrying no payload, received while no mapping was
             * active
             */
918         MAPPING_DATA_FIN,
            /* the subflow already fell back to plain TCP, no real mapping */
919         MAPPING_DUMMY,
            /* the DSS checksum did not match the received data */
920         MAPPING_BAD_CSUM
921 };
922
923 static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
924 {
925         pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
926                  ssn, subflow->map_subflow_seq, subflow->map_data_len);
927 }
928
929 static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
930 {
931         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
932         unsigned int skb_consumed;
933
934         skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
935         if (WARN_ON_ONCE(skb_consumed >= skb->len))
936                 return true;
937
938         return skb->len - skb_consumed <= subflow->map_data_len -
939                                           mptcp_subflow_get_map_offset(subflow);
940 }
941
942 static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
943 {
944         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
945         u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
946
947         if (unlikely(before(ssn, subflow->map_subflow_seq))) {
948                 /* Mapping covers data later in the subflow stream,
949                  * currently unsupported.
950                  */
951                 dbg_bad_map(subflow, ssn);
952                 return false;
953         }
954         if (unlikely(!before(ssn, subflow->map_subflow_seq +
955                                   subflow->map_data_len))) {
956                 /* Mapping does covers past subflow data, invalid */
957                 dbg_bad_map(subflow, ssn);
958                 return false;
959         }
960         return true;
961 }
962
/* Verify the DSS checksum of the mapping currently active on @ssk.
 *
 * @ssk:       subflow socket whose receive queue holds the mapped data
 * @skb:       skb to start accumulating from (the caller passes the queue head)
 * @csum_reqd: whether checksums are in use; when false nothing is checked
 *
 * The checksum is accumulated in subflow->map_data_csum and the number of
 * bytes already folded in is tracked in subflow->map_csum_len, so repeated
 * calls resume where the previous one stopped.
 *
 * Returns MAPPING_OK when no check is needed or the checksum matches,
 * MAPPING_EMPTY when the queue does not yet hold the whole mapping, and
 * MAPPING_BAD_CSUM (after bumping MPTCP_MIB_DATACSUMERR) on mismatch.
 */
963 static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb,
964                                               bool csum_reqd)
965 {
966         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
967         u32 offset, seq, delta;
968         __sum16 csum;
969         int len;
970
971         if (!csum_reqd)
972                 return MAPPING_OK;
973
974         /* mapping already validated on previous traversal */
975         if (subflow->map_csum_len == subflow->map_data_len)
976                 return MAPPING_OK;
977
978         /* traverse the receive queue, ensuring it contains a full
979          * DSS mapping and accumulating the related csum.
980          * Preserve the accumulated csum across multiple calls, to compute
981          * the csum only once
982          */
            /* delta: bytes of the mapping still missing from the checksum */
983         delta = subflow->map_data_len - subflow->map_csum_len;
984         for (;;) {
                    /* seq: first sequence number not yet checksummed;
                     * offset: where that byte sits inside the current skb
                     */
985                 seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len;
986                 offset = seq - TCP_SKB_CB(skb)->seq;
987
988                 /* if the current skb has not been accounted yet, csum its contents
989                  * up to the amount covered by the current DSS
990                  */
991                 if (offset < skb->len) {
                            /* NOTE: shadows the function-level __sum16 'csum' */
992                         __wsum csum;
993
994                         len = min(skb->len - offset, delta);
995                         csum = skb_checksum(skb, offset, len, 0);
996                         subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum,
997                                                                 subflow->map_csum_len);
998
999                         delta -= len;
1000                         subflow->map_csum_len += len;
1001                 }
1002                 if (delta == 0)
1003                         break;
1004
1005                 if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) {
1006                         /* if this subflow is closed, the partial mapping
1007                          * will be never completed; flush the pending skbs, so
1008                          * that subflow_sched_work_if_closed() can kick in
1009                          */
1010                         if (unlikely(ssk->sk_state == TCP_CLOSE))
1011                                 while ((skb = skb_peek(&ssk->sk_receive_queue)))
1012                                         sk_eat_skb(ssk, skb);
1013
1014                         /* not enough data to validate the csum */
1015                         return MAPPING_EMPTY;
1016                 }
1017
1018                 /* the DSS mapping for next skbs will be validated later,
1019                  * when a get_mapping_status call will process such skb
1020                  */
1021                 skb = skb->next;
1022         }
1023
1024         /* note that 'map_data_len' accounts only for the carried data, does
1025          * not include the eventual seq increment due to the data fin,
1026          * while the pseudo header requires the original DSS data len,
1027          * including that
1028          */
1029         csum = __mptcp_make_csum(subflow->map_seq,
1030                                  subflow->map_subflow_seq,
1031                                  subflow->map_data_len + subflow->map_data_fin,
1032                                  subflow->map_data_csum);
             /* a non-zero result means the checksum did not verify */
1033         if (unlikely(csum)) {
1034                 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
1035                 return MAPPING_BAD_CSUM;
1036         }
1037
1038         subflow->valid_csum_seen = 1;
1039         return MAPPING_OK;
1040 }
1041
/* Work out which DSS mapping applies to the skb at the head of @ssk's
 * receive queue and validate it.
 *
 * @ssk: subflow socket to inspect
 * @msk: MPTCP socket owning the subflow
 *
 * When the head skb carries a usable mapping and none is active, the mapping
 * is recorded in the subflow context (map_seq, map_subflow_seq, map_data_len,
 * ...). The read position is then checked against the mapping and, when
 * checksums are enabled on @msk, the data checksum is verified.
 *
 * Returns one of the mapping_status values:
 *  MAPPING_EMPTY     - empty queue, a 0-length skb was dropped, or the
 *                      checksum cannot be completed yet
 *  MAPPING_DUMMY     - the subflow already fell back
 *  MAPPING_DATA_FIN  - DATA_FIN without payload and no active mapping
 *  MAPPING_INVALID   - missing/mismatching mapping or infinite mapping
 *  MAPPING_BAD_CSUM  - checksum mismatch (from validate_data_csum())
 *  MAPPING_OK        - the head skb is covered by a valid mapping
 */
1042 static enum mapping_status get_mapping_status(struct sock *ssk,
1043                                               struct mptcp_sock *msk)
1044 {
1045         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1046         bool csum_reqd = READ_ONCE(msk->csum_enabled);
1047         struct mptcp_ext *mpext;
1048         struct sk_buff *skb;
1049         u16 data_len;
1050         u64 map_seq;
1051
1052         skb = skb_peek(&ssk->sk_receive_queue);
1053         if (!skb)
1054                 return MAPPING_EMPTY;
1055
1056         if (mptcp_check_fallback(ssk))
1057                 return MAPPING_DUMMY;
1058
1059         mpext = mptcp_get_ext(skb);
             /* the head skb carries no mapping of its own */
1060         if (!mpext || !mpext->use_map) {
1061                 if (!subflow->map_valid && !skb->len) {
1062                         /* the TCP stack deliver 0 len FIN pkt to the receive
1063                          * queue, that is the only 0len pkts ever expected here,
1064                          * and we can admit no mapping only for 0 len pkts
1065                          */
1066                         if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1067                                 WARN_ONCE(1, "0len seq %d:%d flags %x",
1068                                           TCP_SKB_CB(skb)->seq,
1069                                           TCP_SKB_CB(skb)->end_seq,
1070                                           TCP_SKB_CB(skb)->tcp_flags);
1071                         sk_eat_skb(ssk, skb);
1072                         return MAPPING_EMPTY;
1073                 }
1074
1075                 if (!subflow->map_valid)
1076                         return MAPPING_INVALID;
1077
                     /* an earlier mapping is still active: check this skb
                      * against it
                      */
1078                 goto validate_seq;
1079         }
1080
1081         trace_get_mapping_status(mpext);
1082
1083         data_len = mpext->data_len;
1084         if (data_len == 0) {
1085                 pr_debug("infinite mapping received");
1086                 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
1087                 subflow->map_data_len = 0;
1088                 return MAPPING_INVALID;
1089         }
1090
1091         if (mpext->data_fin == 1) {
1092                 if (data_len == 1) {
1093                         bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
1094                                                                  mpext->dsn64);
1095                         pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
1096                         if (subflow->map_valid) {
1097                                 /* A DATA_FIN might arrive in a DSS
1098                                  * option before the previous mapping
1099                                  * has been fully consumed. Continue
1100                                  * handling the existing mapping.
1101                                  */
1102                                 skb_ext_del(skb, SKB_EXT_MPTCP);
1103                                 return MAPPING_OK;
1104                         } else {
                                     /* the reference taken here is held on
                                      * behalf of the work item just queued
                                      */
1105                                 if (updated && schedule_work(&msk->work))
1106                                         sock_hold((struct sock *)msk);
1107
1108                                 return MAPPING_DATA_FIN;
1109                         }
1110                 } else {
1111                         u64 data_fin_seq = mpext->data_seq + data_len - 1;
1112
1113                         /* If mpext->data_seq is a 32-bit value, data_fin_seq
1114                          * must also be limited to 32 bits.
1115                          */
1116                         if (!mpext->dsn64)
1117                                 data_fin_seq &= GENMASK_ULL(31, 0);
1118
1119                         mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
1120                         pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
1121                                  data_fin_seq, mpext->dsn64);
1122                 }
1123
1124                 /* Adjust for DATA_FIN using 1 byte of sequence space */
1125                 data_len--;
1126         }
1127
1128         map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64);
1129         WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
1130
1131         if (subflow->map_valid) {
1132                 /* Allow replacing only with an identical map */
1133                 if (subflow->map_seq == map_seq &&
1134                     subflow->map_subflow_seq == mpext->subflow_seq &&
1135                     subflow->map_data_len == data_len &&
1136                     subflow->map_csum_reqd == mpext->csum_reqd) {
1137                         skb_ext_del(skb, SKB_EXT_MPTCP);
1138                         goto validate_csum;
1139                 }
1140
1141                 /* If this skb data are fully covered by the current mapping,
1142                  * the new map would need caching, which is not supported
1143                  */
1144                 if (skb_is_fully_mapped(ssk, skb)) {
1145                         MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
1146                         return MAPPING_INVALID;
1147                 }
1148
1149                 /* will validate the next map after consuming the current one */
1150                 goto validate_csum;
1151         }
1152
             /* no mapping was active: adopt the one carried by this skb and
              * restart the checksum accumulation from the DSS checksum
              */
1153         subflow->map_seq = map_seq;
1154         subflow->map_subflow_seq = mpext->subflow_seq;
1155         subflow->map_data_len = data_len;
1156         subflow->map_valid = 1;
1157         subflow->map_data_fin = mpext->data_fin;
1158         subflow->mpc_map = mpext->mpc_map;
1159         subflow->map_csum_reqd = mpext->csum_reqd;
1160         subflow->map_csum_len = 0;
1161         subflow->map_data_csum = csum_unfold(mpext->csum);
1162
1163         /* Cfr RFC 8684 Section 3.3.0 */
1164         if (unlikely(subflow->map_csum_reqd != csum_reqd))
1165                 return MAPPING_INVALID;
1166
1167         pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
1168                  subflow->map_seq, subflow->map_subflow_seq,
1169                  subflow->map_data_len, subflow->map_csum_reqd,
1170                  subflow->map_data_csum);
1171
1172 validate_seq:
1173         /* we revalidate valid mapping on new skb, because we must ensure
1174          * the current skb is completely covered by the available mapping
1175          */
1176         if (!validate_mapping(ssk, skb)) {
1177                 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH);
1178                 return MAPPING_INVALID;
1179         }
1180
1181         skb_ext_del(skb, SKB_EXT_MPTCP);
1182
1183 validate_csum:
1184         return validate_data_csum(ssk, skb, csum_reqd);
1185 }
1186
1187 static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
1188                                        u64 limit)
1189 {
1190         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1191         bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
1192         u32 incr;
1193
1194         incr = limit >= skb->len ? skb->len + fin : limit;
1195
1196         pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
1197                  subflow->map_subflow_seq);
1198         MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
1199         tcp_sk(ssk)->copied_seq += incr;
1200         if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
1201                 sk_eat_skb(ssk, skb);
1202         if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
1203                 subflow->map_valid = 0;
1204 }
1205
1206 /* sched mptcp worker to remove the subflow if no more data is pending */
1207 static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
1208 {
1209         struct sock *sk = (struct sock *)msk;
1210
1211         if (likely(ssk->sk_state != TCP_CLOSE))
1212                 return;
1213
1214         if (skb_queue_empty(&ssk->sk_receive_queue) &&
1215             !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) {
1216                 sock_hold(sk);
1217                 if (!schedule_work(&msk->work))
1218                         sock_put(sk);
1219         }
1220 }
1221
1222 static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
1223 {
1224         struct mptcp_sock *msk = mptcp_sk(subflow->conn);
1225
1226         if (subflow->mp_join)
1227                 return false;
1228         else if (READ_ONCE(msk->csum_enabled))
1229                 return !subflow->valid_csum_seen;
1230         else
1231                 return !subflow->fully_established;
1232 }
1233
1234 static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
1235 {
1236         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1237         unsigned long fail_tout;
1238
1239         /* greceful failure can happen only on the MPC subflow */
1240         if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
1241                 return;
1242
1243         /* since the close timeout take precedence on the fail one,
1244          * no need to start the latter when the first is already set
1245          */
1246         if (sock_flag((struct sock *)msk, SOCK_DEAD))
1247                 return;
1248
1249         /* we don't need extreme accuracy here, use a zero fail_tout as special
1250          * value meaning no fail timeout at all;
1251          */
1252         fail_tout = jiffies + TCP_RTO_MAX;
1253         if (!fail_tout)
1254                 fail_tout = 1;
1255         WRITE_ONCE(subflow->fail_tout, fail_tout);
1256         tcp_send_ack(ssk);
1257
1258         mptcp_reset_timeout(msk, subflow->fail_tout);
1259 }
1260
/* Check whether @ssk has data that can be handed to the MPTCP socket.
 *
 * Loops on the head of the subflow receive queue: the DSS mapping is looked
 * up via get_mapping_status(), data the msk has already received is
 * discarded, and subflow->data_avail is set once in-order or future data is
 * found.
 *
 * Mapping errors lead to the 'fallback' path, which - depending on the
 * subflow and msk state - sends an MP_FAIL, resets the subflow, or falls back
 * and installs a dummy mapping covering the head skb.
 *
 * Returns true when data is available (subflow->data_avail set), false
 * otherwise.
 */
1261 static bool subflow_check_data_avail(struct sock *ssk)
1262 {
1263         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1264         enum mapping_status status;
1265         struct mptcp_sock *msk;
1266         struct sk_buff *skb;
1267
             /* NOTE(review): presumably MPTCP_SUBFLOW_NODATA is the zero
              * value, so an empty queue falls through the check below -
              * confirm against the enum definition
              */
1268         if (!skb_peek(&ssk->sk_receive_queue))
1269                 WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
1270         if (subflow->data_avail)
1271                 return true;
1272
1273         msk = mptcp_sk(subflow->conn);
1274         for (;;) {
1275                 u64 ack_seq;
1276                 u64 old_ack;
1277
1278                 status = get_mapping_status(ssk, msk);
1279                 trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
1280                 if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
1281                              status == MAPPING_BAD_CSUM))
1282                         goto fallback;
1283
                     /* MAPPING_EMPTY and MAPPING_DATA_FIN: nothing to deliver */
1284                 if (status != MAPPING_OK)
1285                         goto no_data;
1286
1287                 skb = skb_peek(&ssk->sk_receive_queue);
1288                 if (WARN_ON_ONCE(!skb))
1289                         goto no_data;
1290
1291                 if (unlikely(!READ_ONCE(msk->can_ack)))
1292                         goto fallback;
1293
1294                 old_ack = READ_ONCE(msk->ack_seq);
1295                 ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
1296                 pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
1297                          ack_seq);
                     /* the msk already moved past this data: drop the
                      * duplicate part and look at the queue again
                      */
1298                 if (unlikely(before64(ack_seq, old_ack))) {
1299                         mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
1300                         continue;
1301                 }
1302
1303                 WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
1304                 break;
1305         }
1306         return true;
1307
1308 no_data:
1309         subflow_sched_work_if_closed(msk, ssk);
1310         return false;
1311
1312 fallback:
             /* skip the error handling when the msk already fell back */
1313         if (!__mptcp_check_fallback(msk)) {
1314                 /* RFC 8684 section 3.7. */
1315                 if (status == MAPPING_BAD_CSUM &&
1316                     (subflow->mp_join || subflow->valid_csum_seen)) {
1317                         subflow->send_mp_fail = 1;
1318
1319                         if (!READ_ONCE(msk->allow_infinite_fallback)) {
1320                                 subflow->reset_transient = 0;
1321                                 subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
1322                                 goto reset;
1323                         }
1324                         mptcp_subflow_fail(msk, ssk);
1325                         WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
1326                         return true;
1327                 }
1328
1329                 if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
1330                         /* fatal protocol error, close the socket.
1331                          * subflow_error_report() will introduce the appropriate barriers
1332                          */
1333                         subflow->reset_transient = 0;
1334                         subflow->reset_reason = MPTCP_RST_EMPTCP;
1335
                             /* also reached via goto from the bad-csum branch
                              * above, with its own reset_reason already set
                              */
1336 reset:
1337                         ssk->sk_err = EBADMSG;
1338                         tcp_set_state(ssk, TCP_CLOSE);
1339                         while ((skb = skb_peek(&ssk->sk_receive_queue)))
1340                                 sk_eat_skb(ssk, skb);
1341                         tcp_send_active_reset(ssk, GFP_ATOMIC);
1342                         WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
1343                         return false;
1344                 }
1345
1346                 mptcp_do_fallback(ssk);
1347         }
1348
             /* fallback: install a dummy mapping that covers exactly the head
              * skb, starting at the msk-level ack_seq
              */
1349         skb = skb_peek(&ssk->sk_receive_queue);
1350         subflow->map_valid = 1;
1351         subflow->map_seq = READ_ONCE(msk->ack_seq);
1352         subflow->map_data_len = skb->len;
1353         subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
1354         WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
1355         return true;
1356 }
1357
1358 bool mptcp_subflow_data_available(struct sock *sk)
1359 {
1360         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1361
1362         /* check if current mapping is still valid */
1363         if (subflow->map_valid &&
1364             mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
1365                 subflow->map_valid = 0;
1366                 WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
1367
1368                 pr_debug("Done with mapping: seq=%u data_len=%u",
1369                          subflow->map_subflow_seq,
1370                          subflow->map_data_len);
1371         }
1372
1373         return subflow_check_data_avail(sk);
1374 }
1375
1376 /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
1377  * not the ssk one.
1378  *
1379  * In mptcp, rwin is about the mptcp-level connection data.
1380  *
1381  * Data that is still on the ssk rx queue can thus be ignored,
1382  * as far as mptcp peer is concerned that data is still inflight.
1383  * DSS ACK is updated when skb is moved to the mptcp rx queue.
1384  */
1385 void mptcp_space(const struct sock *ssk, int *space, int *full_space)
1386 {
1387         const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1388         const struct sock *sk = subflow->conn;
1389
1390         *space = __mptcp_space(sk);
1391         *full_space = tcp_full_space(sk);
1392 }
1393
1394 void __mptcp_error_report(struct sock *sk)
1395 {
1396         struct mptcp_subflow_context *subflow;
1397         struct mptcp_sock *msk = mptcp_sk(sk);
1398
1399         mptcp_for_each_subflow(msk, subflow) {
1400                 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1401                 int err = sock_error(ssk);
1402
1403                 if (!err)
1404                         continue;
1405
1406                 /* only propagate errors on fallen-back sockets or
1407                  * on MPC connect
1408                  */
1409                 if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
1410                         continue;
1411
1412                 inet_sk_state_store(sk, inet_sk_state_load(ssk));
1413                 sk->sk_err = -err;
1414
1415                 /* This barrier is coupled with smp_rmb() in mptcp_poll() */
1416                 smp_wmb();
1417                 sk_error_report(sk);
1418                 break;
1419         }
1420 }
1421
1422 static void subflow_error_report(struct sock *ssk)
1423 {
1424         struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
1425
1426         mptcp_data_lock(sk);
1427         if (!sock_owned_by_user(sk))
1428                 __mptcp_error_report(sk);
1429         else
1430                 __set_bit(MPTCP_ERROR_REPORT,  &mptcp_sk(sk)->cb_flags);
1431         mptcp_data_unlock(sk);
1432 }
1433
1434 static void subflow_data_ready(struct sock *sk)
1435 {
1436         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1437         u16 state = 1 << inet_sk_state_load(sk);
1438         struct sock *parent = subflow->conn;
1439         struct mptcp_sock *msk;
1440
1441         msk = mptcp_sk(parent);
1442         if (state & TCPF_LISTEN) {
1443                 /* MPJ subflow are removed from accept queue before reaching here,
1444                  * avoid stray wakeups
1445                  */
1446                 if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
1447                         return;
1448
1449                 parent->sk_data_ready(parent);
1450                 return;
1451         }
1452
1453         WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
1454                      !subflow->mp_join && !(state & TCPF_CLOSE));
1455
1456         if (mptcp_subflow_data_available(sk))
1457                 mptcp_data_ready(parent, sk);
1458         else if (unlikely(sk->sk_err))
1459                 subflow_error_report(sk);
1460 }
1461
1462 static void subflow_write_space(struct sock *ssk)
1463 {
1464         struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
1465
1466         mptcp_propagate_sndbuf(sk, ssk);
1467         mptcp_write_space(sk);
1468 }
1469
1470 static const struct inet_connection_sock_af_ops *
1471 subflow_default_af_ops(struct sock *sk)
1472 {
1473 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1474         if (sk->sk_family == AF_INET6)
1475                 return &subflow_v6_specific;
1476 #endif
1477         return &subflow_specific;
1478 }
1479
1480 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1481 void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
1482 {
1483         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1484         struct inet_connection_sock *icsk = inet_csk(sk);
1485         const struct inet_connection_sock_af_ops *target;
1486
1487         target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
1488
1489         pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
1490                  subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
1491
1492         if (likely(icsk->icsk_af_ops == target))
1493                 return;
1494
1495         subflow->icsk_af_ops = icsk->icsk_af_ops;
1496         icsk->icsk_af_ops = target;
1497 }
1498 #endif
1499
1500 void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
1501                          struct sockaddr_storage *addr,
1502                          unsigned short family)
1503 {
1504         memset(addr, 0, sizeof(*addr));
1505         addr->ss_family = family;
1506         if (addr->ss_family == AF_INET) {
1507                 struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
1508
1509                 if (info->family == AF_INET)
1510                         in_addr->sin_addr = info->addr;
1511 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1512                 else if (ipv6_addr_v4mapped(&info->addr6))
1513                         in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
1514 #endif
1515                 in_addr->sin_port = info->port;
1516         }
1517 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1518         else if (addr->ss_family == AF_INET6) {
1519                 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
1520
1521                 if (info->family == AF_INET)
1522                         ipv6_addr_set_v4mapped(info->addr.s_addr,
1523                                                &in6_addr->sin6_addr);
1524                 else
1525                         in6_addr->sin6_addr = info->addr6;
1526                 in6_addr->sin6_port = info->port;
1527         }
1528 #endif
1529 }
1530
1531 int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
1532                             const struct mptcp_addr_info *remote)
1533 {
1534         struct mptcp_sock *msk = mptcp_sk(sk);
1535         struct mptcp_subflow_context *subflow;
1536         struct sockaddr_storage addr;
1537         int remote_id = remote->id;
1538         int local_id = loc->id;
1539         int err = -ENOTCONN;
1540         struct socket *sf;
1541         struct sock *ssk;
1542         u32 remote_token;
1543         int addrlen;
1544         int ifindex;
1545         u8 flags;
1546
1547         if (!mptcp_is_fully_established(sk))
1548                 goto err_out;
1549
1550         err = mptcp_subflow_create_socket(sk, loc->family, &sf);
1551         if (err)
1552                 goto err_out;
1553
1554         ssk = sf->sk;
1555         subflow = mptcp_subflow_ctx(ssk);
1556         do {
1557                 get_random_bytes(&subflow->local_nonce, sizeof(u32));
1558         } while (!subflow->local_nonce);
1559
1560         if (local_id)
1561                 subflow_set_local_id(subflow, local_id);
1562
1563         mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
1564                                              &flags, &ifindex);
1565         subflow->remote_key_valid = 1;
1566         subflow->remote_key = msk->remote_key;
1567         subflow->local_key = msk->local_key;
1568         subflow->token = msk->token;
1569         mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
1570
1571         addrlen = sizeof(struct sockaddr_in);
1572 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1573         if (addr.ss_family == AF_INET6)
1574                 addrlen = sizeof(struct sockaddr_in6);
1575 #endif
1576         mptcp_sockopt_sync(msk, ssk);
1577
1578         ssk->sk_bound_dev_if = ifindex;
1579         err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
1580         if (err)
1581                 goto failed;
1582
1583         mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
1584         pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
1585                  remote_token, local_id, remote_id);
1586         subflow->remote_token = remote_token;
1587         subflow->remote_id = remote_id;
1588         subflow->request_join = 1;
1589         subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
1590         mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
1591
1592         sock_hold(ssk);
1593         list_add_tail(&subflow->node, &msk->conn_list);
1594         err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
1595         if (err && err != -EINPROGRESS)
1596                 goto failed_unlink;
1597
1598         /* discard the subflow socket */
1599         mptcp_sock_graft(ssk, sk->sk_socket);
1600         iput(SOCK_INODE(sf));
1601         WRITE_ONCE(msk->allow_infinite_fallback, false);
1602         return 0;
1603
1604 failed_unlink:
1605         list_del(&subflow->node);
1606         sock_put(mptcp_subflow_tcp_sock(subflow));
1607
1608 failed:
1609         subflow->disposable = 1;
1610         sock_release(sf);
1611
1612 err_out:
1613         /* we account subflows before the creation, and this failures will not
1614          * be caught by sk_state_change()
1615          */
1616         mptcp_pm_close_subflow(msk);
1617         return err;
1618 }
1619
/* Make @child use the same cgroup data as @parent (and, with
 * CONFIG_MEMCG, the same memory cgroup). Nothing is touched when both
 * sockets already report the same cgroup id, and the whole body compiles
 * away without CONFIG_SOCK_CGROUP_DATA.
 */
static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
        struct sock_cgroup_data *from = &parent->sk_cgrp_data;
        struct sock_cgroup_data *to = &child->sk_cgrp_data;
#ifdef CONFIG_MEMCG
        struct mem_cgroup *memcg;
#endif

        /* only the additional subflows created by kworkers have to be modified */
        if (cgroup_id(sock_cgroup_ptr(from)) == cgroup_id(sock_cgroup_ptr(to)))
                return;

#ifdef CONFIG_MEMCG
        memcg = parent->sk_memcg;

        /* release the child's current memcg before taking over the parent's */
        mem_cgroup_sk_free(child);
        if (memcg && css_tryget(&memcg->css))
                child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

        /* release the old cgroup data, copy the parent's, then clone it */
        cgroup_sk_free(to);
        *to = *from;
        cgroup_sk_clone(to);
#endif /* CONFIG_SOCK_CGROUP_DATA */
}
1643
1644 static void mptcp_subflow_ops_override(struct sock *ssk)
1645 {
1646 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1647         if (ssk->sk_prot == &tcpv6_prot)
1648                 ssk->sk_prot = &tcpv6_prot_override;
1649         else
1650 #endif
1651                 ssk->sk_prot = &tcp_prot_override;
1652 }
1653
1654 static void mptcp_subflow_ops_undo_override(struct sock *ssk)
1655 {
1656 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1657         if (ssk->sk_prot == &tcpv6_prot_override)
1658                 ssk->sk_prot = &tcpv6_prot;
1659         else
1660 #endif
1661                 ssk->sk_prot = &tcp_prot;
1662 }
1663
1664 int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
1665                                 struct socket **new_sock)
1666 {
1667         struct mptcp_subflow_context *subflow;
1668         struct net *net = sock_net(sk);
1669         struct socket *sf;
1670         int err;
1671
1672         /* un-accepted server sockets can reach here - on bad configuration
1673          * bail early to avoid greater trouble later
1674          */
1675         if (unlikely(!sk->sk_socket))
1676                 return -EINVAL;
1677
1678         err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf);
1679         if (err)
1680                 return err;
1681
1682         lock_sock(sf->sk);
1683
1684         /* the newly created socket has to be in the same cgroup as its parent */
1685         mptcp_attach_cgroup(sk, sf->sk);
1686
1687         /* kernel sockets do not by default acquire net ref, but TCP timer
1688          * needs it.
1689          * Update ns_tracker to current stack trace and refcounted tracker.
1690          */
1691         __netns_tracker_free(net, &sf->sk->ns_tracker, false);
1692         sf->sk->sk_net_refcnt = 1;
1693         get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
1694         sock_inuse_add(net, 1);
1695         err = tcp_set_ulp(sf->sk, "mptcp");
1696         release_sock(sf->sk);
1697
1698         if (err) {
1699                 sock_release(sf);
1700                 return err;
1701         }
1702
1703         /* the newly created socket really belongs to the owning MPTCP master
1704          * socket, even if for additional subflows the allocation is performed
1705          * by a kernel workqueue. Adjust inode references, so that the
1706          * procfs/diag interfaces really show this one belonging to the correct
1707          * user.
1708          */
1709         SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
1710         SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
1711         SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
1712
1713         subflow = mptcp_subflow_ctx(sf->sk);
1714         pr_debug("subflow=%p", subflow);
1715
1716         *new_sock = sf;
1717         sock_hold(sk);
1718         subflow->conn = sk;
1719         mptcp_subflow_ops_override(sf->sk);
1720
1721         return 0;
1722 }
1723
1724 static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
1725                                                         gfp_t priority)
1726 {
1727         struct inet_connection_sock *icsk = inet_csk(sk);
1728         struct mptcp_subflow_context *ctx;
1729
1730         ctx = kzalloc(sizeof(*ctx), priority);
1731         if (!ctx)
1732                 return NULL;
1733
1734         rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
1735         INIT_LIST_HEAD(&ctx->node);
1736         INIT_LIST_HEAD(&ctx->delegated_node);
1737
1738         pr_debug("subflow=%p", ctx);
1739
1740         ctx->tcp_sock = sk;
1741
1742         return ctx;
1743 }
1744
1745 static void __subflow_state_change(struct sock *sk)
1746 {
1747         struct socket_wq *wq;
1748
1749         rcu_read_lock();
1750         wq = rcu_dereference(sk->sk_wq);
1751         if (skwq_has_sleeper(wq))
1752                 wake_up_interruptible_all(&wq->wait);
1753         rcu_read_unlock();
1754 }
1755
1756 static bool subflow_is_done(const struct sock *sk)
1757 {
1758         return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
1759 }
1760
1761 static void subflow_state_change(struct sock *sk)
1762 {
1763         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
1764         struct sock *parent = subflow->conn;
1765
1766         __subflow_state_change(sk);
1767
1768         if (subflow_simultaneous_connect(sk)) {
1769                 mptcp_propagate_sndbuf(parent, sk);
1770                 mptcp_do_fallback(sk);
1771                 mptcp_rcv_space_init(mptcp_sk(parent), sk);
1772                 pr_fallback(mptcp_sk(parent));
1773                 subflow->conn_finished = 1;
1774                 mptcp_set_connected(parent);
1775         }
1776
1777         /* as recvmsg() does not acquire the subflow socket for ssk selection
1778          * a fin packet carrying a DSS can be unnoticed if we don't trigger
1779          * the data available machinery here.
1780          */
1781         if (mptcp_subflow_data_available(sk))
1782                 mptcp_data_ready(parent, sk);
1783         else if (unlikely(sk->sk_err))
1784                 subflow_error_report(sk);
1785
1786         subflow_sched_work_if_closed(mptcp_sk(parent), sk);
1787
1788         if (__mptcp_check_fallback(mptcp_sk(parent)) &&
1789             !subflow->rx_eof && subflow_is_done(sk)) {
1790                 subflow->rx_eof = 1;
1791                 mptcp_subflow_eof(parent);
1792         }
1793 }
1794
1795 void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
1796 {
1797         struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
1798         struct mptcp_sock *msk, *next, *head = NULL;
1799         struct request_sock *req;
1800
1801         /* build a list of all unaccepted mptcp sockets */
1802         spin_lock_bh(&queue->rskq_lock);
1803         for (req = queue->rskq_accept_head; req; req = req->dl_next) {
1804                 struct mptcp_subflow_context *subflow;
1805                 struct sock *ssk = req->sk;
1806                 struct mptcp_sock *msk;
1807
1808                 if (!sk_is_mptcp(ssk))
1809                         continue;
1810
1811                 subflow = mptcp_subflow_ctx(ssk);
1812                 if (!subflow || !subflow->conn)
1813                         continue;
1814
1815                 /* skip if already in list */
1816                 msk = mptcp_sk(subflow->conn);
1817                 if (msk->dl_next || msk == head)
1818                         continue;
1819
1820                 msk->dl_next = head;
1821                 head = msk;
1822         }
1823         spin_unlock_bh(&queue->rskq_lock);
1824         if (!head)
1825                 return;
1826
1827         /* can't acquire the msk socket lock under the subflow one,
1828          * or will cause ABBA deadlock
1829          */
1830         release_sock(listener_ssk);
1831
1832         for (msk = head; msk; msk = next) {
1833                 struct sock *sk = (struct sock *)msk;
1834                 bool do_cancel_work;
1835
1836                 sock_hold(sk);
1837                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
1838                 next = msk->dl_next;
1839                 msk->first = NULL;
1840                 msk->dl_next = NULL;
1841
1842                 do_cancel_work = __mptcp_close(sk, 0);
1843                 release_sock(sk);
1844                 if (do_cancel_work) {
1845                         /* lockdep will report a false positive ABBA deadlock
1846                          * between cancel_work_sync and the listener socket.
1847                          * The involved locks belong to different sockets WRT
1848                          * the existing AB chain.
1849                          * Using a per socket key is problematic as key
1850                          * deregistration requires process context and must be
1851                          * performed at socket disposal time, in atomic
1852                          * context.
1853                          * Just tell lockdep to consider the listener socket
1854                          * released here.
1855                          */
1856                         mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
1857                         mptcp_cancel_work(sk);
1858                         mutex_acquire(&listener_sk->sk_lock.dep_map,
1859                                       SINGLE_DEPTH_NESTING, 0, _RET_IP_);
1860                 }
1861                 sock_put(sk);
1862         }
1863
1864         /* we are still under the listener msk socket lock */
1865         lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
1866 }
1867
1868 static int subflow_ulp_init(struct sock *sk)
1869 {
1870         struct inet_connection_sock *icsk = inet_csk(sk);
1871         struct mptcp_subflow_context *ctx;
1872         struct tcp_sock *tp = tcp_sk(sk);
1873         int err = 0;
1874
1875         /* disallow attaching ULP to a socket unless it has been
1876          * created with sock_create_kern()
1877          */
1878         if (!sk->sk_kern_sock) {
1879                 err = -EOPNOTSUPP;
1880                 goto out;
1881         }
1882
1883         ctx = subflow_create_ctx(sk, GFP_KERNEL);
1884         if (!ctx) {
1885                 err = -ENOMEM;
1886                 goto out;
1887         }
1888
1889         pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
1890
1891         tp->is_mptcp = 1;
1892         ctx->icsk_af_ops = icsk->icsk_af_ops;
1893         icsk->icsk_af_ops = subflow_default_af_ops(sk);
1894         ctx->tcp_state_change = sk->sk_state_change;
1895         ctx->tcp_error_report = sk->sk_error_report;
1896
1897         WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
1898         WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);
1899
1900         sk->sk_data_ready = subflow_data_ready;
1901         sk->sk_write_space = subflow_write_space;
1902         sk->sk_state_change = subflow_state_change;
1903         sk->sk_error_report = subflow_error_report;
1904 out:
1905         return err;
1906 }
1907
1908 static void subflow_ulp_release(struct sock *ssk)
1909 {
1910         struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
1911         bool release = true;
1912         struct sock *sk;
1913
1914         if (!ctx)
1915                 return;
1916
1917         sk = ctx->conn;
1918         if (sk) {
1919                 /* if the msk has been orphaned, keep the ctx
1920                  * alive, will be freed by __mptcp_close_ssk(),
1921                  * when the subflow is still unaccepted
1922                  */
1923                 release = ctx->disposable || list_empty(&ctx->node);
1924                 sock_put(sk);
1925         }
1926
1927         mptcp_subflow_ops_undo_override(ssk);
1928         if (release)
1929                 kfree_rcu(ctx, rcu);
1930 }
1931
1932 static void subflow_ulp_clone(const struct request_sock *req,
1933                               struct sock *newsk,
1934                               const gfp_t priority)
1935 {
1936         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
1937         struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
1938         struct mptcp_subflow_context *new_ctx;
1939
1940         if (!tcp_rsk(req)->is_mptcp ||
1941             (!subflow_req->mp_capable && !subflow_req->mp_join)) {
1942                 subflow_ulp_fallback(newsk, old_ctx);
1943                 return;
1944         }
1945
1946         new_ctx = subflow_create_ctx(newsk, priority);
1947         if (!new_ctx) {
1948                 subflow_ulp_fallback(newsk, old_ctx);
1949                 return;
1950         }
1951
1952         new_ctx->conn_finished = 1;
1953         new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
1954         new_ctx->tcp_state_change = old_ctx->tcp_state_change;
1955         new_ctx->tcp_error_report = old_ctx->tcp_error_report;
1956         new_ctx->rel_write_seq = 1;
1957         new_ctx->tcp_sock = newsk;
1958
1959         if (subflow_req->mp_capable) {
1960                 /* see comments in subflow_syn_recv_sock(), MPTCP connection
1961                  * is fully established only after we receive the remote key
1962                  */
1963                 new_ctx->mp_capable = 1;
1964                 new_ctx->local_key = subflow_req->local_key;
1965                 new_ctx->token = subflow_req->token;
1966                 new_ctx->ssn_offset = subflow_req->ssn_offset;
1967                 new_ctx->idsn = subflow_req->idsn;
1968
1969                 /* this is the first subflow, id is always 0 */
1970                 new_ctx->local_id_valid = 1;
1971         } else if (subflow_req->mp_join) {
1972                 new_ctx->ssn_offset = subflow_req->ssn_offset;
1973                 new_ctx->mp_join = 1;
1974                 new_ctx->fully_established = 1;
1975                 new_ctx->remote_key_valid = 1;
1976                 new_ctx->backup = subflow_req->backup;
1977                 new_ctx->remote_id = subflow_req->remote_id;
1978                 new_ctx->token = subflow_req->token;
1979                 new_ctx->thmac = subflow_req->thmac;
1980
1981                 /* the subflow req id is valid, fetched via subflow_check_req()
1982                  * and subflow_token_join_request()
1983                  */
1984                 subflow_set_local_id(new_ctx, subflow_req->local_id);
1985         }
1986 }
1987
/* release_cb used by the override protos: process any delegated action
 * pending on the subflow, then run the regular tcp_release_cb().
 */
static void tcp_release_cb_override(struct sock *ssk)
{
        if (mptcp_subflow_has_delegated_action(mptcp_subflow_ctx(ssk)))
                mptcp_subflow_process_delegated(ssk);

        tcp_release_cb(ssk);
}
1997
1998 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
1999         .name           = "mptcp",
2000         .owner          = THIS_MODULE,
2001         .init           = subflow_ulp_init,
2002         .release        = subflow_ulp_release,
2003         .clone          = subflow_ulp_clone,
2004 };
2005
2006 static int subflow_ops_init(struct request_sock_ops *subflow_ops)
2007 {
2008         subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
2009
2010         subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
2011                                               subflow_ops->obj_size, 0,
2012                                               SLAB_ACCOUNT |
2013                                               SLAB_TYPESAFE_BY_RCU,
2014                                               NULL);
2015         if (!subflow_ops->slab)
2016                 return -ENOMEM;
2017
2018         return 0;
2019 }
2020
2021 void __init mptcp_subflow_init(void)
2022 {
2023         mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops;
2024         mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4";
2025         mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor;
2026
2027         if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0)
2028                 panic("MPTCP: failed to init subflow v4 request sock ops\n");
2029
2030         subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
2031         subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
2032         subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack;
2033
2034         subflow_specific = ipv4_specific;
2035         subflow_specific.conn_request = subflow_v4_conn_request;
2036         subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
2037         subflow_specific.sk_rx_dst_set = subflow_finish_connect;
2038         subflow_specific.rebuild_header = subflow_rebuild_header;
2039
2040         tcp_prot_override = tcp_prot;
2041         tcp_prot_override.release_cb = tcp_release_cb_override;
2042
2043 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
2044         /* In struct mptcp_subflow_request_sock, we assume the TCP request sock
2045          * structures for v4 and v6 have the same size. It should not changed in
2046          * the future but better to make sure to be warned if it is no longer
2047          * the case.
2048          */
2049         BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock));
2050
2051         mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops;
2052         mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6";
2053         mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor;
2054
2055         if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0)
2056                 panic("MPTCP: failed to init subflow v6 request sock ops\n");
2057
2058         subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
2059         subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
2060         subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack;
2061
2062         subflow_v6_specific = ipv6_specific;
2063         subflow_v6_specific.conn_request = subflow_v6_conn_request;
2064         subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
2065         subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
2066         subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header;
2067
2068         subflow_v6m_specific = subflow_v6_specific;
2069         subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
2070         subflow_v6m_specific.send_check = ipv4_specific.send_check;
2071         subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
2072         subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
2073         subflow_v6m_specific.net_frag_header_len = 0;
2074         subflow_v6m_specific.rebuild_header = subflow_rebuild_header;
2075
2076         tcpv6_prot_override = tcpv6_prot;
2077         tcpv6_prot_override.release_cb = tcp_release_cb_override;
2078 #endif
2079
2080         mptcp_diag_subflow_init(&subflow_ulp_ops);
2081
2082         if (tcp_register_ulp(&subflow_ulp_ops) != 0)
2083                 panic("MPTCP: failed to register subflows to ULP\n");
2084 }
This page took 0.147615 seconds and 4 git commands to generate.