// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}

static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
						     const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
	return msk;
}
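
/* parse the MPTCP options carried by an incoming SYN and initialize the
 * MP_CAPABLE / MP_JOIN state on the request socket
 */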
static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	mptcp_get_options(skb, &mp_opt);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return;
#endif

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err;

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req, skb);
		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}
}

static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}
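
/* active subflow: process the SYN-ACK, completing either the MP_CAPABLE or
 * the MP_JOIN handshake, or fall back to plain TCP
 */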
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp && mp_opt.mp_capable) {
		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
	} else if (subflow->request_join && mp_opt.mp_join) {
		subflow->mp_join = 1;
		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);
	} else {
		if (subflow->request_mptcp)
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
		mptcp_do_fallback(sk);
		pr_fallback(mptcp_sk(subflow->conn));
	}

	if (mptcp_check_fallback(sk)) {
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		return;
	}

	if (subflow->mp_capable) {
		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->mp_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
			 subflow, subflow->thmac,
			 subflow->remote_nonce);
		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			subflow->mp_join = 0;
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);

		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else {
do_reset:
		tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
	}
}

static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk.  The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will not be in
	 * SYN_RECV state and doesn't have SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_SYN_RECV) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_token_destroy(mptcp_sk(sk));
	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}
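
/* passive side: create the child subflow socket, cloning a new MPTCP socket
 * for MP_CAPABLE requests and validating the peer HMAC for MP_JOIN ones; on
 * failure fall back to plain TCP or reset the child
 */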
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join ||
		    !subflow_hmac_valid(req, &mp_opt)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			ctx->remote_key = mp_opt.sndr_key;
			ctx->fully_established = mp_opt.mp_capable;
			ctx->can_ack = mp_opt.mp_capable;
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	tcp_send_active_reset(child, GFP_ATOMIC);
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);

	/* The last child reference will be released by the caller */
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};
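
/* expand a 32 bit DSS sequence number to 64 bits, using the most recent
 * mapping as a reference
 */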
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}
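
/* extract and validate the DSS mapping carried by the skb at the head of
 * the subflow receive queue
 */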
static enum mapping_status get_mapping_status(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0 len FIN pkts to the receive
			 * queue, those are the only 0len pkts ever expected here,
			 * and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			pr_debug("DATA_FIN with no payload");
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				return MAPPING_DATA_FIN;
			}
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		subflow->use_64bit_ack = 0;
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
		subflow->use_64bit_ack = 1;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}
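
/* walk the receive queue, validating the DSS mappings and discarding data
 * belonging to spurious retransmissions, until a mapping in sequence with
 * the msk level ack_seq is found
 */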
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission; we can hit "future" values on active backup
		 * subflow switch, we rely on retransmissions to get
		 * in-sequence data.
		 * Concurrent subflows support will require subflow data
		 * buffering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		if (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0) {
				ssk->sk_err = -ret;
				goto fatal;
			}
			if (ret < delta)
				return false;
			if (delta == map_remaining)
				subflow->map_valid = 0;
		}
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
		       before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = tcp_space(sk);
	*full_space = tcp_full_space(sk);
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}
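
/* create an additional subflow socket towards the given remote address and
 * start the MP_JOIN handshake on it
 */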
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token,
		 local_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}
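
/* create a kernel TCP socket owned by the given MPTCP socket and attach the
 * "mptcp" ULP to it, so it can be used as a subflow
 */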
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * owner
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}
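
/* ULP init: attach a subflow context to a kernel TCP socket and hook its
 * data_ready/write_space/state_change callbacks
 */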
static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}
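
/* boot time init: register the subflow request sock ops, set up the MPTCP
 * flavoured af_ops and register the "mptcp" ULP
 */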
void __init mptcp_subflow_init(void)
{
	subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}