/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>

#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}
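
/*
 * Note on the fragmentation decision above: a packet is handed to
 * ip6_fragment() when it exceeds the path MTU and is not GSO, when the
 * route requires fragmenting everything (dst_allfrag), or when conntrack
 * defrag recorded a smaller frag_max_size on input. For example, a
 * 3000-byte non-GSO skb on a 1500-byte-MTU route takes the ip6_fragment()
 * path; a GSO skb of the same length does not, since the stack segments
 * it further down the output path.
 */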
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8 proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(skb_headroom(skb) < head_room)) {
                struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                if (!skb2) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                        kfree_skb(skb);
                        return -ENOBUFS;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                consume_skb(skb);
                skb = skb2;
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         * Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                 IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
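
/*
 * Usage sketch (illustrative only, not part of this file): a transport
 * protocol that already holds a routed skb with its own header in place
 * typically calls ip6_xmit() roughly like this; my_build_flow6() is a
 * hypothetical helper standing in for the caller's flow setup:
 *
 *	struct flowi6 fl6;
 *
 *	my_build_flow6(sk, &fl6);		// hypothetical
 *	skb_dst_set(skb, dst);			// route chosen by the caller
 *	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark,
 *		       rcu_dereference(np->opt), np->tclass);
 *
 * TCP's SYNACK path does essentially this with a const socket, which is
 * why the function takes "const struct sock *sk" and promotes it before
 * the netfilter hook.
 */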
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For reaction involving unicast neighbor discovery
                         * message destined to the proxied address, pass it to
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
        if (skb->offload_l3_fwd_mark) {
                consume_skb(skb);
                return 0;
        }
#endif

        skb->tstamp = 0;
        return dst_output(net, sk, skb);
}
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}
int ip6_forward(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *	We DO NOT make any processing on
         *	RA packets, pushing them to user level AS IS
         *	without any WARRANTY that applications will be able
         *	to interpret them. The reason is that we
         *	cannot make anything clever here.
         *
         *	We are not end-node, so that if packet contains
         *	AH/ESP, we cannot make anything.
         *	Defragmentation also would be a mistake; RA packets
         *	cannot be fragmented, because there is no guarantee
         *	that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *	check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
         * send redirects to source routed frames.
         * We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *	incoming and outgoing devices are the same
                 *	send a redirect.
                 */
                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */
        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}
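
/*
 * Forwarding summary: a packet survives ip6_forward() only if forwarding
 * is enabled, it is host-addressed and not locally owned, it passes the
 * XFRM forward policy, its hop limit is at least 2 (the limit is only
 * decremented after skb_cow(), so a rejected packet is echoed back in
 * ICMP unmodified), and it fits the outgoing path MTU (never taken below
 * IPV6_MIN_MTU, 1280). For example, a 1400-byte packet arriving with
 * hop_limit 1 is answered with ICMPV6_TIME_EXCEED before any MTU check.
 */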
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb_mark_not_on_list(skb);
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *	Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *	Keep copying data until we run out.
         */
        while (left > 0) {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left)
                        len &= ~7;

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *	Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *	Charge the memory for the fragment to any owner
                 *	it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *	Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *	Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *	Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *	Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
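
/*
 * Worked example of the arithmetic above: with a 1500-byte path MTU and a
 * bare IPv6 header (hlen = 40), mtu becomes 1500 - 40 - 8 = 1452 bytes of
 * payload per fragment. A non-final fragment is then trimmed to a multiple
 * of 8 (len &= ~7 -> 1448), so a 4000-byte payload leaves the slow path as
 * 1448 + 1448 + 1104 bytes with byte offsets 0, 1448 and 2896, and IP6_MF
 * set on all but the last fragment. Note the byte offset can be stored in
 * fh->frag_off directly: the low three bits of the field are flags, and an
 * 8-aligned byte offset is exactly the 8-octet-unit offset shifted left
 * by three.
 */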
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
               (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}
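
/*
 * In other words, ip6_rt_check() returns "possibly stale" (nonzero) only
 * when the route is neither a /128 host route matching the flow address
 * nor validated by the cached last-used address: a cached /128 route to
 * the flow's daddr always passes.
 */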
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the unconnected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE		--ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which does not have this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct fib6_info *from;
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
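
/*
 * Usage sketch (illustrative): because errors come back pointer-encoded,
 * callers typically hand the result straight to IS_ERR()/PTR_ERR():
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 *	skb_dst_set(skb, dst);
 *
 * final_p may be NULL when no IPsec rewriting of the destination address
 * is wanted.
 */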
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not first, the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}
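
/*
 * Example of the maxfraglen computation: with *mtu = 1500 and
 * fragheaderlen = 40 (a bare IPv6 header), ((1500 - 40) & ~7) rounds the
 * per-fragment payload down to a multiple of 8 (1456), and adding back
 * fragheaderlen minus the 8-byte fragment header gives maxfraglen =
 * 1456 + 40 - 8 = 1488 bytes of skb data per non-final fragment. The same
 * formula appears again in __ip6_append_data() below.
 */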
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < IPV6_MIN_MTU)
                return -EINVAL;
        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}
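
/*
 * The cork set up here follows the usual four-step corked-send pattern:
 * ip6_setup_cork() pins the route, options and per-send parameters,
 * __ip6_append_data() grows the queued skb(s), __ip6_make_skb() collapses
 * the queue into one packet with the IPv6 header, and ip6_cork_release()
 * drops the references. ip6_append_data()/ip6_push_pending_frames() and
 * ip6_make_skb() below are the two drivers of that sequence.
 */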
static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
        struct ubuf_info *uarg = NULL;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;
        unsigned int wmem_alloc_delta = 0;
        bool paged, extra_uref = false;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        paged = !!cork->gso_size;
        mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
        orig_mtu = mtu;

        if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
            sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                tskey = sk->sk_tskey++;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;

        /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
         * the first fragment
         */
        if (headersize + transhdrlen > mtu)
                goto emsgsize;

        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                  sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length <= mtu - headersize &&
            (!(flags & MSG_MORE) || cork->gso_size) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;

        if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
                uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
                if (!uarg)
                        return -ENOBUFS;
                extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
                if (rt->dst.dev->features & NETIF_F_SG &&
                    csummode == CHECKSUM_PARTIAL) {
                        paged = true;
                } else {
                        uarg->zerocopy = 0;
                        skb_zcopy_set(skb, uarg, &extra_uref);
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        cork->length += length;
        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        unsigned int pagedlen;
alloc_new_skb:
                        /* There's no room in the current skb */
                        if (skb)
                                fraggap = skb->len - maxfraglen;
                        else
                                fraggap = 0;
                        /* update mtu and maxfraglen if necessary */
                        if (!skb || !skb_prev)
                                ip6_append_data_mtu(&mtu, &maxfraglen,
                                                    fragheaderlen, skb, rt,
                                                    orig_mtu);

                        skb_prev = skb;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;

                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;
                        pagedlen = 0;

                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else if (!paged)
                                alloclen = fraglen;
                        else {
                                alloclen = min_t(int, fraglen, MAX_HEADER);
                                pagedlen = fraglen - alloclen;
                        }

                        alloclen += dst_exthdrlen;

                        if (datalen != length + fraggap) {
                                /*
                                 * this is not the last fragment, the trailer
                                 * space is regarded as data space.
                                 */
                                datalen += rt->dst.trailer_len;
                        }

                        alloclen += rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        copy = datalen - transhdrlen - fraggap - pagedlen;
                        if (copy < 0) {
                                err = -EINVAL;
                                goto error;
                        }
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
                                    2 * sk->sk_sndbuf)
                                        skb = alloc_skb(alloclen + hh_len,
                                                        sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;
                        /*
                         * Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /*
                         * Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen - pagedlen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        if (copy > 0 &&
                            getfrag(from, data + transhdrlen, offset,
                                    copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= copy + transhdrlen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        /* Only the initial fragment is time stamped */
                        skb_shinfo(skb)->tx_flags = cork->tx_flags;
                        cork->tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;
                        skb_zcopy_set(skb, uarg, &extra_uref);

                        if ((flags & MSG_CONFIRM) && !skb_prev)
                                skb_set_dst_pending_confirm(skb, 1);

                        /*
                         * Put the packet on the pending queue
                         */
                        if (!skb->destructor) {
                                skb->destructor = sock_wfree;
                                skb->sk = sk;
                                wmem_alloc_delta += skb->truesize;
                        }
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG) &&
                    skb_tailroom(skb) >= copy) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                    offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else if (!uarg || !uarg->zerocopy) {
                        int i = skb_shinfo(skb)->nr_frags;

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        wmem_alloc_delta += copy;
                } else {
                        err = skb_zerocopy_iter_dgram(skb, from, copy);
                        if (err < 0)
                                goto error;
                }
                offset += copy;
                length -= copy;
        }

        if (wmem_alloc_delta)
                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return 0;

error_efault:
        err = -EFAULT;
error:
        if (uarg)
                sock_zerocopy_put_abort(uarg, extra_uref);
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return err;
}
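
/*
 * The loop above implements the classic corked-append strategy: keep
 * filling the tail skb up to maxfraglen (or the full MTU while the
 * message still fits in one fragment), and start a new skb once copy
 * drops to zero or below, moving up to fraggap tail bytes from the
 * previous skb into the new one so every fragment boundary stays
 * 8-byte aligned.
 */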
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        int exthdrlen;
        int err;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
                                     ipc6, rt, fl6);
                if (err)
                        return err;

                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                fl6 = &inet->cork.fl.u.ip6;
                transhdrlen = 0;
        }

        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
                                 &np->cork, sk_page_frag(sk), getfrag,
                                 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
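
/*
 * Typical use (illustrative): datagram senders such as UDPv6 and raw
 * sockets drive this roughly as follows:
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *
 * MSG_MORE leaves the data corked on sk_write_queue for a later append.
 */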
static void ip6_cork_release(struct inet_cork_full *cork,
                             struct inet6_cork *v6_cork)
{
        if (v6_cork->opt) {
                kfree(v6_cork->opt->dst0opt);
                kfree(v6_cork->opt->dst1opt);
                kfree(v6_cork->opt->hopopt);
                kfree(v6_cork->opt->srcrt);
                kfree(v6_cork->opt);
                v6_cork->opt = NULL;
        }

        if (cork->base.dst) {
                dst_release(cork->base.dst);
                cork->base.dst = NULL;
                cork->base.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&cork->fl, 0, sizeof(cork->fl));
}
struct sk_buff *__ip6_make_skb(struct sock *sk,
                               struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = v6_cork->opt;
        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
        struct flowi6 *fl6 = &cork->fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;

        skb = __skb_dequeue(queue);
        if (!skb)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);

        *final_dst = fl6->daddr;
        __skb_pull(skb, skb_network_header_len(skb));
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, v6_cork->tclass,
                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                        ip6_autoflowlabel(net, np), fl6));
        hdr->hop_limit = v6_cork->hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        skb->tstamp = cork->base.transmit_time;

        skb_dst_set(skb, dst_clone(&rt->dst));
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        ip6_cork_release(cork, v6_cork);
out:
        return skb;
}
int ip6_send_skb(struct sk_buff *skb)
{
        struct net *net = sock_net(skb->sk);
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        int err;

        err = ip6_local_out(net, skb->sk, skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        IP6_INC_STATS(net, rt->rt6i_idev,
                                      IPSTATS_MIB_OUTDISCARDS);
        }

        return err;
}
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        skb = ip6_finish_skb(sk);
        if (!skb)
                return 0;

        return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
static void __ip6_flush_pending_frames(struct sock *sk,
                                       struct sk_buff_head *queue,
                                       struct inet_cork_full *cork,
                                       struct inet6_cork *v6_cork)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
                if (skb_dst(skb))
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                             struct rt6_info *rt, unsigned int flags,
                             struct inet_cork_full *cork)
{
        struct inet6_cork v6_cork;
        struct sk_buff_head queue;
        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
        int err;

        if (flags & MSG_PROBE)
                return NULL;

        __skb_queue_head_init(&queue);

        cork->base.flags = 0;
        cork->base.addr = 0;
        cork->base.opt = NULL;
        cork->base.dst = NULL;
        v6_cork.opt = NULL;
        err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
        if (err) {
                ip6_cork_release(cork, &v6_cork);
                return ERR_PTR(err);
        }
        if (ipc6->dontfrag < 0)
                ipc6->dontfrag = inet6_sk(sk)->dontfrag;

        err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
                                &current->task_frag, getfrag, from,
                                length + exthdrlen, transhdrlen + exthdrlen,
                                flags, ipc6);
        if (err) {
                __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
                return ERR_PTR(err);
        }

        return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
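
/*
 * ip6_make_skb() is the one-shot variant of the corked path: it builds
 * the whole datagram on a private queue with a caller-supplied cork
 * instead of touching sk_write_queue, so the socket's pending-frames
 * state is left untouched; the caller then transmits the returned skb
 * itself, e.g. via ip6_send_skb().
 */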