/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
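
/* Finish the output path for a resolved dst: loop multicast packets
 * back locally where required, hand the skb to an attached lightweight
 * tunnel if the dst asks for it, then resolve the next-hop neighbour
 * and transmit.
 */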
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
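
/* Run the egress cgroup BPF program, then fragment the packet if it
 * exceeds the path MTU (or a recorded defrag size) and is not GSO.
 */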
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
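
/* dst_output() entry point for locally generated packets: drop the skb
 * if IPv6 is administratively disabled on the egress device, otherwise
 * run it through the NF_INET_POST_ROUTING hook.
 */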
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);

	return np->autoflowlabel;
}
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;

	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
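
/* Deliver a Router Alert packet to every raw socket registered for this
 * alert value (RFC 2711). Returns 1 if some socket consumed the skb,
 * 0 if the caller still owns it.
 */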
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
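
/* Decide what to do with a packet arriving for a proxied address:
 * 1 means deliver it locally (unicast neighbour discovery), -1 means
 * drop it (link-local destination), 0 means forward it normally.
 */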
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	return dst_output(net, sk, skb);
}
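
/* MTU to honour when forwarding: a locked route metric wins, otherwise
 * fall back to the per-device IPv6 MTU of the egress device.
 */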
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu = IPV6_MIN_MTU;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
		return false;

	return true;
}
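
/* The forwarding path proper: validate the packet, enforce hop limit
 * and MTU, emit ICMPv6 errors and redirects where the RFCs require,
 * then decrement hop_limit and pass the skb to the NF_INET_FORWARD hook.
 */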
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 * We DO NOT make any processing on
	 * RA packets, pushing them to user level AS IS
	 * without any WARRANTY that application will be able
	 * to interpret them. The reason is that we
	 * cannot make anything clever here.
	 *
	 * We are not end-node, so that if packet contains
	 * AH/ESP, we cannot make anything.
	 * Defragmentation also would be a mistake, RA packets
	 * cannot be fragmented, because there is no warranty
	 * that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 * check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 * incoming and outgoing devices are the same
		 * send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
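
/* Fragment an oversized skb. The fast path splits along an existing
 * frag_list whose geometry already matches the MTU; the slow path
 * allocates a fresh skb per fragment and copies the payload.
 */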
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 * Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 * Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 * Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 * Charge the memory for the fragment to any owner
		 * it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 * Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 * Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 * Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 * Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}
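
/* Validate a dst cached on the socket against the current flow; return
 * it if still usable, otherwise release it and return NULL so the
 * caller does a fresh route lookup.
 */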
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
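
/* Set up cork state for a corked send: duplicate the tx options so they
 * survive the caller, hold the route, and record the MTU that appended
 * data will be sized against.
 */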
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(rt->dst.path))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
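
/* Workhorse behind ip6_append_data() and ip6_make_skb(): copy user data
 * into queued skbs (or coalesce it into page frags on SG devices),
 * sizing each skb to maxfraglen and reserving room for a fragment
 * header so a later split in ip6_fragment() stays cheap.
 */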
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		ipv6_local_error(sk, EMSGSIZE, fl6,
				 mtu - headersize +
				 sizeof(struct ipv6hdr));
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
							  alloclen + hh_len,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}
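
/* Collapse the pending queue into one skb with a frag_list, push the
 * extension headers and the IPv6 header, then release the cork; the
 * result is ready for ip6_send_skb().
 */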
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
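
/* Transmit an skb built by __ip6_make_skb() via ip6_local_out(),
 * mapping qdisc return codes to errnos and accounting drops.
 */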
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err)
		return ERR_PTR(err);

	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}