1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (c) 2013 Nicira, Inc.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
45 #include <net/dst_metadata.h>
47 #if IS_ENABLED(CONFIG_IPV6)
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
// Compute a hash value for a tunnel from its key and remote address.
// The two 32-bit values are XORed together and passed to hash_32().
// NOTE(review): the second hash_32() argument (the bit width) is on a line
// that is not shown here.
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
55 return hash_32((__force u32)key ^ (__force u32)remote,
// Check whether the key information carried by a packet (flags, key) is
// compatible with the tunnel parameters in p.
//   - Tunnel has TUNNEL_KEY and the packet has TUNNEL_KEY: the packet key
//     must equal p->i_key.
//   - Tunnel has TUNNEL_KEY but the packet has none: see the existing
//     comment below (the statement for this case is not shown here).
//   - The final return accepts only packets that do not have TUNNEL_KEY set.
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60 __be16 flags, __be32 key)
62 if (p->i_flags & TUNNEL_KEY) {
63 if (flags & TUNNEL_KEY)
64 return key == p->i_key;
66 /* key expected, none present */
69 return !(flags & TUNNEL_KEY);
72 /* Fallback tunnel: no source, no destination, no key, no options
75 We require exact key match i.e. if a key is present in packet
76 it will match only tunnel with the same key; if it is not present,
77 it will match only keyless tunnel.
79 All keyless packets, if they do not match a configured keyless tunnel,
80 will match the fallback tunnel.
81 Given src, dst and key, find the appropriate tunnel for input.
// Find the tunnel that should handle an incoming packet, given the link
// index, the packet tunnel flags, the remote and local addresses and the key.
// The visible code walks two hash buckets with hlist_for_each_entry_rcu():
// first the bucket for (key, remote), then the bucket for (key, 0).  After
// the bucket walks it consults itn->collect_md_tun and finally
// itn->fb_tunnel_dev, each only when the device has IFF_UP set.
// NOTE(review): the statements executed when a test rejects an entry, the
// action taken when the link matches, and the use of cand are on lines that
// are not shown here.
83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
84 int link, __be16 flags,
85 __be32 remote, __be32 local,
88 struct ip_tunnel *t, *cand = NULL;
89 struct hlist_head *head;
90 struct net_device *ndev;
93 hash = ip_tunnel_hash(key, remote);
94 head = &itn->tunnels[hash];
// Pass 1: the test below is true for entries that do not match both
// local and remote exactly, or whose device is not up.
96 hlist_for_each_entry_rcu(t, head, hash_node) {
97 if (local != t->parms.iph.saddr ||
98 remote != t->parms.iph.daddr ||
99 !(t->dev->flags & IFF_UP))
102 if (!ip_tunnel_key_match(&t->parms, flags, key))
105 if (READ_ONCE(t->parms.link) == link)
// Pass 2: the test below is true for entries whose daddr differs from
// remote, that have a non-zero saddr, or whose device is not up.
110 hlist_for_each_entry_rcu(t, head, hash_node) {
111 if (remote != t->parms.iph.daddr ||
112 t->parms.iph.saddr != 0 ||
113 !(t->dev->flags & IFF_UP))
116 if (!ip_tunnel_key_match(&t->parms, flags, key))
119 if (READ_ONCE(t->parms.link) == link)
// Switch to the bucket hashed with a zero remote address.
125 hash = ip_tunnel_hash(key, 0);
126 head = &itn->tunnels[hash];
// Pass 3: the address test is false only when (local == saddr and
// daddr == 0) or (local == daddr and local is a multicast address).
128 hlist_for_each_entry_rcu(t, head, hash_node) {
129 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
130 (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
133 if (!(t->dev->flags & IFF_UP))
136 if (!ip_tunnel_key_match(&t->parms, flags, key))
139 if (READ_ONCE(t->parms.link) == link)
// Pass 4: the test is true when the key differs (unless TUNNEL_NO_KEY is
// set in flags), when saddr or daddr is non-zero, or when the device is
// not up.
145 hlist_for_each_entry_rcu(t, head, hash_node) {
146 if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
147 t->parms.iph.saddr != 0 ||
148 t->parms.iph.daddr != 0 ||
149 !(t->dev->flags & IFF_UP))
152 if (READ_ONCE(t->parms.link) == link)
// Consult the collect_md tunnel pointer, fetched with rcu_dereference().
161 t = rcu_dereference(itn->collect_md_tun);
162 if (t && t->dev->flags & IFF_UP)
// Last visible option: the fallback tunnel device, if present and up.
165 ndev = READ_ONCE(itn->fb_tunnel_dev);
166 if (ndev && ndev->flags & IFF_UP)
167 return netdev_priv(ndev);
171 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
// Select the hash bucket in itn->tunnels for the given tunnel parameters.
// remote is taken from parms->iph.daddr only when that address is non-zero
// and not multicast.  The bucket index is ip_tunnel_hash(i_key, remote).
// NOTE(review): the alternative value of remote and the statement guarded
// by the TUNNEL_KEY / VTI_ISVTI test are on lines that are not shown here.
173 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
174 struct ip_tunnel_parm *parms)
178 __be32 i_key = parms->i_key;
180 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
181 remote = parms->iph.daddr;
185 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
188 h = ip_tunnel_hash(i_key, remote);
189 return &itn->tunnels[h];
// Insert tunnel t into the hash bucket chosen by ip_bucket() using
// hlist_add_head_rcu().  itn->collect_md_tun is also pointed at t with
// rcu_assign_pointer().
// NOTE(review): that assignment is presumably guarded by a condition on a
// line that is not shown here - confirm.
192 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
194 struct hlist_head *head = ip_bucket(itn, &t->parms);
197 rcu_assign_pointer(itn->collect_md_tun, t);
198 hlist_add_head_rcu(&t->hash_node, head);
// Remove tunnel t from its hash list with hlist_del_init_rcu() and clear
// itn->collect_md_tun.
// NOTE(review): the clearing of collect_md_tun is presumably guarded by a
// condition on a line that is not shown here - confirm.
201 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
204 rcu_assign_pointer(itn->collect_md_tun, NULL);
205 hlist_del_init_rcu(&t->hash_node);
// Search the bucket selected by ip_bucket(itn, parms) for a tunnel whose
// saddr, daddr, link, device type and key information all match parms.
// NOTE(review): the declaration of the type parameter, the action taken on
// a match and the return statement are on lines that are not shown here.
208 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
209 struct ip_tunnel_parm *parms,
212 __be32 remote = parms->iph.daddr;
213 __be32 local = parms->iph.saddr;
214 __be32 key = parms->i_key;
215 __be16 flags = parms->i_flags;
216 int link = parms->link;
217 struct ip_tunnel *t = NULL;
218 struct hlist_head *head = ip_bucket(itn, parms);
220 hlist_for_each_entry_rcu(t, head, hash_node) {
// All five criteria must hold for an entry to be considered a match.
221 if (local == t->parms.iph.saddr &&
222 remote == t->parms.iph.daddr &&
223 link == READ_ONCE(t->parms.link) &&
224 type == t->dev->type &&
225 ip_tunnel_key_match(&t->parms, flags, key))
// Allocate and register a net_device for a tunnel described by parms.
// The device name is copied from parms->name when one is given (after a
// dev_valid_name() check); otherwise it is built starting from ops->kind,
// after checking that the kind string leaves room within IFNAMSIZ.
// The device is allocated with alloc_netdev() using ops->priv_size and
// ops->setup, bound to net, and its private ip_tunnel receives a copy of
// parms before register_netdevice() is called.
// NOTE(review): error handling and the return statements are on lines that
// are not shown here; a caller in this file tests the result with IS_ERR().
231 static struct net_device *__ip_tunnel_create(struct net *net,
232 const struct rtnl_link_ops *ops,
233 struct ip_tunnel_parm *parms)
236 struct ip_tunnel *tunnel;
237 struct net_device *dev;
241 if (parms->name[0]) {
242 if (!dev_valid_name(parms->name))
244 strscpy(name, parms->name, IFNAMSIZ);
246 if (strlen(ops->kind) > (IFNAMSIZ - 3))
248 strcpy(name, ops->kind);
253 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
258 dev_net_set(dev, net);
260 dev->rtnl_link_ops = ops;
262 tunnel = netdev_priv(dev);
263 tunnel->parms = *parms;
266 err = register_netdevice(dev);
// Guess the underlying output device of a tunnel and derive from it the
// needed_headroom of dev and an MTU value.
// Defaults are hlen = LL_MAX_HEADER and mtu = ETH_DATA_LEN.  A route lookup
// is done with the tunnel addresses; if no underlying device is known and
// parms.link is set, the device is looked up by that index instead.  When
// an underlying device tdev is used, hlen and mtu are taken from it (mtu
// capped at IP_MAX_MTU).  dev->needed_headroom becomes t_hlen + hlen, and
// t_hlen (plus hard_header_len for ARPHRD_ETHER devices) is subtracted
// from mtu.
// NOTE(review): the conditions around the route lookup, the handling of
// mtu below IPV4_MIN_MTU and the return statement are on lines that are
// not shown here; callers in this file use the result as an MTU.
278 static int ip_tunnel_bind_dev(struct net_device *dev)
280 struct net_device *tdev = NULL;
281 struct ip_tunnel *tunnel = netdev_priv(dev);
282 const struct iphdr *iph;
283 int hlen = LL_MAX_HEADER;
284 int mtu = ETH_DATA_LEN;
285 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
287 iph = &tunnel->parms.iph;
289 /* Guess output device to choose reasonable mtu and needed_headroom */
294 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
295 iph->saddr, tunnel->parms.o_key,
296 RT_TOS(iph->tos), dev_net(dev),
297 tunnel->parms.link, tunnel->fwmark, 0, 0);
298 rt = ip_route_output_key(tunnel->net, &fl4);
304 if (dev->type != ARPHRD_ETHER)
305 dev->flags |= IFF_POINTOPOINT;
307 dst_cache_reset(&tunnel->dst_cache);
310 if (!tdev && tunnel->parms.link)
311 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
314 hlen = tdev->hard_header_len + tdev->needed_headroom;
315 mtu = min(tdev->mtu, IP_MAX_MTU);
318 dev->needed_headroom = t_hlen + hlen;
319 mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
321 if (mtu < IPV4_MIN_MTU)
// Create a new tunnel device in net from parms and add it to itn.
// Visible steps: __ip_tunnel_create() builds and registers the device,
// ip_tunnel_bind_dev() supplies an MTU that is applied with dev_set_mtu(),
// min_mtu and max_mtu are set (max_mtu excludes the tunnel header and, for
// ARPHRD_ETHER devices, hard_header_len), and the tunnel is hashed with
// ip_tunnel_add().
// NOTE(review): the err_dev_set_mtu label and the return statements are on
// lines that are not shown here; unregister_netdevice() below presumably
// belongs to that error path.
327 static struct ip_tunnel *ip_tunnel_create(struct net *net,
328 struct ip_tunnel_net *itn,
329 struct ip_tunnel_parm *parms)
331 struct ip_tunnel *nt;
332 struct net_device *dev;
337 dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
339 return ERR_CAST(dev);
341 mtu = ip_tunnel_bind_dev(dev);
342 err = dev_set_mtu(dev, mtu);
344 goto err_dev_set_mtu;
346 nt = netdev_priv(dev);
347 t_hlen = nt->hlen + sizeof(struct iphdr);
348 dev->min_mtu = ETH_MIN_MTU;
349 dev->max_mtu = IP_MAX_MTU - t_hlen;
350 if (dev->type == ARPHRD_ETHER)
351 dev->max_mtu -= dev->hard_header_len;
353 ip_tunnel_add(itn, nt);
357 unregister_netdevice(dev);
// Copy the UDP source and destination ports of the packet in skb into
// info->encap.sport and info->encap.dport.
// The UDP header is located immediately after the IPv4 header, whose
// length in bytes is ihl << 2.
// NOTE(review): the statement taken when the IP protocol is not
// IPPROTO_UDP is on a line that is not shown here.
361 void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
363 const struct iphdr *iph = ip_hdr(skb);
364 const struct udphdr *udph;
366 if (iph->protocol != IPPROTO_UDP)
369 udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
370 info->encap.sport = udph->source;
371 info->encap.dport = udph->dest;
373 EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
// Common receive path for a tunnel: validate a packet against the tunnel
// configuration, update device statistics and hand the packet to
// gro_cells_receive().
// Visible checks, each bumping error counters on failure:
//   - TUNNEL_CSUM must be either set in both tpi->flags and
//     tunnel->parms.i_flags or in neither (rx_crc_errors).
//   - If the tunnel has TUNNEL_SEQ, the packet must carry TUNNEL_SEQ and,
//     when i_seqno is non-zero, its sequence number must not be behind
//     i_seqno (rx_fifo_errors); i_seqno then becomes seq + 1.
//   - pskb_inet_may_pull() must succeed (rx_length_errors).
//   - The result of IP_ECN_decapsulate() is examined (rx_frame_errors).
// The outer header offset is saved in nh before the network header is
// reset, and iph is recomputed from skb->head + nh afterwards.
// NOTE(review): the last parameter, the goto/label lines, the conditions
// around the ECN logging and the return statements are not shown here.
// dst_release() on tun_dst near the end presumably belongs to the drop
// path - confirm.
375 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
376 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
379 const struct iphdr *iph = ip_hdr(skb);
382 #ifdef CONFIG_NET_IPGRE_BROADCAST
383 if (ipv4_is_multicast(iph->daddr)) {
384 DEV_STATS_INC(tunnel->dev, multicast);
385 skb->pkt_type = PACKET_BROADCAST;
389 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
390 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
391 DEV_STATS_INC(tunnel->dev, rx_crc_errors);
392 DEV_STATS_INC(tunnel->dev, rx_errors);
396 if (tunnel->parms.i_flags&TUNNEL_SEQ) {
397 if (!(tpi->flags&TUNNEL_SEQ) ||
398 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
399 DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
400 DEV_STATS_INC(tunnel->dev, rx_errors);
403 tunnel->i_seqno = ntohl(tpi->seq) + 1;
406 /* Save offset of outer header relative to skb->head,
407 * because we are going to reset the network header to the inner header
408 * and might change skb->head.
410 nh = skb_network_header(skb) - skb->head;
412 skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
414 if (!pskb_inet_may_pull(skb)) {
415 DEV_STATS_INC(tunnel->dev, rx_length_errors);
416 DEV_STATS_INC(tunnel->dev, rx_errors);
419 iph = (struct iphdr *)(skb->head + nh);
421 err = IP_ECN_decapsulate(iph, skb);
424 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
425 &iph->saddr, iph->tos);
427 DEV_STATS_INC(tunnel->dev, rx_frame_errors);
428 DEV_STATS_INC(tunnel->dev, rx_errors);
433 dev_sw_netstats_rx_add(tunnel->dev, skb->len);
434 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
436 if (tunnel->dev->type == ARPHRD_ETHER) {
437 skb->protocol = eth_type_trans(skb, tunnel->dev);
438 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
440 skb->dev = tunnel->dev;
444 skb_dst_set(skb, (struct dst_entry *)tun_dst);
446 gro_cells_receive(&tunnel->gro_cells, skb);
451 dst_release((struct dst_entry *)tun_dst);
455 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
// Install encapsulation ops in the slot numbered num.
// num is range-checked against MAX_IPTUN_ENCAP_OPS first.  The result is
// the logical negation of the value returned by cmpxchg().
// NOTE(review): the num parameter declaration, the statement taken on a
// failed range check and the cmpxchg() operands are on lines that are not
// shown here.
457 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
460 if (num >= MAX_IPTUN_ENCAP_OPS)
463 return !cmpxchg((const struct ip_tunnel_encap_ops **)
467 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
// Remove encapsulation ops from the slot numbered num.
// num is range-checked against MAX_IPTUN_ENCAP_OPS first.  cmpxchg() swaps
// ops for NULL; ret is 0 when the previous value was ops, otherwise -1.
// NOTE(review): the target of the cmpxchg(), the statement taken on a
// failed range check and the return statement are on lines that are not
// shown here.
469 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
474 if (num >= MAX_IPTUN_ENCAP_OPS)
477 ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
479 ops, NULL) == ops) ? 0 : -1;
485 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
// Apply an encapsulation configuration to tunnel t.
// t->encap is zeroed, the encapsulation header length is obtained from
// ip_encap_hlen(ipencap), the type, ports and flags are copied from
// ipencap, and t->hlen is recomputed as encap_hlen + tun_hlen.
// NOTE(review): the check of the ip_encap_hlen() result and the return
// statements are on lines that are not shown here.
487 int ip_tunnel_encap_setup(struct ip_tunnel *t,
488 struct ip_tunnel_encap *ipencap)
492 memset(&t->encap, 0, sizeof(t->encap));
494 hlen = ip_encap_hlen(ipencap);
498 t->encap.type = ipencap->type;
499 t->encap.sport = ipencap->sport;
500 t->encap.dport = ipencap->dport;
501 t->encap.flags = ipencap->flags;
503 t->encap_hlen = hlen;
504 t->hlen = t->encap_hlen + t->tun_hlen;
508 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
// Path-MTU handling for a packet about to be sent through a tunnel.
// tunnel_hlen is taken from the argument when md is true, otherwise from
// tunnel->hlen.  pkt_size is skb->len minus the tunnel header (and minus
// hard_header_len for ARPHRD_ETHER devices).
// Two assignments to mtu are visible: one derived from the route rt (minus
// the outer IPv4 header, tunnel header and, for ARPHRD_ETHER, the hard
// header), and one taken from the skb dst or dev->mtu.
// If the skb has a valid dst, its PMTU is updated without confirmation.
// For IPv4 payloads icmp_ndo_send() emits ICMP_FRAG_NEEDED; for IPv6
// payloads the route MTU metric may be updated and icmpv6_ndo_send() emits
// ICMPV6_PKT_TOOBIG.
// NOTE(review): the condition selecting between the two mtu assignments
// (presumably df), the size comparisons completing the ICMP tests and the
// return statements are on lines that are not shown here.
510 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
511 struct rtable *rt, __be16 df,
512 const struct iphdr *inner_iph,
513 int tunnel_hlen, __be32 dst, bool md)
515 struct ip_tunnel *tunnel = netdev_priv(dev);
519 tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
520 pkt_size = skb->len - tunnel_hlen;
521 pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
524 mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
525 mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
527 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
530 if (skb_valid_dst(skb))
531 skb_dst_update_pmtu_no_confirm(skb, mtu);
533 if (skb->protocol == htons(ETH_P_IP)) {
534 if (!skb_is_gso(skb) &&
535 (inner_iph->frag_off & htons(IP_DF)) &&
537 icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
541 #if IS_ENABLED(CONFIG_IPV6)
542 else if (skb->protocol == htons(ETH_P_IPV6)) {
543 struct rt6_info *rt6;
546 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
// In metadata mode the destination comes from the dst argument, otherwise
// from the configured tunnel daddr.
548 daddr = md ? dst : tunnel->parms.iph.daddr;
550 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
551 mtu >= IPV6_MIN_MTU) {
552 if ((daddr && !ipv4_is_multicast(daddr)) ||
553 rt6->rt6i_dst.plen == 128) {
554 rt6->rt6i_flags |= RTF_MODIFIED;
555 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
559 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
561 icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
// Raise dev->needed_headroom to headroom when headroom is larger than the
// current value; the value is never lowered here.
// headroom is first capped at max_allowed (512).  The field is read with
// READ_ONCE() and written with WRITE_ONCE().
569 static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
571 /* we must cap headroom to some upperlimit, else pskb_expand_head
572 * will overflow header offsets in skb_headers_offset_update().
574 static const unsigned int max_allowed = 512;
576 if (headroom > max_allowed)
577 headroom = max_allowed;
579 if (headroom > READ_ONCE(dev->needed_headroom))
580 WRITE_ONCE(dev->needed_headroom, headroom);
// Transmit path for tunnels driven by per-packet tunnel metadata.
// The metadata from skb_tunnel_info() must exist, be in TX mode and be
// AF_INET.  The outer flow is built from the metadata key (addresses,
// tunnel id), the skb mark and the skb hash.  The route comes either from
// the metadata dst cache or from ip_route_output_key(); a route that points
// back at dev itself is counted in the collisions statistic.  After PMTU
// handling, ECN encapsulation and TTL selection, headroom is ensured with
// skb_cow_head(), dev->needed_headroom is adjusted, and the packet is sent
// with iptunnel_xmit().
// NOTE(review): the goto targets, labels, several guarding conditions
// (for tos, ttl, df, tunnel_hlen and use_cache) and the final cleanup are
// on lines that are not shown here.
583 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
584 u8 proto, int tunnel_hlen)
586 struct ip_tunnel *tunnel = netdev_priv(dev);
587 u32 headroom = sizeof(struct iphdr);
588 struct ip_tunnel_info *tun_info;
589 const struct ip_tunnel_key *key;
590 const struct iphdr *inner_iph;
591 struct rtable *rt = NULL;
597 tun_info = skb_tunnel_info(skb);
598 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
599 ip_tunnel_info_af(tun_info) != AF_INET))
601 key = &tun_info->key;
602 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
603 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
// TOS may be taken from the inner IPv4 or IPv6 header.
606 if (skb->protocol == htons(ETH_P_IP))
607 tos = inner_iph->tos;
608 else if (skb->protocol == htons(ETH_P_IPV6))
609 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
611 ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
612 tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
613 dev_net(dev), 0, skb->mark, skb_get_hash(skb),
617 tunnel_hlen = ip_encap_hlen(&tun_info->encap);
619 if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
622 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
624 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
626 rt = ip_route_output_key(tunnel->net, &fl4);
628 DEV_STATS_INC(dev, tx_carrier_errors);
632 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
// A route whose output device is this tunnel device is counted as a
// collision.
635 if (rt->dst.dev == dev) {
637 DEV_STATS_INC(dev, collisions);
641 if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
643 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
644 key->u.ipv4.dst, true)) {
649 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
652 if (skb->protocol == htons(ETH_P_IP))
653 ttl = inner_iph->ttl;
654 else if (skb->protocol == htons(ETH_P_IPV6))
655 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
657 ttl = ip4_dst_hoplimit(&rt->dst);
// Headroom needed: outer IPv4 header plus link-layer reserve of the output
// device plus the dst header_len.
660 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
661 if (skb_cow_head(skb, headroom)) {
666 ip_tunnel_adj_headroom(dev, headroom);
668 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
669 df, !net_eq(tunnel->net, dev_net(dev)));
672 DEV_STATS_INC(dev, tx_errors);
675 DEV_STATS_INC(dev, tx_dropped);
679 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
// Transmit path for tunnels configured through tnl_params.
// The destination starts as tnl_params->daddr.  Other visible sources for
// it are the tunnel metadata attached to the skb, the next hop of the skb
// route for IPv4 payloads, and (with CONFIG_IPV6) the low 32 bits of an
// IPv4-compatible IPv6 address for IPv6 payloads.  TOS may be taken from
// the inner header.  The route comes from a dst cache (metadata cache or
// tunnel cache) or from ip_route_output_key(); a route pointing back at dev
// is counted as a collision.  DF is combined from tnl_params and, for IPv4
// payloads when ignore_df is not set, the inner header.  After PMTU
// handling, error-count aging, ECN encapsulation and TTL selection, the
// headroom is ensured with skb_cow_head() and the packet is sent with
// iptunnel_xmit().
// NOTE(review): many guarding conditions, goto targets, labels and the
// final cleanup are on lines that are not shown here, including the
// declarations of connected, md, dst, tos, ttl, df and fl4.
681 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
682 const struct iphdr *tnl_params, u8 protocol)
684 struct ip_tunnel *tunnel = netdev_priv(dev);
685 struct ip_tunnel_info *tun_info = NULL;
686 const struct iphdr *inner_iph;
687 unsigned int max_headroom; /* The extra header space needed */
688 struct rtable *rt = NULL; /* Route to the other host */
689 __be16 payload_protocol;
690 bool use_cache = false;
698 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
// connected is true when the tunnel has a fixed, non-zero destination.
699 connected = (tunnel->parms.iph.daddr != 0);
700 payload_protocol = skb_protocol(skb, true);
702 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
704 dst = tnl_params->daddr;
709 DEV_STATS_INC(dev, tx_fifo_errors);
713 tun_info = skb_tunnel_info(skb);
714 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
715 ip_tunnel_info_af(tun_info) == AF_INET &&
716 tun_info->key.u.ipv4.dst) {
717 dst = tun_info->key.u.ipv4.dst;
720 } else if (payload_protocol == htons(ETH_P_IP)) {
721 rt = skb_rtable(skb);
722 dst = rt_nexthop(rt, inner_iph->daddr);
724 #if IS_ENABLED(CONFIG_IPV6)
725 else if (payload_protocol == htons(ETH_P_IPV6)) {
726 const struct in6_addr *addr6;
727 struct neighbour *neigh;
728 bool do_tx_error_icmp;
731 neigh = dst_neigh_lookup(skb_dst(skb),
732 &ipv6_hdr(skb)->daddr);
736 addr6 = (const struct in6_addr *)&neigh->primary_key;
737 addr_type = ipv6_addr_type(addr6);
// If the neighbour key is the unspecified address, use the packet
// destination address instead.
739 if (addr_type == IPV6_ADDR_ANY) {
740 addr6 = &ipv6_hdr(skb)->daddr;
741 addr_type = ipv6_addr_type(addr6);
744 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
745 do_tx_error_icmp = true;
747 do_tx_error_icmp = false;
748 dst = addr6->s6_addr32[3];
750 neigh_release(neigh);
751 if (do_tx_error_icmp)
762 tos = tnl_params->tos;
765 if (payload_protocol == htons(ETH_P_IP)) {
766 tos = inner_iph->tos;
768 } else if (payload_protocol == htons(ETH_P_IPV6)) {
769 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
774 ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
775 tunnel->parms.o_key, RT_TOS(tos),
776 dev_net(dev), READ_ONCE(tunnel->parms.link),
777 tunnel->fwmark, skb_get_hash(skb), 0);
779 if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
782 if (connected && md) {
783 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
785 rt = dst_cache_get_ip4(&tun_info->dst_cache,
788 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
793 rt = ip_route_output_key(tunnel->net, &fl4);
796 DEV_STATS_INC(dev, tx_carrier_errors);
800 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
802 else if (!md && connected)
803 dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
807 if (rt->dst.dev == dev) {
809 DEV_STATS_INC(dev, collisions);
813 df = tnl_params->frag_off;
814 if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
815 df |= (inner_iph->frag_off & htons(IP_DF));
817 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
// While err_count is positive, the age of the last recorded error is
// compared against IPTUNNEL_ERR_TIMEO.
822 if (tunnel->err_count > 0) {
823 if (time_before(jiffies,
824 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
827 dst_link_failure(skb);
829 tunnel->err_count = 0;
832 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
833 ttl = tnl_params->ttl;
835 if (payload_protocol == htons(ETH_P_IP))
836 ttl = inner_iph->ttl;
837 #if IS_ENABLED(CONFIG_IPV6)
838 else if (payload_protocol == htons(ETH_P_IPV6))
839 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
842 ttl = ip4_dst_hoplimit(&rt->dst);
845 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
846 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
848 if (skb_cow_head(skb, max_headroom)) {
850 DEV_STATS_INC(dev, tx_dropped);
855 ip_tunnel_adj_headroom(dev, max_headroom);
857 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
858 df, !net_eq(tunnel->net, dev_net(dev)));
861 #if IS_ENABLED(CONFIG_IPV6)
863 dst_link_failure(skb);
866 DEV_STATS_INC(dev, tx_errors);
869 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
// Apply new parameters p to an existing tunnel t.
// The tunnel is removed from its hash bucket, its addresses and keys are
// updated, and it is re-added so that it lands in the bucket matching the
// new values.  For devices that are not ARPHRD_ETHER the device address and
// broadcast address are set from the new saddr and daddr.  TTL, TOS and
// frag_off are copied.  When the link or fwmark differs, the new link is
// stored with WRITE_ONCE() and ip_tunnel_bind_dev() is called again.
// Finally the tunnel dst cache is reset and netdev_state_change() is called.
// NOTE(review): the declarations of t and of the trailing parameters (a
// caller in this file passes a boolean and a fwmark), the fwmark
// assignment and the use of the new mtu are on lines not shown here.
871 static void ip_tunnel_update(struct ip_tunnel_net *itn,
873 struct net_device *dev,
874 struct ip_tunnel_parm *p,
878 ip_tunnel_del(itn, t);
879 t->parms.iph.saddr = p->iph.saddr;
880 t->parms.iph.daddr = p->iph.daddr;
881 t->parms.i_key = p->i_key;
882 t->parms.o_key = p->o_key;
883 if (dev->type != ARPHRD_ETHER) {
884 __dev_addr_set(dev, &p->iph.saddr, 4);
885 memcpy(dev->broadcast, &p->iph.daddr, 4);
887 ip_tunnel_add(itn, t);
889 t->parms.iph.ttl = p->iph.ttl;
890 t->parms.iph.tos = p->iph.tos;
891 t->parms.iph.frag_off = p->iph.frag_off;
893 if (t->parms.link != p->link || t->fwmark != fwmark) {
896 WRITE_ONCE(t->parms.link, p->link);
898 mtu = ip_tunnel_bind_dev(dev);
902 dst_cache_reset(&t->dst_cache);
903 netdev_state_change(dev);
// Handle a tunnel control command cmd on dev with parameters p.
// Visible behaviour:
//   - A read path: when dev is the fallback device the tunnel is looked up
//     from p with ip_tunnel_find(); the tunnel parms are copied into p.
//   - An add/change path, requiring CAP_NET_ADMIN in the user namespace of
//     the tunnel net: IP_DF is forced into p->iph.frag_off, the tunnel is
//     looked up, SIOCADDTUNNEL creates it via ip_tunnel_create(), and
//     SIOCCHGTUNNEL on a non-fallback device validates the
//     IFF_POINTOPOINT / IFF_BROADCAST flags before ip_tunnel_update().
//   - A delete path, also requiring CAP_NET_ADMIN: the fallback device
//     itself is checked for before unregister_netdevice() is called.
// NOTE(review): the switch/case lines, error codes, the bodies of several
// tests and the return statement are on lines that are not shown here.
906 int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
909 struct ip_tunnel *t = netdev_priv(dev);
910 struct net *net = t->net;
911 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
915 if (dev == itn->fb_tunnel_dev) {
916 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
918 t = netdev_priv(dev);
920 memcpy(p, &t->parms, sizeof(*p));
926 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
929 p->iph.frag_off |= htons(IP_DF);
930 if (!(p->i_flags & VTI_ISVTI)) {
931 if (!(p->i_flags & TUNNEL_KEY))
933 if (!(p->o_flags & TUNNEL_KEY))
937 t = ip_tunnel_find(itn, p, itn->type);
939 if (cmd == SIOCADDTUNNEL) {
941 t = ip_tunnel_create(net, itn, p);
942 err = PTR_ERR_OR_ZERO(t);
949 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
956 unsigned int nflags = 0;
// Flags implied by the new destination: broadcast for a multicast
// destination, point-to-point for any other non-zero destination.
958 if (ipv4_is_multicast(p->iph.daddr))
959 nflags = IFF_BROADCAST;
960 else if (p->iph.daddr)
961 nflags = IFF_POINTOPOINT;
963 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
968 t = netdev_priv(dev);
974 ip_tunnel_update(itn, t, dev, p, true, 0);
982 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
985 if (dev == itn->fb_tunnel_dev) {
987 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
991 if (t == netdev_priv(itn->fb_tunnel_dev))
995 unregister_netdevice(dev);
1006 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
// Private ioctl entry point: copy a struct ip_tunnel_parm from user space,
// pass it to the ndo_tunnel_ctl operation of the device, and copy the
// (possibly updated) structure back to user space when the operation
// reports no error.
// NOTE(review): the error codes for failed user copies and the return
// statement are on lines that are not shown here.
1008 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
1009 void __user *data, int cmd)
1011 struct ip_tunnel_parm p;
1014 if (copy_from_user(&p, data, sizeof(p)))
1016 err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
1017 if (!err && copy_to_user(data, &p, sizeof(p)))
1021 EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
// Validate and apply a new MTU for a tunnel device.
// The upper bound is IP_MAX_MTU minus the tunnel header length (tunnel->hlen
// plus the IPv4 header) and, for ARPHRD_ETHER devices, minus
// hard_header_len.  new_mtu is compared against ETH_MIN_MTU and against
// that upper bound.
// NOTE(review): the outcome of each comparison, the role of strict and the
// assignment of the MTU are on lines that are not shown here.
1023 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1025 struct ip_tunnel *tunnel = netdev_priv(dev);
1026 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
1027 int max_mtu = IP_MAX_MTU - t_hlen;
1029 if (dev->type == ARPHRD_ETHER)
1030 max_mtu -= dev->hard_header_len;
1032 if (new_mtu < ETH_MIN_MTU)
1035 if (new_mtu > max_mtu) {
1045 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
// Change the MTU of a tunnel device by calling __ip_tunnel_change_mtu()
// with strict set to true, returning its result.
1047 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1049 return __ip_tunnel_change_mtu(dev, new_mtu, true);
1051 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
// Release the per-device resources of a tunnel: the GRO cells, the dst
// cache and the per-CPU statistics.  ip_tunnel_init() in this file installs
// this function as dev->priv_destructor.
1053 static void ip_tunnel_dev_free(struct net_device *dev)
1055 struct ip_tunnel *tunnel = netdev_priv(dev);
1057 gro_cells_destroy(&tunnel->gro_cells);
1058 dst_cache_destroy(&tunnel->dst_cache);
1059 free_percpu(dev->tstats);
// Delete a tunnel link: unless dev is the fallback tunnel device of its
// ip_tunnel_net, unhash the tunnel with ip_tunnel_del() and queue the
// device for unregistration on head.  The fallback device is left alone.
1062 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1064 struct ip_tunnel *tunnel = netdev_priv(dev);
1065 struct ip_tunnel_net *itn;
1067 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1069 if (itn->fb_tunnel_dev != dev) {
1070 ip_tunnel_del(itn, netdev_priv(dev));
1071 unregister_netdevice_queue(dev, head);
1074 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
// Return a struct net associated with the tunnel device dev.
// NOTE(review): the return statement is on a line that is not shown here;
// presumably it returns tunnel->net - confirm.
1076 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1078 struct ip_tunnel *tunnel = netdev_priv(dev);
1082 EXPORT_SYMBOL(ip_tunnel_get_link_net);
// Return the link index stored in the tunnel parameters of dev.  The field
// is read with READ_ONCE(); elsewhere in this file it is written with
// WRITE_ONCE().
1084 int ip_tunnel_get_iflink(const struct net_device *dev)
1086 const struct ip_tunnel *tunnel = netdev_priv(dev);
1088 return READ_ONCE(tunnel->parms.link);
1090 EXPORT_SYMBOL(ip_tunnel_get_iflink);
// Per-network-namespace initialisation for a tunnel type.
// Stores ops in the ip_tunnel_net for net and initialises every hash
// bucket.  When ops is NULL or the namespace has no fallback tunnels, the
// tunnel type is copied from the ip_tunnel_net of init_net and no fallback
// device is set.  Otherwise a fallback device is created from zeroed parms
// (with the name copied from devname), marked NETIF_F_NETNS_LOCAL, given an
// MTU from ip_tunnel_bind_dev(), hashed with ip_tunnel_add(), and its type
// recorded in itn->type.  The result is PTR_ERR_OR_ZERO() of the fallback
// device pointer.
// NOTE(review): the early return of the no-fallback branch, the condition
// around the name copy and any locking are on lines not shown here.
1092 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1093 struct rtnl_link_ops *ops, char *devname)
1095 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1096 struct ip_tunnel_parm parms;
1099 itn->rtnl_link_ops = ops;
1100 for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1101 INIT_HLIST_HEAD(&itn->tunnels[i]);
1103 if (!ops || !net_has_fallback_tunnels(net)) {
1104 struct ip_tunnel_net *it_init_net;
1106 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1107 itn->type = it_init_net->type;
1108 itn->fb_tunnel_dev = NULL;
1112 memset(&parms, 0, sizeof(parms));
1114 strscpy(parms.name, devname, IFNAMSIZ);
1117 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1118 /* FB netdevice is special: we have one, and only one per netns.
1119 * Allowing to move it to another netns is clearly unsafe.
1121 if (!IS_ERR(itn->fb_tunnel_dev)) {
1122 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1123 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1124 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1125 itn->type = itn->fb_tunnel_dev->type;
1129 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1131 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
// Queue for unregistration every tunnel device that belongs to itn.
// First loop: every device in net whose rtnl_link_ops equals ops is queued
// on head.  Second loop: every tunnel in the hash table of itn whose device
// lives in a different namespace than net is queued as well; devices in the
// same namespace were already handled by the first loop.
1133 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1134 struct list_head *head,
1135 struct rtnl_link_ops *ops)
1137 struct net_device *dev, *aux;
1140 for_each_netdev_safe(net, dev, aux)
1141 if (dev->rtnl_link_ops == ops)
1142 unregister_netdevice_queue(dev, head);
1144 for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1145 struct ip_tunnel *t;
1146 struct hlist_node *n;
1147 struct hlist_head *thead = &itn->tunnels[h];
1149 hlist_for_each_entry_safe(t, n, thead, hash_node)
1150 /* If dev is in the same netns, it has already
1151 * been added to the list by the previous loop.
1153 if (!net_eq(dev_net(t->dev), net))
1154 unregister_netdevice_queue(t->dev, head);
// For every namespace on net_list (linked through exit_list), look up the
// ip_tunnel_net registered under id and call ip_tunnel_destroy() so that
// its tunnel devices are queued on dev_to_kill.
// NOTE(review): the declaration of net and any locking or assertions
// around the loop are on lines that are not shown here.
1158 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1159 struct rtnl_link_ops *ops,
1160 struct list_head *dev_to_kill)
1162 struct ip_tunnel_net *itn;
1166 list_for_each_entry(net, net_list, exit_list) {
1167 itn = net_generic(net, id);
1168 ip_tunnel_destroy(net, itn, dev_to_kill, ops);
1171 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
// Netlink newlink handler shared by tunnel drivers.
// Visible steps: for a collect_md tunnel, test whether the namespace
// already has a collect_md tunnel; otherwise test whether a tunnel matching
// p already exists.  Then store fwmark, register the device, give
// ARPHRD_ETHER devices a random hardware address when no IFLA_ADDRESS was
// supplied, obtain an MTU from ip_tunnel_bind_dev() (a clamped dev->mtu is
// also computed), apply it with dev_set_mtu() and hash the tunnel with
// ip_tunnel_add().
// NOTE(review): the error codes, the condition selecting the clamped MTU
// (presumably an IFLA_MTU attribute), the err_dev_set_mtu label and the
// return statements are on lines that are not shown here.
1173 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1174 struct ip_tunnel_parm *p, __u32 fwmark)
1176 struct ip_tunnel *nt;
1177 struct net *net = dev_net(dev);
1178 struct ip_tunnel_net *itn;
1182 nt = netdev_priv(dev);
1183 itn = net_generic(net, nt->ip_tnl_net_id);
1185 if (nt->collect_md) {
1186 if (rtnl_dereference(itn->collect_md_tun))
1189 if (ip_tunnel_find(itn, p, dev->type))
1195 nt->fwmark = fwmark;
1196 err = register_netdevice(dev);
1198 goto err_register_netdevice;
1200 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1201 eth_hw_addr_random(dev);
1203 mtu = ip_tunnel_bind_dev(dev);
// Upper bound for the MTU: IP_MAX_MTU minus the tunnel and IPv4 headers,
// and minus hard_header_len for ARPHRD_ETHER devices.
1205 unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1207 if (dev->type == ARPHRD_ETHER)
1208 max -= dev->hard_header_len;
1210 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1213 err = dev_set_mtu(dev, mtu);
1215 goto err_dev_set_mtu;
1217 ip_tunnel_add(itn, nt);
1221 unregister_netdevice(dev);
1222 err_register_netdevice:
1225 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
// Netlink changelink handler shared by tunnel drivers.
// The fallback device is tested for first.  A tunnel matching the new
// parameters is looked up with ip_tunnel_find().  For devices that are not
// ARPHRD_ETHER, the IFF_POINTOPOINT / IFF_BROADCAST flags implied by the new
// destination are compared with the current device flags.  Finally
// ip_tunnel_update() applies p; its boolean argument is true when no
// IFLA_MTU attribute was supplied.
// NOTE(review): the error codes, the handling of the lookup result and the
// return statements are on lines that are not shown here.
1227 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1228 struct ip_tunnel_parm *p, __u32 fwmark)
1230 struct ip_tunnel *t;
1231 struct ip_tunnel *tunnel = netdev_priv(dev);
1232 struct net *net = tunnel->net;
1233 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1235 if (dev == itn->fb_tunnel_dev)
1238 t = ip_tunnel_find(itn, p, dev->type);
1246 if (dev->type != ARPHRD_ETHER) {
1247 unsigned int nflags = 0;
1249 if (ipv4_is_multicast(p->iph.daddr))
1250 nflags = IFF_BROADCAST;
1251 else if (p->iph.daddr)
1252 nflags = IFF_POINTOPOINT;
1254 if ((dev->flags ^ nflags) &
1255 (IFF_POINTOPOINT | IFF_BROADCAST))
1260 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1263 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
// Device init for a tunnel net_device.
// Marks the device as needing free_netdev, installs ip_tunnel_dev_free() as
// the private destructor, and allocates per-CPU statistics, the dst cache
// and the GRO cells.  If dst_cache_init() fails the statistics are freed;
// if gro_cells_init() fails both the dst cache and the statistics are
// released.  On success the tunnel records the namespace of the device and
// copies the device name into its parms; collect_md tunnels additionally
// call netif_keep_dst().
// NOTE(review): the allocation failure check for tstats, the use of iph
// and the return statements are on lines that are not shown here.
1265 int ip_tunnel_init(struct net_device *dev)
1267 struct ip_tunnel *tunnel = netdev_priv(dev);
1268 struct iphdr *iph = &tunnel->parms.iph;
1271 dev->needs_free_netdev = true;
1272 dev->priv_destructor = ip_tunnel_dev_free;
1273 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1277 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1279 free_percpu(dev->tstats);
1283 err = gro_cells_init(&tunnel->gro_cells, dev);
1285 dst_cache_destroy(&tunnel->dst_cache);
1286 free_percpu(dev->tstats);
1291 tunnel->net = dev_net(dev);
1292 strcpy(tunnel->parms.name, dev->name);
1296 if (tunnel->collect_md)
1297 netif_keep_dst(dev);
1298 netdev_lockdep_set_classes(dev);
1301 EXPORT_SYMBOL_GPL(ip_tunnel_init);
// Device uninit for a tunnel net_device: unhash the tunnel with
// ip_tunnel_del(), clear itn->fb_tunnel_dev (with WRITE_ONCE()) when dev is
// the fallback device, and reset the tunnel dst cache.
1303 void ip_tunnel_uninit(struct net_device *dev)
1305 struct ip_tunnel *tunnel = netdev_priv(dev);
1306 struct net *net = tunnel->net;
1307 struct ip_tunnel_net *itn;
1309 itn = net_generic(net, tunnel->ip_tnl_net_id);
1310 ip_tunnel_del(itn, netdev_priv(dev));
1311 if (itn->fb_tunnel_dev == dev)
1312 WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1314 dst_cache_reset(&tunnel->dst_cache);
1316 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
// Record net_id in the private ip_tunnel of dev as ip_tnl_net_id.  Other
// functions in this file pass that id to net_generic() to find the
// ip_tunnel_net of the tunnel.
1318 /* Do least required initialization, rest of init is done in tunnel_init call */
1319 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1321 struct ip_tunnel *tunnel = netdev_priv(dev);
1322 tunnel->ip_tnl_net_id = net_id;
1324 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1326 MODULE_DESCRIPTION("IPv4 tunnel implementation library");
1327 MODULE_LICENSE("GPL");